1/*
2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19#if HAVE_CONFIG_H
20#  include <config.h>
21#endif
22
23#ifdef HAVE_ICONV
24
25#include <assert.h>
26#include <errno.h>
27#include <iconv.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "iconvert.h"
32#include "share/alloc.h"
33
/*
 * Convert data from one encoding to another. Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * Parameters:
 *
 *   fromcode, tocode : iconv names of the input and output encodings
 *   from, fromlen    : the input bytes and their count
 *   to               : if non-null, receives a newly allocated buffer
 *                      holding the converted data followed by a '\0';
 *                      the caller releases it with free(). It is only
 *                      written when the return value is >= 0.
 *   tolen            : if non-null, receives the length of the converted
 *                      data, not counting the terminating '\0'
 *
 * Input bytes that are invalid in fromcode are replaced by '#';
 * characters that cannot be represented in tocode are replaced by '?'.
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */

/* Return nonzero if name is "UTF-8", ignoring ASCII case. */
static int is_utf8_name_(const char *name)
{
  /*
   * Don't use strcasecmp() as it's locale-dependent.
   * Every comparison fails on '\0', so the && chain stops at the
   * terminator and a short name is never read past its end.
   */
  return (name[0] == 'U' || name[0] == 'u') &&
         (name[1] == 'T' || name[1] == 't') &&
         (name[2] == 'F' || name[2] == 'f') &&
         name[3] == '-' &&
         name[4] == '8' &&
         name[5] == '\0';
}

/*
 * Advance *pos past exactly one UTF-8 character: the byte at *pos plus
 * any continuation bytes (10xxxxxx) that follow it. *len must be nonzero.
 */
static void skip_utf8_char_(char **pos, size_t *len)
{
  for (++*pos, --*len; *len && (**pos & 0xC0) == 0x80; ++*pos, --*len)
    ;
}

int iconvert(const char *fromcode, const char *tocode,
             const char *from, size_t fromlen,
             char **to, size_t *tolen)
{
  int ret = 0;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  char *utfbuf = 0, *outbuf, *newbuf;
  size_t utflen, outlen, ibl, obl, k;
  char tbuf[2048];

  /* cd1: fromcode -> UTF-8 */
  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  /* cd2: UTF-8 -> tocode; stays (iconv_t)(-1) when tocode is UTF-8 itself */
  cd2 = (iconv_t)(-1);
  if (!is_utf8_name_(tocode)) {
    char *tocode1;

    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */
    /* 11 == strlen("//TRANSLIT") + 1 for the terminator */
    tocode1 = (char *)safe_malloc_add_2op_(strlen(tocode), /*+*/11);
    if (!tocode1)
      goto fail;

    strcpy(tocode1, tocode);
    strcat(tocode1, "//TRANSLIT");
    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /*
     * Fall back to a plain conversion. The source must still be UTF-8,
     * because cd2 is only ever fed the intermediate UTF-8 buffer.
     */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");

    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = (char *)malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() takes a non-const pointer */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
           (k == (size_t)(-1) &&
            (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    /*
     * Input remains. Fewer than 6 free output bytes is taken to mean
     * the buffer was too small (presumably 6 is the longest UTF-8
     * sequence considered); otherwise the input byte is invalid.
     */
    if (obl < 6) {
      /* Enlarge the buffer */
      if (utflen * 2 < utflen) /* overflow check */
        goto fail;
      utflen *= 2;
      newbuf = (char *)realloc(utfbuf, utflen);
      if (!newbuf)
        goto fail;
      ob = (ob - utfbuf) + newbuf;
      obl = utflen - (ob - newbuf);
      utfbuf = newbuf;
    }
    else {
      /* Invalid input: drop one byte, emit '#', reset the converter */
      ib++, ibl--;
      *ob++ = '#', obl--;
      ret = 2;
      iconv(cd1, 0, 0, 0, 0);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    if (tolen)
      *tolen = ob - utfbuf;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    /* Resize to the exact length plus one byte for the terminator */
    newbuf = (char *)safe_realloc_add_2op_(utfbuf, (ob - utfbuf), /*+*/1);
    if (!newbuf)
      goto fail;
    ob = (ob - utfbuf) + newbuf;
    *ob = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /*
   * Truncate the buffer to be tidy. Skip this for empty data:
   * realloc() with size 0 may free the block and return a null
   * pointer, which would send us to the fail path with an
   * already-freed utfbuf.
   */
  utflen = ob - utfbuf;
  if (utflen) {
    newbuf = (char *)realloc(utfbuf, utflen);
    if (!newbuf)
      goto fail;
    utfbuf = newbuf;
  }

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    /* tbuf is a scratch area reused on every round; only lengths are kept */
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == E2BIG && ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      skip_utf8_char_(&ib, &ibl);
    }
    outlen += ob - tbuf;
  }
  /* Account for whatever the converter emits when its state is reset */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = (char *)safe_malloc_add_2op_(outlen, /*+*/1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
           (k == (size_t)(-1) && errno == EILSEQ && ibl));
    /* Any nonzero result (an error or a conversion count) marks it inexact */
    if (k && !ret)
      ret = 1;
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
             (k == (size_t)(-1) && errno == EILSEQ && tbl));
      skip_utf8_char_(&ib, &ibl);
    }
  }
  k = iconv(cd2, 0, 0, &ob, &obl);
  assert(!k);
  assert(!obl); /* the measured length must be consumed exactly */
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf); /* free(NULL) is a no-op */
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}
252
253#endif /* HAVE_ICONV */
254