1/* Character set conversion with error handling and autodetection.
2   Copyright (C) 2002, 2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3   Written by Bruno Haible.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU Lesser General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include "striconveha.h"
22
23#include <errno.h>
24#include <stdlib.h>
25#include <string.h>
26
27#include "malloca.h"
28#include "c-strcase.h"
29#include "striconveh.h"
30
/* Number of elements of the array A.  A must be an expression of array
   type, not a pointer.  The argument is parenthesized in the expansion so
   that an argument built from a cast or a dereference still indexes the
   whole array expression rather than only its last operand.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
32
33
34/* Autodetection list.  */
35
/* One node of the singly linked list of autodetection aliases.  */
struct autodetect_alias
{
  /* Following node in the list, or NULL for the last node.  */
  struct autodetect_alias *next;
  /* Alias name; it is compared against the requested source encoding.  */
  char const *name;
  /* NULL-terminated array of the encodings to attempt, in order.  */
  char const * const *encodings_to_try;
};
42
/* Candidate encodings for the "autodetect_utf8" alias.
   UTF-8 goes first: hardly any ISO-8859-1 input happens to be valid
   UTF-8, whereas a lot of UTF-8 input is also valid ISO-8859-1.  */
static const char * const autodetect_utf8_try[] =
{
  "UTF-8",
  "ISO-8859-1",
  NULL
};
/* Candidate encodings for the "autodetect_jp" alias.
   The 7-bit encoding comes first; it fails as soon as the input
   contains a byte >= 0x80.
   EUC-JP comes next.  Short SHIFT_JIS inputs may then come out wrong;
   this cannot be avoided, and the blame will fall on SHIFT_JIS.  Were
   SHIFT_JIS tried first, some short EUC-JP inputs would come out wrong
   instead, and the blame would fall on EUC-JP and Unix, which would not
   be good.
   SHIFT_JIS is the last resort.  */
static const char * const autodetect_jp_try[] =
{
  "ISO-2022-JP-2",
  "EUC-JP",
  "SHIFT_JIS",
  NULL
};
/* Candidate encodings for the "autodetect_kr" alias.
   The 7-bit encoding comes first; it fails as soon as the input
   contains a byte >= 0x80.
   EUC-KR is the last resort.  */
static const char * const autodetect_kr_try[] =
{
  "ISO-2022-KR",
  "EUC-KR",
  NULL
};
71
72static struct autodetect_alias autodetect_predefined[] =
73{
74  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
75  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
76  { NULL,                      "autodetect_kr",   autodetect_kr_try }
77};
78
79static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
80static struct autodetect_alias **autodetect_list_end =
81  &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;
82
83int
84uniconv_register_autodetect (const char *name,
85                             const char * const *try_in_order)
86{
87  size_t namelen;
88  size_t listlen;
89  size_t memneed;
90  size_t i;
91  char *memory;
92  struct autodetect_alias *new_alias;
93  char *new_name;
94  const char **new_try_in_order;
95
96  /* The TRY_IN_ORDER list must not be empty.  */
97  if (try_in_order[0] == NULL)
98    {
99      errno = EINVAL;
100      return -1;
101    }
102
103  /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
104     with dynamic extent.  */
105  namelen = strlen (name) + 1;
106  memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *);
107  for (i = 0; try_in_order[i] != NULL; i++)
108    memneed += sizeof (char *) + strlen (try_in_order[i]) + 1;
109  listlen = i;
110
111  memory = (char *) malloc (memneed);
112  if (memory != NULL)
113    {
114      new_alias = (struct autodetect_alias *) memory;
115      memory += sizeof (struct autodetect_alias);
116
117      new_try_in_order = (const char **) memory;
118      memory += (listlen + 1) * sizeof (char *);
119
120      new_name = (char *) memory;
121      memcpy (new_name, name, namelen);
122      memory += namelen;
123
124      for (i = 0; i < listlen; i++)
125        {
126          size_t len = strlen (try_in_order[i]) + 1;
127          memcpy (memory, try_in_order[i], len);
128          new_try_in_order[i] = (const char *) memory;
129          memory += len;
130        }
131      new_try_in_order[i] = NULL;
132
133      /* Now insert the new alias.  */
134      new_alias->name = new_name;
135      new_alias->encodings_to_try = new_try_in_order;
136      new_alias->next = NULL;
137      /* FIXME: Not multithread-safe.  */
138      *autodetect_list_end = new_alias;
139      autodetect_list_end = &new_alias->next;
140      return 0;
141    }
142  else
143    {
144      errno = ENOMEM;
145      return -1;
146    }
147}
148
149/* Like mem_iconveha, except no handling of transliteration.  */
150static int
151mem_iconveha_notranslit (const char *src, size_t srclen,
152                         const char *from_codeset, const char *to_codeset,
153                         enum iconv_ilseq_handler handler,
154                         size_t *offsets,
155                         char **resultp, size_t *lengthp)
156{
157  int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
158                            offsets, resultp, lengthp);
159  if (retval >= 0 || errno != EINVAL)
160    return retval;
161  else
162    {
163      struct autodetect_alias *alias;
164
165      /* Unsupported from_codeset or to_codeset. Check whether the caller
166         requested autodetection.  */
167      for (alias = autodetect_list; alias != NULL; alias = alias->next)
168        if (strcmp (from_codeset, alias->name) == 0)
169          {
170            const char * const *encodings;
171
172            if (handler != iconveh_error)
173              {
174                /* First try all encodings without any forgiving.  */
175                encodings = alias->encodings_to_try;
176                do
177                  {
178                    retval = mem_iconveha_notranslit (src, srclen,
179                                                      *encodings, to_codeset,
180                                                      iconveh_error, offsets,
181                                                      resultp, lengthp);
182                    if (!(retval < 0 && errno == EILSEQ))
183                      return retval;
184                    encodings++;
185                  }
186                while (*encodings != NULL);
187              }
188
189            encodings = alias->encodings_to_try;
190            do
191              {
192                retval = mem_iconveha_notranslit (src, srclen,
193                                                  *encodings, to_codeset,
194                                                  handler, offsets,
195                                                  resultp, lengthp);
196                if (!(retval < 0 && errno == EILSEQ))
197                  return retval;
198                encodings++;
199              }
200            while (*encodings != NULL);
201
202            /* Return the last call's result.  */
203            return -1;
204          }
205
206      /* It wasn't an autodetection name.  */
207      errno = EINVAL;
208      return -1;
209    }
210}
211
212int
213mem_iconveha (const char *src, size_t srclen,
214              const char *from_codeset, const char *to_codeset,
215              bool transliterate,
216              enum iconv_ilseq_handler handler,
217              size_t *offsets,
218              char **resultp, size_t *lengthp)
219{
220  if (srclen == 0)
221    {
222      /* Nothing to convert.  */
223      *lengthp = 0;
224      return 0;
225    }
226
227  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
228     we want to use transliteration.  */
229#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
230  if (transliterate)
231    {
232      int retval;
233      size_t len = strlen (to_codeset);
234      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
235      memcpy (to_codeset_suffixed, to_codeset, len);
236      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
237
238      retval = mem_iconveha_notranslit (src, srclen,
239                                        from_codeset, to_codeset_suffixed,
240                                        handler, offsets, resultp, lengthp);
241
242      freea (to_codeset_suffixed);
243
244      return retval;
245    }
246  else
247#endif
248    return mem_iconveha_notranslit (src, srclen,
249                                    from_codeset, to_codeset,
250                                    handler, offsets, resultp, lengthp);
251}
252
253/* Like str_iconveha, except no handling of transliteration.  */
254static char *
255str_iconveha_notranslit (const char *src,
256                         const char *from_codeset, const char *to_codeset,
257                         enum iconv_ilseq_handler handler)
258{
259  char *result = str_iconveh (src, from_codeset, to_codeset, handler);
260
261  if (result != NULL || errno != EINVAL)
262    return result;
263  else
264    {
265      struct autodetect_alias *alias;
266
267      /* Unsupported from_codeset or to_codeset. Check whether the caller
268         requested autodetection.  */
269      for (alias = autodetect_list; alias != NULL; alias = alias->next)
270        if (strcmp (from_codeset, alias->name) == 0)
271          {
272            const char * const *encodings;
273
274            if (handler != iconveh_error)
275              {
276                /* First try all encodings without any forgiving.  */
277                encodings = alias->encodings_to_try;
278                do
279                  {
280                    result = str_iconveha_notranslit (src,
281                                                      *encodings, to_codeset,
282                                                      iconveh_error);
283                    if (!(result == NULL && errno == EILSEQ))
284                      return result;
285                    encodings++;
286                  }
287                while (*encodings != NULL);
288              }
289
290            encodings = alias->encodings_to_try;
291            do
292              {
293                result = str_iconveha_notranslit (src,
294                                                  *encodings, to_codeset,
295                                                  handler);
296                if (!(result == NULL && errno == EILSEQ))
297                  return result;
298                encodings++;
299              }
300            while (*encodings != NULL);
301
302            /* Return the last call's result.  */
303            return NULL;
304          }
305
306      /* It wasn't an autodetection name.  */
307      errno = EINVAL;
308      return NULL;
309    }
310}
311
312char *
313str_iconveha (const char *src,
314              const char *from_codeset, const char *to_codeset,
315              bool transliterate,
316              enum iconv_ilseq_handler handler)
317{
318  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
319    {
320      char *result = strdup (src);
321
322      if (result == NULL)
323        errno = ENOMEM;
324      return result;
325    }
326
327  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
328     we want to use transliteration.  */
329#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
330  if (transliterate)
331    {
332      char *result;
333      size_t len = strlen (to_codeset);
334      char *to_codeset_suffixed = (char *) malloca (len + 10 + 1);
335      memcpy (to_codeset_suffixed, to_codeset, len);
336      memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1);
337
338      result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed,
339                                        handler);
340
341      freea (to_codeset_suffixed);
342
343      return result;
344    }
345  else
346#endif
347    return str_iconveha_notranslit (src, from_codeset, to_codeset, handler);
348}
349