1/*
2 * Copyright (C) 1999-2006 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18 * Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21#include <iconv.h>
22
23#include <stdlib.h>
24#include <string.h>
25#include "config.h"
26#include "localcharset.h"
27
#if ENABLE_EXTRA
/*
 * Extra encodings were requested: enable every system dependent
 * encoding set, whatever the system being compiled for, plus the
 * extra encodings.
 */
#define USE_AIX
#define USE_OSF1
#define USE_DOS
#define USE_EXTRA
#else
/*
 * Otherwise enable only those system dependent encoding sets that
 * the current system needs.
 */
#if defined(_AIX)
#define USE_AIX
#endif
#if defined(__osf__) || defined(VMS)
#define USE_OSF1
#endif
#if defined(__DJGPP__) || (defined(_WIN32) && (defined(_MSC_VER) || defined(__MINGW32__)))
#define USE_DOS
#endif
#endif
52
/*
 * Data type for general conversion loop.
 * Holds the two entry points that iconv() dispatches to for a given
 * conversion descriptor.
 */
struct loop_funcs {
  /* Used by iconv() when an input buffer is supplied. */
  size_t (*loop_convert) (iconv_t icd,
                          const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft);
  /* Used by iconv() when inbuf or *inbuf is NULL. */
  size_t (*loop_reset) (iconv_t icd,
                        char **outbuf, size_t *outbytesleft);
};
63
64/*
65 * Converters.
66 */
67#include "converters.h"
68
69/*
70 * Transliteration tables.
71 */
72#include "cjk_variants.h"
73#include "translit.h"
74
75/*
76 * Table of all supported encodings.
77 */
78struct encoding {
79  struct mbtowc_funcs ifuncs; /* conversion multibyte -> unicode */
80  struct wctomb_funcs ofuncs; /* conversion unicode -> multibyte */
81  int oflags;                 /* flags for unicode -> multibyte conversion */
82};
83enum {
84#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
85  ei_##xxx ,
86#include "encodings.def"
87#ifdef USE_AIX
88#include "encodings_aix.def"
89#endif
90#ifdef USE_OSF1
91#include "encodings_osf1.def"
92#endif
93#ifdef USE_DOS
94#include "encodings_dos.def"
95#endif
96#ifdef USE_EXTRA
97#include "encodings_extra.def"
98#endif
99#include "encodings_local.def"
100#undef DEFENCODING
101ei_for_broken_compilers_that_dont_like_trailing_commas
102};
103#include "flags.h"
104static struct encoding const all_encodings[] = {
105#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
106  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, ei_##xxx##_oflags },
107#include "encodings.def"
108#ifdef USE_AIX
109#include "encodings_aix.def"
110#endif
111#ifdef USE_OSF1
112#include "encodings_osf1.def"
113#endif
114#ifdef USE_DOS
115#include "encodings_dos.def"
116#endif
117#ifdef USE_EXTRA
118#include "encodings_extra.def"
119#endif
120#undef DEFENCODING
121#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
122  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, 0 },
123#include "encodings_local.def"
124#undef DEFENCODING
125};
126
127/*
128 * Conversion loops.
129 */
130#include "loops.h"
131
132/*
133 * Alias lookup function.
134 * Defines
135 *   struct alias { int name; unsigned int encoding_index; };
136 *   const struct alias * aliases_lookup (const char *str, unsigned int len);
137 *   #define MAX_WORD_LENGTH ...
138 */
139#include "aliases.h"
140
141/*
142 * System dependent alias lookup function.
143 * Defines
144 *   const struct alias * aliases2_lookup (const char *str);
145 */
146#if defined(USE_AIX) || defined(USE_OSF1) || defined(USE_DOS) || defined(USE_EXTRA) /* || ... */
147struct stringpool2_t {
148#define S(tag,name,encoding_index) char stringpool_##tag[sizeof(name)];
149#include "aliases2.h"
150#undef S
151};
152static const struct stringpool2_t stringpool2_contents = {
153#define S(tag,name,encoding_index) name,
154#include "aliases2.h"
155#undef S
156};
157#define stringpool2 ((const char *) &stringpool2_contents)
158static const struct alias sysdep_aliases[] = {
159#define S(tag,name,encoding_index) { (int)(long)&((struct stringpool2_t *)0)->stringpool_##tag, encoding_index },
160#include "aliases2.h"
161#undef S
162};
163#ifdef __GNUC__
164__inline
165#endif
166const struct alias *
167aliases2_lookup (register const char *str)
168{
169  const struct alias * ptr;
170  unsigned int count;
171  for (ptr = sysdep_aliases, count = sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0]); count > 0; ptr++, count--)
172    if (!strcmp(str, stringpool2 + ptr->name))
173      return ptr;
174  return NULL;
175}
176#else
177#define aliases2_lookup(str)  NULL
178#define stringpool2  NULL
179#endif
180
181#if 0
/* Like !strcasecmp, except that both strings can be assumed to be ASCII
   and the first string can be assumed to be in uppercase.
   Returns 1 if the strings are equal in that sense, 0 otherwise. */
static int strequal (const char* str1, const char* str2)
{
  const unsigned char* upper = (const unsigned char *) str1;
  const unsigned char* other = (const unsigned char *) str2;
  unsigned char a;
  unsigned char b;

  do {
    a = *upper++;
    b = *other++;
    /* End of the first string: equal only if the second one ends here too. */
    if (a == 0)
      break;
    /* Only the second string needs to be converted to upper case. */
    if (b >= 'a' && b <= 'z')
      b -= 'a'-'A';
  } while (a == b);
  return (a == b);
}
200#endif
201
202iconv_t iconv_open (const char* tocode, const char* fromcode)
203{
204  struct conv_struct * cd;
205  char buf[MAX_WORD_LENGTH+10+1];
206  const char* cp;
207  char* bp;
208  const struct alias * ap;
209  unsigned int count;
210  unsigned int from_index;
211  int from_wchar;
212  unsigned int to_index;
213  int to_wchar;
214  int transliterate = 0;
215  int discard_ilseq = 0;
216
217  /* Before calling aliases_lookup, convert the input string to upper case,
218   * and check whether it's entirely ASCII (we call gperf with option "-7"
219   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
220   * or if it's too long, it is not a valid encoding name.
221   */
222  for (to_wchar = 0;;) {
223    /* Search tocode in the table. */
224    for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
225      unsigned char c = * (unsigned char *) cp;
226      if (c >= 0x80)
227        goto invalid;
228      if (c >= 'a' && c <= 'z')
229        c -= 'a'-'A';
230      *bp = c;
231      if (c == '\0')
232        break;
233      if (--count == 0)
234        goto invalid;
235    }
236    for (;;) {
237      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
238        bp -= 10;
239        *bp = '\0';
240        transliterate = 1;
241        continue;
242      }
243      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
244        bp -= 8;
245        *bp = '\0';
246        discard_ilseq = 1;
247        continue;
248      }
249      break;
250    }
251    if (buf[0] == '\0') {
252      tocode = locale_charset();
253      /* Avoid an endless loop that could occur when using an older version
254         of localcharset.c. */
255      if (tocode[0] == '\0')
256        goto invalid;
257      continue;
258    }
259    ap = aliases_lookup(buf,bp-buf);
260    if (ap == NULL) {
261      ap = aliases2_lookup(buf);
262      if (ap == NULL)
263        goto invalid;
264    }
265    if (ap->encoding_index == ei_local_char) {
266      tocode = locale_charset();
267      /* Avoid an endless loop that could occur when using an older version
268         of localcharset.c. */
269      if (tocode[0] == '\0')
270        goto invalid;
271      continue;
272    }
273    if (ap->encoding_index == ei_local_wchar_t) {
274#if __STDC_ISO_10646__
275      if (sizeof(wchar_t) == 4) {
276        to_index = ei_ucs4internal;
277        break;
278      }
279      if (sizeof(wchar_t) == 2) {
280        to_index = ei_ucs2internal;
281        break;
282      }
283      if (sizeof(wchar_t) == 1) {
284        to_index = ei_iso8859_1;
285        break;
286      }
287#endif
288#if HAVE_MBRTOWC
289      to_wchar = 1;
290      tocode = locale_charset();
291      continue;
292#endif
293      goto invalid;
294    }
295    to_index = ap->encoding_index;
296    break;
297  }
298  for (from_wchar = 0;;) {
299    /* Search fromcode in the table. */
300    for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
301      unsigned char c = * (unsigned char *) cp;
302      if (c >= 0x80)
303        goto invalid;
304      if (c >= 'a' && c <= 'z')
305        c -= 'a'-'A';
306      *bp = c;
307      if (c == '\0')
308        break;
309      if (--count == 0)
310        goto invalid;
311    }
312    for (;;) {
313      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
314        bp -= 10;
315        *bp = '\0';
316        continue;
317      }
318      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
319        bp -= 8;
320        *bp = '\0';
321        continue;
322      }
323      break;
324    }
325    if (buf[0] == '\0') {
326      fromcode = locale_charset();
327      /* Avoid an endless loop that could occur when using an older version
328         of localcharset.c. */
329      if (fromcode[0] == '\0')
330        goto invalid;
331      continue;
332    }
333    ap = aliases_lookup(buf,bp-buf);
334    if (ap == NULL) {
335      ap = aliases2_lookup(buf);
336      if (ap == NULL)
337        goto invalid;
338    }
339    if (ap->encoding_index == ei_local_char) {
340      fromcode = locale_charset();
341      /* Avoid an endless loop that could occur when using an older version
342         of localcharset.c. */
343      if (fromcode[0] == '\0')
344        goto invalid;
345      continue;
346    }
347    if (ap->encoding_index == ei_local_wchar_t) {
348#if __STDC_ISO_10646__
349      if (sizeof(wchar_t) == 4) {
350        from_index = ei_ucs4internal;
351        break;
352      }
353      if (sizeof(wchar_t) == 2) {
354        from_index = ei_ucs2internal;
355        break;
356      }
357      if (sizeof(wchar_t) == 1) {
358        from_index = ei_iso8859_1;
359        break;
360      }
361#endif
362#if HAVE_WCRTOMB
363      from_wchar = 1;
364      fromcode = locale_charset();
365      continue;
366#endif
367      goto invalid;
368    }
369    from_index = ap->encoding_index;
370    break;
371  }
372  cd = (struct conv_struct *) malloc(from_wchar != to_wchar
373                                     ? sizeof(struct wchar_conv_struct)
374                                     : sizeof(struct conv_struct));
375  if (cd == NULL) {
376    errno = ENOMEM;
377    return (iconv_t)(-1);
378  }
379  cd->iindex = from_index;
380  cd->ifuncs = all_encodings[from_index].ifuncs;
381  cd->oindex = to_index;
382  cd->ofuncs = all_encodings[to_index].ofuncs;
383  cd->oflags = all_encodings[to_index].oflags;
384  /* Initialize the loop functions. */
385#if HAVE_MBRTOWC
386  if (to_wchar) {
387#if HAVE_WCRTOMB
388    if (from_wchar) {
389      cd->lfuncs.loop_convert = wchar_id_loop_convert;
390      cd->lfuncs.loop_reset = wchar_id_loop_reset;
391    } else
392#endif
393    {
394      cd->lfuncs.loop_convert = wchar_to_loop_convert;
395      cd->lfuncs.loop_reset = wchar_to_loop_reset;
396    }
397  } else
398#endif
399  {
400#if HAVE_WCRTOMB
401    if (from_wchar) {
402      cd->lfuncs.loop_convert = wchar_from_loop_convert;
403      cd->lfuncs.loop_reset = wchar_from_loop_reset;
404    } else
405#endif
406    {
407      cd->lfuncs.loop_convert = unicode_loop_convert;
408      cd->lfuncs.loop_reset = unicode_loop_reset;
409    }
410  }
411  /* Initialize the states. */
412  memset(&cd->istate,'\0',sizeof(state_t));
413  memset(&cd->ostate,'\0',sizeof(state_t));
414  /* Initialize the operation flags. */
415  cd->transliterate = transliterate;
416  cd->discard_ilseq = discard_ilseq;
417  #ifndef LIBICONV_PLUG
418  cd->fallbacks.mb_to_uc_fallback = NULL;
419  cd->fallbacks.uc_to_mb_fallback = NULL;
420  cd->fallbacks.mb_to_wc_fallback = NULL;
421  cd->fallbacks.wc_to_mb_fallback = NULL;
422  cd->fallbacks.data = NULL;
423  cd->hooks.uc_hook = NULL;
424  cd->hooks.wc_hook = NULL;
425  cd->hooks.data = NULL;
426  #endif
427  /* Initialize additional fields. */
428  if (from_wchar != to_wchar) {
429    struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) cd;
430    memset(&wcd->state,'\0',sizeof(mbstate_t));
431  }
432  /* Done. */
433  return (iconv_t)cd;
434invalid:
435  errno = EINVAL;
436  return (iconv_t)(-1);
437}
438
439size_t iconv (iconv_t icd,
440              ICONV_CONST char* * inbuf, size_t *inbytesleft,
441              char* * outbuf, size_t *outbytesleft)
442{
443  conv_t cd = (conv_t) icd;
444  if (inbuf == NULL || *inbuf == NULL)
445    return cd->lfuncs.loop_reset(icd,outbuf,outbytesleft);
446  else
447    return cd->lfuncs.loop_convert(icd,
448                                   (const char* *)inbuf,inbytesleft,
449                                   outbuf,outbytesleft);
450}
451
/*
 * Frees the conversion descriptor ICD that was allocated by iconv_open.
 * Returns 0 on success.
 * (iconv_t)(-1) is the failure value of iconv_open and not an allocated
 * descriptor; handing it to free() would be undefined behavior, so it is
 * rejected: the function then returns -1 with errno set to EBADF.
 */
int iconv_close (iconv_t icd)
{
  if (icd == (iconv_t)(-1)) {
    errno = EBADF;
    return -1;
  }
  free((void *) icd);
  return 0;
}
458
459#ifndef LIBICONV_PLUG
460
461int iconvctl (iconv_t icd, int request, void* argument)
462{
463  conv_t cd = (conv_t) icd;
464  switch (request) {
465    case ICONV_TRIVIALP:
466      *(int *)argument =
467        ((cd->lfuncs.loop_convert == unicode_loop_convert
468          && cd->iindex == cd->oindex)
469         || cd->lfuncs.loop_convert == wchar_id_loop_convert
470         ? 1 : 0);
471      return 0;
472    case ICONV_GET_TRANSLITERATE:
473      *(int *)argument = cd->transliterate;
474      return 0;
475    case ICONV_SET_TRANSLITERATE:
476      cd->transliterate = (*(const int *)argument ? 1 : 0);
477      return 0;
478    case ICONV_GET_DISCARD_ILSEQ:
479      *(int *)argument = cd->discard_ilseq;
480      return 0;
481    case ICONV_SET_DISCARD_ILSEQ:
482      cd->discard_ilseq = (*(const int *)argument ? 1 : 0);
483      return 0;
484    case ICONV_SET_HOOKS:
485      if (argument != NULL) {
486        cd->hooks = *(const struct iconv_hooks *)argument;
487      } else {
488        cd->hooks.uc_hook = NULL;
489        cd->hooks.wc_hook = NULL;
490        cd->hooks.data = NULL;
491      }
492      return 0;
493    case ICONV_SET_FALLBACKS:
494      if (argument != NULL) {
495        cd->fallbacks = *(const struct iconv_fallbacks *)argument;
496      } else {
497        cd->fallbacks.mb_to_uc_fallback = NULL;
498        cd->fallbacks.uc_to_mb_fallback = NULL;
499        cd->fallbacks.mb_to_wc_fallback = NULL;
500        cd->fallbacks.wc_to_mb_fallback = NULL;
501        cd->fallbacks.data = NULL;
502      }
503      return 0;
504    default:
505      errno = EINVAL;
506      return -1;
507  }
508}
509
/* An alias after its name has been converted from 'int' to 'const char*'. */
struct nalias { const char* name; unsigned int encoding_index; };

/* qsort comparison function: orders 'struct nalias' elements by ascending
   encoding_index.  Returns a negative, zero or positive value.
   The indices are compared as the unsigned values they are; subtracting
   them after a cast to 'int' would yield the wrong sign (or overflow) for
   values that do not fit into an 'int'. */
static int compare_by_index (const void * arg1, const void * arg2)
{
  unsigned int index1 = ((const struct nalias *) arg1)->encoding_index;
  unsigned int index2 = ((const struct nalias *) arg2)->encoding_index;
  return (index1 > index2) - (index1 < index2);
}
519
/* qsort comparison function for an array of 'const char *' names.
   Names are ordered alphabetically (by strcmp), except that all names
   starting with "CS" are placed after all other names.
   Returns 0 for equal names, otherwise a negative or positive value. */
static int compare_by_name (const void * arg1, const void * arg2)
{
  const char * const first = *(const char * const *)arg1;
  const char * const second = *(const char * const *)arg2;
  int order = strcmp(first,second);
  int first_is_cs;
  int second_is_cs;

  if (order == 0)
    return 0;
  first_is_cs = (first[0]=='C' && first[1]=='S');
  second_is_cs = (second[0]=='C' && second[1]=='S');
  /* The "CS" difference is weighted by 4, so that it always dominates the
     alphabetical part, which is +1 or -1. */
  return (first_is_cs - second_is_cs) * 4 + (order >= 0 ? 1 : -1);
}
532
533void iconvlist (int (*do_one) (unsigned int namescount,
534                               const char * const * names,
535                               void* data),
536                void* data)
537{
538#define aliascount1  sizeof(aliases)/sizeof(aliases[0])
539#ifndef aliases2_lookup
540#define aliascount2  sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0])
541#else
542#define aliascount2  0
543#endif
544#define aliascount  (aliascount1+aliascount2)
545  struct nalias aliasbuf[aliascount];
546  const char * namesbuf[aliascount];
547  size_t num_aliases;
548  {
549    /* Put all existing aliases into a buffer. */
550    size_t i;
551    size_t j;
552    j = 0;
553    for (i = 0; i < aliascount1; i++) {
554      const struct alias * p = &aliases[i];
555      if (p->name >= 0
556          && p->encoding_index != ei_local_char
557          && p->encoding_index != ei_local_wchar_t) {
558        aliasbuf[j].name = stringpool + p->name;
559        aliasbuf[j].encoding_index = p->encoding_index;
560        j++;
561      }
562    }
563#ifndef aliases2_lookup
564    for (i = 0; i < aliascount2; i++) {
565      aliasbuf[j].name = stringpool2 + sysdep_aliases[i].name;
566      aliasbuf[j].encoding_index = sysdep_aliases[i].encoding_index;
567      j++;
568    }
569#endif
570    num_aliases = j;
571  }
572  /* Sort by encoding_index. */
573  if (num_aliases > 1)
574    qsort(aliasbuf, num_aliases, sizeof(struct nalias), compare_by_index);
575  {
576    /* Process all aliases with the same encoding_index together. */
577    size_t j;
578    j = 0;
579    while (j < num_aliases) {
580      unsigned int ei = aliasbuf[j].encoding_index;
581      size_t i = 0;
582      do
583        namesbuf[i++] = aliasbuf[j++].name;
584      while (j < num_aliases && aliasbuf[j].encoding_index == ei);
585      if (i > 1)
586        qsort(namesbuf, i, sizeof(const char *), compare_by_name);
587      /* Call the callback. */
588      if (do_one(i,namesbuf,data))
589        break;
590    }
591  }
592#undef aliascount
593#undef aliascount2
594#undef aliascount1
595}
596
597/*
598 * Table of canonical names of encodings.
599 * Instead of strings, it contains offsets into stringpool and stringpool2.
600 */
601static const unsigned short all_canonical[] = {
602#include "canonical.h"
603#ifdef USE_AIX
604#include "canonical_aix.h"
605#endif
606#ifdef USE_OSF1
607#include "canonical_osf1.h"
608#endif
609#ifdef USE_DOS
610#include "canonical_dos.h"
611#endif
612#ifdef USE_EXTRA
613#include "canonical_extra.h"
614#endif
615#include "canonical_local.h"
616};
617
618const char * iconv_canonicalize (const char * name)
619{
620  const char* code;
621  char buf[MAX_WORD_LENGTH+10+1];
622  const char* cp;
623  char* bp;
624  const struct alias * ap;
625  unsigned int count;
626  unsigned int index;
627  const char* pool;
628
629  /* Before calling aliases_lookup, convert the input string to upper case,
630   * and check whether it's entirely ASCII (we call gperf with option "-7"
631   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
632   * or if it's too long, it is not a valid encoding name.
633   */
634  for (code = name;;) {
635    /* Search code in the table. */
636    for (cp = code, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
637      unsigned char c = * (unsigned char *) cp;
638      if (c >= 0x80)
639        goto invalid;
640      if (c >= 'a' && c <= 'z')
641        c -= 'a'-'A';
642      *bp = c;
643      if (c == '\0')
644        break;
645      if (--count == 0)
646        goto invalid;
647    }
648    for (;;) {
649      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
650        bp -= 10;
651        *bp = '\0';
652        continue;
653      }
654      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
655        bp -= 8;
656        *bp = '\0';
657        continue;
658      }
659      break;
660    }
661    if (buf[0] == '\0') {
662      code = locale_charset();
663      /* Avoid an endless loop that could occur when using an older version
664         of localcharset.c. */
665      if (code[0] == '\0')
666        goto invalid;
667      continue;
668    }
669    pool = stringpool;
670    ap = aliases_lookup(buf,bp-buf);
671    if (ap == NULL) {
672      pool = stringpool2;
673      ap = aliases2_lookup(buf);
674      if (ap == NULL)
675        goto invalid;
676    }
677    if (ap->encoding_index == ei_local_char) {
678      code = locale_charset();
679      /* Avoid an endless loop that could occur when using an older version
680         of localcharset.c. */
681      if (code[0] == '\0')
682        goto invalid;
683      continue;
684    }
685    if (ap->encoding_index == ei_local_wchar_t) {
686#if __STDC_ISO_10646__
687      if (sizeof(wchar_t) == 4) {
688        index = ei_ucs4internal;
689        break;
690      }
691      if (sizeof(wchar_t) == 2) {
692        index = ei_ucs2internal;
693        break;
694      }
695      if (sizeof(wchar_t) == 1) {
696        index = ei_iso8859_1;
697        break;
698      }
699#endif
700    }
701    index = ap->encoding_index;
702    break;
703  }
704  return all_canonical[index] + pool;
705 invalid:
706  return name;
707}
708
/* Version number of the library, as a variable with external linkage
   that is initialized from the _LIBICONV_VERSION macro. */
int _libiconv_version = _LIBICONV_VERSION;
710
#if defined(__FreeBSD__) && !defined(__gnu_freebsd__)
/* GNU libiconv is the native FreeBSD iconv implementation since 2002.
   It wants to define the symbols 'iconv_open', 'iconv', 'iconv_close'.  */
/* strong_alias declares ALIASNAME, with the type of NAME, as an alias of
   the symbol NAME.  The two macro levels let macro arguments be expanded
   before the name is turned into a string. */
#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
#define _strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
/* Remove any macro definitions of the three names, so that the plain
   identifiers can be used as alias names below. */
#undef iconv_open
#undef iconv
#undef iconv_close
strong_alias (libiconv_open, iconv_open)
strong_alias (libiconv, iconv)
strong_alias (libiconv_close, iconv_close)
#endif
724
725#endif
726