1/* Determine a canonical name for the current locale's character encoding.
2
3   Copyright (C) 2000-2006, 2008-2014 Free Software Foundation, Inc.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, see <http://www.gnu.org/licenses/>.  */
17
18/* Written by Bruno Haible <bruno@clisp.org>.  */
19
20#include <config.h>
21
22/* Specification.  */
23#include "localcharset.h"
24
25#include <fcntl.h>
26#include <stddef.h>
27#include <stdio.h>
28#include <string.h>
29#include <stdlib.h>
30
31#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
33#endif
34
35#if defined _WIN32 || defined __WIN32__
36# define WINDOWS_NATIVE
37#endif
38
39#if defined __EMX__
40/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
41# ifndef OS2
42#  define OS2
43# endif
44#endif
45
46#if !defined WINDOWS_NATIVE
47# include <unistd.h>
48# if HAVE_LANGINFO_CODESET
49#  include <langinfo.h>
50# else
51#  if 0 /* see comment below */
52#   include <locale.h>
53#  endif
54# endif
55# ifdef __CYGWIN__
56#  define WIN32_LEAN_AND_MEAN
57#  include <windows.h>
58# endif
59#elif defined WINDOWS_NATIVE
60# define WIN32_LEAN_AND_MEAN
61# include <windows.h>
62#endif
63#if defined OS2
64# define INCL_DOS
65# include <os2.h>
66#endif
67
68/* For MB_CUR_MAX_L */
69#if defined DARWIN7
70# include <xlocale.h>
71#endif
72
73#if ENABLE_RELOCATABLE
74# include "relocatable.h"
75#else
76# define relocate(pathname) (pathname)
77#endif
78
79/* Get LIBDIR.  */
80#ifndef LIBDIR
81# include "configmake.h"
82#endif
83
84/* Define O_NOFOLLOW to 0 on platforms where it does not exist.  */
85#ifndef O_NOFOLLOW
86# define O_NOFOLLOW 0
87#endif
88
89#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
90  /* Native Windows, Cygwin, OS/2, DOS */
91# define ISSLASH(C) ((C) == '/' || (C) == '\\')
92#endif
93
94#ifndef DIRECTORY_SEPARATOR
95# define DIRECTORY_SEPARATOR '/'
96#endif
97
98#ifndef ISSLASH
99# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
100#endif
101
102#if HAVE_DECL_GETC_UNLOCKED
103# undef getc
104# define getc getc_unlocked
105#endif
106
107/* The following static variable is declared 'volatile' to avoid a
108   possible multithread problem in the function get_charset_aliases. If we
109   are running in a threaded environment, and if two threads initialize
110   'charset_aliases' simultaneously, both will produce the same value,
111   and everything will be ok if the two assignments to 'charset_aliases'
112   are atomic. But I don't know what will happen if the two assignments mix.  */
113#if __STDC__ != 1
114# define volatile /* empty */
115#endif
116/* Pointer to the contents of the charset.alias file, if it has already been
117   read, else NULL.  Its format is:
118   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
119static const char * volatile charset_aliases;
120
121/* Return a pointer to the contents of the charset.alias file.  */
122static const char *
123get_charset_aliases (void)
124{
125  const char *cp;
126
127  cp = charset_aliases;
128  if (cp == NULL)
129    {
130#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
131      const char *dir;
132      const char *base = "charset.alias";
133      char *file_name;
134
135      /* Make it possible to override the charset.alias location.  This is
136         necessary for running the testsuite before "make install".  */
137      dir = getenv ("CHARSETALIASDIR");
138      if (dir == NULL || dir[0] == '\0')
139        dir = relocate (LIBDIR);
140
141      /* Concatenate dir and base into freshly allocated file_name.  */
142      {
143        size_t dir_len = strlen (dir);
144        size_t base_len = strlen (base);
145        int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
146        file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
147        if (file_name != NULL)
148          {
149            memcpy (file_name, dir, dir_len);
150            if (add_slash)
151              file_name[dir_len] = DIRECTORY_SEPARATOR;
152            memcpy (file_name + dir_len + add_slash, base, base_len + 1);
153          }
154      }
155
156      if (file_name == NULL)
157        /* Out of memory.  Treat the file as empty.  */
158        cp = "";
159      else
160        {
161          int fd;
162
163          /* Open the file.  Reject symbolic links on platforms that support
164             O_NOFOLLOW.  This is a security feature.  Without it, an attacker
165             could retrieve parts of the contents (namely, the tail of the
166             first line that starts with "* ") of an arbitrary file by placing
167             a symbolic link to that file under the name "charset.alias" in
168             some writable directory and defining the environment variable
169             CHARSETALIASDIR to point to that directory.  */
170          fd = open (file_name,
171                     O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
172          if (fd < 0)
173            /* File not found.  Treat it as empty.  */
174            cp = "";
175          else
176            {
177              FILE *fp;
178
179              fp = fdopen (fd, "r");
180              if (fp == NULL)
181                {
182                  /* Out of memory.  Treat the file as empty.  */
183                  close (fd);
184                  cp = "";
185                }
186              else
187                {
188                  /* Parse the file's contents.  */
189                  char *res_ptr = NULL;
190                  size_t res_size = 0;
191
192                  for (;;)
193                    {
194                      int c;
195                      char buf1[50+1];
196                      char buf2[50+1];
197                      size_t l1, l2;
198                      char *old_res_ptr;
199
200                      c = getc (fp);
201                      if (c == EOF)
202                        break;
203                      if (c == '\n' || c == ' ' || c == '\t')
204                        continue;
205                      if (c == '#')
206                        {
207                          /* Skip comment, to end of line.  */
208                          do
209                            c = getc (fp);
210                          while (!(c == EOF || c == '\n'));
211                          if (c == EOF)
212                            break;
213                          continue;
214                        }
215                      ungetc (c, fp);
216                      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
217                        break;
218                      l1 = strlen (buf1);
219                      l2 = strlen (buf2);
220                      old_res_ptr = res_ptr;
221                      if (res_size == 0)
222                        {
223                          res_size = l1 + 1 + l2 + 1;
224                          res_ptr = (char *) malloc (res_size + 1);
225                        }
226                      else
227                        {
228                          res_size += l1 + 1 + l2 + 1;
229                          res_ptr = (char *) realloc (res_ptr, res_size + 1);
230                        }
231                      if (res_ptr == NULL)
232                        {
233                          /* Out of memory. */
234                          res_size = 0;
235                          free (old_res_ptr);
236                          break;
237                        }
238                      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
239                      strcpy (res_ptr + res_size - (l2 + 1), buf2);
240                    }
241                  fclose (fp);
242                  if (res_size == 0)
243                    cp = "";
244                  else
245                    {
246                      *(res_ptr + res_size) = '\0';
247                      cp = res_ptr;
248                    }
249                }
250            }
251
252          free (file_name);
253        }
254
255#else
256
257# if defined DARWIN7
258      /* To avoid the trouble of installing a file that is shared by many
259         GNU packages -- many packaging systems have problems with this --,
260         simply inline the aliases here.  */
261      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262           "ISO8859-2" "\0" "ISO-8859-2" "\0"
263           "ISO8859-4" "\0" "ISO-8859-4" "\0"
264           "ISO8859-5" "\0" "ISO-8859-5" "\0"
265           "ISO8859-7" "\0" "ISO-8859-7" "\0"
266           "ISO8859-9" "\0" "ISO-8859-9" "\0"
267           "ISO8859-13" "\0" "ISO-8859-13" "\0"
268           "ISO8859-15" "\0" "ISO-8859-15" "\0"
269           "KOI8-R" "\0" "KOI8-R" "\0"
270           "KOI8-U" "\0" "KOI8-U" "\0"
271           "CP866" "\0" "CP866" "\0"
272           "CP949" "\0" "CP949" "\0"
273           "CP1131" "\0" "CP1131" "\0"
274           "CP1251" "\0" "CP1251" "\0"
275           "eucCN" "\0" "GB2312" "\0"
276           "GB2312" "\0" "GB2312" "\0"
277           "eucJP" "\0" "EUC-JP" "\0"
278           "eucKR" "\0" "EUC-KR" "\0"
279           "Big5" "\0" "BIG5" "\0"
280           "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
281           "GBK" "\0" "GBK" "\0"
282           "GB18030" "\0" "GB18030" "\0"
283           "SJIS" "\0" "SHIFT_JIS" "\0"
284           "ARMSCII-8" "\0" "ARMSCII-8" "\0"
285           "PT154" "\0" "PT154" "\0"
286         /*"ISCII-DEV" "\0" "?" "\0"*/
287           "*" "\0" "UTF-8" "\0";
288# endif
289
290# if defined VMS
291      /* To avoid the troubles of an extra file charset.alias_vms in the
292         sources of many GNU packages, simply inline the aliases here.  */
293      /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
294         "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
295         section 10.7 "Handling Different Character Sets".  */
296      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
297           "ISO8859-2" "\0" "ISO-8859-2" "\0"
298           "ISO8859-5" "\0" "ISO-8859-5" "\0"
299           "ISO8859-7" "\0" "ISO-8859-7" "\0"
300           "ISO8859-8" "\0" "ISO-8859-8" "\0"
301           "ISO8859-9" "\0" "ISO-8859-9" "\0"
302           /* Japanese */
303           "eucJP" "\0" "EUC-JP" "\0"
304           "SJIS" "\0" "SHIFT_JIS" "\0"
305           "DECKANJI" "\0" "DEC-KANJI" "\0"
306           "SDECKANJI" "\0" "EUC-JP" "\0"
307           /* Chinese */
308           "eucTW" "\0" "EUC-TW" "\0"
309           "DECHANYU" "\0" "DEC-HANYU" "\0"
310           "DECHANZI" "\0" "GB2312" "\0"
311           /* Korean */
312           "DECKOREAN" "\0" "EUC-KR" "\0";
313# endif
314
315# if defined WINDOWS_NATIVE || defined __CYGWIN__
316      /* To avoid the troubles of installing a separate file in the same
317         directory as the DLL and of retrieving the DLL's directory at
318         runtime, simply inline the aliases here.  */
319
320      cp = "CP936" "\0" "GBK" "\0"
321           "CP1361" "\0" "JOHAB" "\0"
322           "CP20127" "\0" "ASCII" "\0"
323           "CP20866" "\0" "KOI8-R" "\0"
324           "CP20936" "\0" "GB2312" "\0"
325           "CP21866" "\0" "KOI8-RU" "\0"
326           "CP28591" "\0" "ISO-8859-1" "\0"
327           "CP28592" "\0" "ISO-8859-2" "\0"
328           "CP28593" "\0" "ISO-8859-3" "\0"
329           "CP28594" "\0" "ISO-8859-4" "\0"
330           "CP28595" "\0" "ISO-8859-5" "\0"
331           "CP28596" "\0" "ISO-8859-6" "\0"
332           "CP28597" "\0" "ISO-8859-7" "\0"
333           "CP28598" "\0" "ISO-8859-8" "\0"
334           "CP28599" "\0" "ISO-8859-9" "\0"
335           "CP28605" "\0" "ISO-8859-15" "\0"
336           "CP38598" "\0" "ISO-8859-8" "\0"
337           "CP51932" "\0" "EUC-JP" "\0"
338           "CP51936" "\0" "GB2312" "\0"
339           "CP51949" "\0" "EUC-KR" "\0"
340           "CP51950" "\0" "EUC-TW" "\0"
341           "CP54936" "\0" "GB18030" "\0"
342           "CP65001" "\0" "UTF-8" "\0";
343# endif
344#endif
345
346      charset_aliases = cp;
347    }
348
349  return cp;
350}
351
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   How the raw encoding name is obtained depends on the platform:
     - with nl_langinfo (CODESET) where HAVE_LANGINFO_CODESET is set (on
       Cygwin, a "US-ASCII" answer is replaced by the encoding part of
       LC_ALL / LC_CTYPE / LANG, or else by "CP<GetACP()>");
     - from LC_ALL / LC_CTYPE / LANG on systems without nl_langinfo;
     - from GetACP() on native Windows;
     - from LC_ALL / LC_CTYPE / LANG, or else DosQueryCp(), on OS/2.
   The raw name is then looked up in the table returned by
   get_charset_aliases ().  On Cygwin and OS/2, a non-empty encoding part
   taken from the environment is returned as is, without that lookup.

   The result must not be freed by the caller: it points to a string
   literal, a static buffer, the environment, or the alias table.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.
                 An empty encoding part ("xx_YY." or "xx_YY.@mod") is not
                 returned, because this function must never return an empty
                 string; such a locale falls through to GetACP() below.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier != dot && modifier - dot < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as a
     number: GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.
             An empty encoding part ("xx_YY." or "xx_YY.@mod") is not
             returned, because this function must never return an empty
             string; such a locale is resolved through the alias table
             below instead.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier != dot && modifier - dot < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert explicitly so that the argument type
             matches the conversion specifier.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical) string
     pairs; each iteration steps over one pair.  An alias of "*" matches
     any codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
559