1/* localcharset.c - Determine a canonical name for the current locale's character encoding. */
2
3/* Copyright (C) 2000-2003, 2005-2009 Free Software Foundation, Inc.
4
5   This file is part of GNU Bash.
6
7   Bash is free software: you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation, either version 3 of the License, or
10   (at your option) any later version.
11
12   Bash is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
19*/
20
21/* Written by Bruno Haible <bruno@clisp.org>.  */
22
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
27/* Specification.  */
28#include "localcharset.h"
29
30#if HAVE_STDDEF_H
31# include <stddef.h>
32#endif
33
34#include <stdio.h>
35#if HAVE_STRING_H
36# include <string.h>
37#else
38# include <strings.h>
39#endif
40#if HAVE_STDLIB_H
41# include <stdlib.h>
42#endif
43
44#if defined _WIN32 || defined __WIN32__
45# undef WIN32   /* avoid warning on mingw32 */
46# define WIN32
47#endif
48
49#if defined __EMX__
50/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
51# define OS2
52#endif
53
54#if !defined WIN32
55# if HAVE_LANGINFO_CODESET
56#  include <langinfo.h>
57# else
58#  if HAVE_SETLOCALE
59#   include <locale.h>
60#  endif
61# endif
62#elif defined WIN32
63# define WIN32_LEAN_AND_MEAN
64# include <windows.h>
65#endif
66#if defined OS2
67# define INCL_DOS
68# include <os2.h>
69#endif
70
71#if ENABLE_RELOCATABLE
72# include "relocatable.h"
73#else
74# define relocate(pathname) (pathname)
75#endif
76
77#if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
78  /* Win32, OS/2, DOS */
79# define ISSLASH(C) ((C) == '/' || (C) == '\\')
80#endif
81
82#ifndef DIRECTORY_SEPARATOR
83# define DIRECTORY_SEPARATOR '/'
84#endif
85
86#ifndef ISSLASH
87# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
88#endif
89
90#ifdef HAVE_GETC_UNLOCKED
91# undef getc
92# define getc getc_unlocked
93#endif
94
95/* The following static variable is declared 'volatile' to avoid a
96   possible multithread problem in the function get_charset_aliases. If we
97   are running in a threaded environment, and if two threads initialize
98   'charset_aliases' simultaneously, both will produce the same value,
99   and everything will be ok if the two assignments to 'charset_aliases'
100   are atomic. But I don't know what will happen if the two assignments mix.  */
101#if __STDC__ != 1
102# define volatile /* empty */
103#endif
104/* Pointer to the contents of the charset.alias file, if it has already been
105   read, else NULL.  Its format is:
106   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
107static const char * volatile charset_aliases;
108
109/* Return a pointer to the contents of the charset.alias file.  */
110static const char *
111get_charset_aliases ()
112{
113  const char *cp;
114
115  cp = charset_aliases;
116  if (cp == NULL)
117    {
118#if !(defined VMS || defined WIN32)
119      FILE *fp;
120      const char *dir = relocate (LIBDIR);
121      const char *base = "charset.alias";
122      char *file_name;
123
124      /* Concatenate dir and base into freshly allocated file_name.  */
125      {
126	size_t dir_len = strlen (dir);
127	size_t base_len = strlen (base);
128	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
129	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
130	if (file_name != NULL)
131	  {
132	    memcpy (file_name, dir, dir_len);
133	    if (add_slash)
134	      file_name[dir_len] = DIRECTORY_SEPARATOR;
135	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
136	  }
137      }
138
139      if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
140	/* Out of memory or file not found, treat it as empty.  */
141	cp = "";
142      else
143	{
144	  /* Parse the file's contents.  */
145	  int c;
146	  char buf1[50+1];
147	  char buf2[50+1];
148	  char *res_ptr = NULL;
149	  size_t res_size = 0;
150	  size_t l1, l2;
151
152	  for (;;)
153	    {
154	      c = getc (fp);
155	      if (c == EOF)
156		break;
157	      if (c == '\n' || c == ' ' || c == '\t')
158		continue;
159	      if (c == '#')
160		{
161		  /* Skip comment, to end of line.  */
162		  do
163		    c = getc (fp);
164		  while (!(c == EOF || c == '\n'));
165		  if (c == EOF)
166		    break;
167		  continue;
168		}
169	      ungetc (c, fp);
170	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
171		break;
172	      l1 = strlen (buf1);
173	      l2 = strlen (buf2);
174	      if (res_size == 0)
175		{
176		  res_size = l1 + 1 + l2 + 1;
177		  res_ptr = (char *) malloc (res_size + 1);
178		}
179	      else
180		{
181		  res_size += l1 + 1 + l2 + 1;
182		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
183		}
184	      if (res_ptr == NULL)
185		{
186		  /* Out of memory. */
187		  res_size = 0;
188		  break;
189		}
190	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
191	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
192	    }
193	  fclose (fp);
194	  if (res_size == 0)
195	    cp = "";
196	  else
197	    {
198	      *(res_ptr + res_size) = '\0';
199	      cp = res_ptr;
200	    }
201	}
202
203      if (file_name != NULL)
204	free (file_name);
205
206#else
207
208# if defined VMS
209      /* To avoid the troubles of an extra file charset.alias_vms in the
210	 sources of many GNU packages, simply inline the aliases here.  */
211      /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
212	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
213	 section 10.7 "Handling Different Character Sets".  */
214      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
215	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
216	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
217	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
218	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
219	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
220	   /* Japanese */
221	   "eucJP" "\0" "EUC-JP" "\0"
222	   "SJIS" "\0" "SHIFT_JIS" "\0"
223	   "DECKANJI" "\0" "DEC-KANJI" "\0"
224	   "SDECKANJI" "\0" "EUC-JP" "\0"
225	   /* Chinese */
226	   "eucTW" "\0" "EUC-TW" "\0"
227	   "DECHANYU" "\0" "DEC-HANYU" "\0"
228	   "DECHANZI" "\0" "GB2312" "\0"
229	   /* Korean */
230	   "DECKOREAN" "\0" "EUC-KR" "\0";
231# endif
232
233# if defined WIN32
234      /* To avoid the troubles of installing a separate file in the same
235	 directory as the DLL and of retrieving the DLL's directory at
236	 runtime, simply inline the aliases here.  */
237
238      cp = "CP936" "\0" "GBK" "\0"
239	   "CP1361" "\0" "JOHAB" "\0"
240	   "CP20127" "\0" "ASCII" "\0"
241	   "CP20866" "\0" "KOI8-R" "\0"
242	   "CP21866" "\0" "KOI8-RU" "\0"
243	   "CP28591" "\0" "ISO-8859-1" "\0"
244	   "CP28592" "\0" "ISO-8859-2" "\0"
245	   "CP28593" "\0" "ISO-8859-3" "\0"
246	   "CP28594" "\0" "ISO-8859-4" "\0"
247	   "CP28595" "\0" "ISO-8859-5" "\0"
248	   "CP28596" "\0" "ISO-8859-6" "\0"
249	   "CP28597" "\0" "ISO-8859-7" "\0"
250	   "CP28598" "\0" "ISO-8859-8" "\0"
251	   "CP28599" "\0" "ISO-8859-9" "\0"
252	   "CP28605" "\0" "ISO-8859-15" "\0";
253# endif
254#endif
255
256      charset_aliases = cp;
257    }
258
259  return cp;
260}
261
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw codeset name is obtained in a platform-dependent way:
     - from nl_langinfo (CODESET) where available,
     - otherwise from the LC_ALL, LC_CTYPE or LANG environment variable
       (first non-empty one),
     - on WIN32 from GetACP (), formatted as "CP<number>",
     - on OS/2 from the environment variables above, falling back to
       DosQueryCp (), formatted as "CP<number>".
   It is then looked up in the alias table returned by
   get_charset_aliases (); an alias entry named "*" matches any name.

   The result must not be freed by the caller.  It is never NULL and never
   the empty string; "ASCII" is returned in place of an empty name.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  /* "CP" + up to 10 decimal digits + NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* "CP" + up to 10 decimal digits + NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative; make
             the conversion to size_t explicit for the size comparison.
             If the encoding does not fit into buf, fall through and
             resolve the whole locale name through the alias table.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table consists of (alias, canonical) string pairs
     and ends with an empty string, so each step skips two strings.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
400