1/* Determine a canonical name for the current locale's character encoding.
2
3   Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, write to the Free Software Foundation,
17   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19/* Written by Bruno Haible <bruno@clisp.org>.  */
20
21#include <config.h>
22
23/* Specification.  */
24#include "localcharset.h"
25
26#include <stddef.h>
27#include <stdio.h>
28#include <string.h>
29#include <stdlib.h>
30
/* Platform detection.  The macros defined here (DARWIN7, WIN32_NATIVE, OS2)
   select the platform-specific code paths in get_charset_aliases and
   locale_charset further down in this file.  */

/* DARWIN7: an Apple/Mach system for which HAVE_LANGINFO_CODESET is true.  */
#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
#endif

/* WIN32_NATIVE: a native Windows build.  Cygwin is tested separately through
   __CYGWIN__ wherever it needs the same treatment.  */
#if defined _WIN32 || defined __WIN32__
# define WIN32_NATIVE
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
# ifndef OS2
#  define OS2
# endif
#endif

/* Platform-specific headers for the codeset query used by locale_charset:
   nl_langinfo (CODESET) where available, GetACP on Windows and Cygwin,
   DosQueryCp on OS/2.  */
#if !defined WIN32_NATIVE
# if HAVE_LANGINFO_CODESET
#  include <langinfo.h>
# else
#  if 0 /* disabled on purpose: locale_charset does not call setlocale,
	   see the comment in its getenv-based fallback */
#   include <locale.h>
#  endif
# endif
# ifdef __CYGWIN__
  /* Cygwin falls back to GetACP when nl_langinfo only reports "US-ASCII".  */
#  define WIN32_LEAN_AND_MEAN
#  include <windows.h>
# endif
#elif defined WIN32_NATIVE
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
/* NOTE(review): INCL_DOS presumably makes <os2.h> declare the Dos* API,
   of which this file uses DosQueryCp -- confirm against the OS/2 toolkit.  */
# define INCL_DOS
# include <os2.h>
#endif
66
/* relocate (PATHNAME) maps a configure-time pathname to the pathname to use
   at run time.  Without ENABLE_RELOCATABLE it is the identity.  */
#if ENABLE_RELOCATABLE
# include "relocatable.h"
#else
# define relocate(pathname) (pathname)
#endif

/* Get LIBDIR.  It is the default directory in which get_charset_aliases
   looks for the charset.alias file.  A definition supplied from outside
   (e.g. on the compiler command line) takes precedence.  */
#ifndef LIBDIR
# include "configmake.h"
#endif

/* ISSLASH (C) is true if the character C separates directory names in a
   file name.  */
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  /* Win32, Cygwin, OS/2, DOS: both '/' and '\\' are accepted.  */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

/* The character inserted between a directory name and a file name when a
   file name is assembled.  */
#ifndef DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

/* Everywhere else, only DIRECTORY_SEPARATOR counts as a slash.  */
#ifndef ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Use the non-locking getc variant when it is declared.  The only stream
   read with getc in this file is opened and closed inside
   get_charset_aliases, so no other thread can get hold of it.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
95
/* The static variable 'charset_aliases' below is declared 'volatile' to
   avoid a possible multithread problem in the function get_charset_aliases.
   If we are running in a threaded environment, and if two threads
   initialize 'charset_aliases' simultaneously, both will produce the same
   contents, and everything will be ok provided that the two assignments to
   'charset_aliases' are atomic.  What happens if the two assignments
   interleave is not known.
   NOTE(review): 'volatile' by itself is not a synchronization primitive;
   treat the first call as not thread-safe unless the platform guarantees
   atomic pointer stores.  */
#if __STDC__ != 1
/* Presumably for pre-ISO compilers that lack the keyword: make it vanish.
   This affects the rest of the translation unit.  */
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  On platforms with a built-in table it points to that
   table instead.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'
   i.e. a sequence of NUL-terminated string pairs, ended by an empty
   string.  */
static const char * volatile charset_aliases;
109
110/* Return a pointer to the contents of the charset.alias file.  */
111static const char *
112get_charset_aliases (void)
113{
114  const char *cp;
115
116  cp = charset_aliases;
117  if (cp == NULL)
118    {
119#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
120      FILE *fp;
121      const char *dir;
122      const char *base = "charset.alias";
123      char *file_name;
124
125      /* Make it possible to override the charset.alias location.  This is
126	 necessary for running the testsuite before "make install".  */
127      dir = getenv ("CHARSETALIASDIR");
128      if (dir == NULL || dir[0] == '\0')
129	dir = relocate (LIBDIR);
130
131      /* Concatenate dir and base into freshly allocated file_name.  */
132      {
133	size_t dir_len = strlen (dir);
134	size_t base_len = strlen (base);
135	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
136	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
137	if (file_name != NULL)
138	  {
139	    memcpy (file_name, dir, dir_len);
140	    if (add_slash)
141	      file_name[dir_len] = DIRECTORY_SEPARATOR;
142	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
143	  }
144      }
145
146      if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
147	/* Out of memory or file not found, treat it as empty.  */
148	cp = "";
149      else
150	{
151	  /* Parse the file's contents.  */
152	  char *res_ptr = NULL;
153	  size_t res_size = 0;
154
155	  for (;;)
156	    {
157	      int c;
158	      char buf1[50+1];
159	      char buf2[50+1];
160	      size_t l1, l2;
161	      char *old_res_ptr;
162
163	      c = getc (fp);
164	      if (c == EOF)
165		break;
166	      if (c == '\n' || c == ' ' || c == '\t')
167		continue;
168	      if (c == '#')
169		{
170		  /* Skip comment, to end of line.  */
171		  do
172		    c = getc (fp);
173		  while (!(c == EOF || c == '\n'));
174		  if (c == EOF)
175		    break;
176		  continue;
177		}
178	      ungetc (c, fp);
179	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
180		break;
181	      l1 = strlen (buf1);
182	      l2 = strlen (buf2);
183	      old_res_ptr = res_ptr;
184	      if (res_size == 0)
185		{
186		  res_size = l1 + 1 + l2 + 1;
187		  res_ptr = (char *) malloc (res_size + 1);
188		}
189	      else
190		{
191		  res_size += l1 + 1 + l2 + 1;
192		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
193		}
194	      if (res_ptr == NULL)
195		{
196		  /* Out of memory. */
197		  res_size = 0;
198		  if (old_res_ptr != NULL)
199		    free (old_res_ptr);
200		  break;
201		}
202	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
203	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
204	    }
205	  fclose (fp);
206	  if (res_size == 0)
207	    cp = "";
208	  else
209	    {
210	      *(res_ptr + res_size) = '\0';
211	      cp = res_ptr;
212	    }
213	}
214
215      if (file_name != NULL)
216	free (file_name);
217
218#else
219
220# if defined DARWIN7
221      /* To avoid the trouble of installing a file that is shared by many
222	 GNU packages -- many packaging systems have problems with this --,
223	 simply inline the aliases here.  */
224      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
225	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
226	   "ISO8859-4" "\0" "ISO-8859-4" "\0"
227	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
228	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
229	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
230	   "ISO8859-13" "\0" "ISO-8859-13" "\0"
231	   "ISO8859-15" "\0" "ISO-8859-15" "\0"
232	   "KOI8-R" "\0" "KOI8-R" "\0"
233	   "KOI8-U" "\0" "KOI8-U" "\0"
234	   "CP866" "\0" "CP866" "\0"
235	   "CP949" "\0" "CP949" "\0"
236	   "CP1131" "\0" "CP1131" "\0"
237	   "CP1251" "\0" "CP1251" "\0"
238	   "eucCN" "\0" "GB2312" "\0"
239	   "GB2312" "\0" "GB2312" "\0"
240	   "eucJP" "\0" "EUC-JP" "\0"
241	   "eucKR" "\0" "EUC-KR" "\0"
242	   "Big5" "\0" "BIG5" "\0"
243	   "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
244	   "GBK" "\0" "GBK" "\0"
245	   "GB18030" "\0" "GB18030" "\0"
246	   "SJIS" "\0" "SHIFT_JIS" "\0"
247	   "ARMSCII-8" "\0" "ARMSCII-8" "\0"
248	   "PT154" "\0" "PT154" "\0"
249	 /*"ISCII-DEV" "\0" "?" "\0"*/
250	   "*" "\0" "UTF-8" "\0";
251# endif
252
253# if defined VMS
254      /* To avoid the troubles of an extra file charset.alias_vms in the
255	 sources of many GNU packages, simply inline the aliases here.  */
256      /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
257	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
258	 section 10.7 "Handling Different Character Sets".  */
259      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
260	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
261	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
262	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
263	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
264	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
265	   /* Japanese */
266	   "eucJP" "\0" "EUC-JP" "\0"
267	   "SJIS" "\0" "SHIFT_JIS" "\0"
268	   "DECKANJI" "\0" "DEC-KANJI" "\0"
269	   "SDECKANJI" "\0" "EUC-JP" "\0"
270	   /* Chinese */
271	   "eucTW" "\0" "EUC-TW" "\0"
272	   "DECHANYU" "\0" "DEC-HANYU" "\0"
273	   "DECHANZI" "\0" "GB2312" "\0"
274	   /* Korean */
275	   "DECKOREAN" "\0" "EUC-KR" "\0";
276# endif
277
278# if defined WIN32_NATIVE || defined __CYGWIN__
279      /* To avoid the troubles of installing a separate file in the same
280	 directory as the DLL and of retrieving the DLL's directory at
281	 runtime, simply inline the aliases here.  */
282
283      cp = "CP936" "\0" "GBK" "\0"
284	   "CP1361" "\0" "JOHAB" "\0"
285	   "CP20127" "\0" "ASCII" "\0"
286	   "CP20866" "\0" "KOI8-R" "\0"
287	   "CP20936" "\0" "GB2312" "\0"
288	   "CP21866" "\0" "KOI8-RU" "\0"
289	   "CP28591" "\0" "ISO-8859-1" "\0"
290	   "CP28592" "\0" "ISO-8859-2" "\0"
291	   "CP28593" "\0" "ISO-8859-3" "\0"
292	   "CP28594" "\0" "ISO-8859-4" "\0"
293	   "CP28595" "\0" "ISO-8859-5" "\0"
294	   "CP28596" "\0" "ISO-8859-6" "\0"
295	   "CP28597" "\0" "ISO-8859-7" "\0"
296	   "CP28598" "\0" "ISO-8859-8" "\0"
297	   "CP28599" "\0" "ISO-8859-9" "\0"
298	   "CP28605" "\0" "ISO-8859-15" "\0"
299	   "CP38598" "\0" "ISO-8859-8" "\0"
300	   "CP51932" "\0" "EUC-JP" "\0"
301	   "CP51936" "\0" "GB2312" "\0"
302	   "CP51949" "\0" "EUC-KR" "\0"
303	   "CP51950" "\0" "EUC-TW" "\0"
304	   "CP54936" "\0" "GB18030" "\0"
305	   "CP65001" "\0" "UTF-8" "\0";
306# endif
307#endif
308
309      charset_aliases = cp;
310    }
311
312  return cp;
313}
314
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The result is never NULL and never the empty string.  It must not be
   freed or modified: depending on the platform and the environment it
   points to a string literal, into the alias table, into a function-local
   static buffer (overwritten by the next call), into the value of an
   environment variable, or to the string returned by nl_langinfo.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* An empty suffix (as in "en_US.") is not an encoding
                     name; returning it would violate the rule that the
                     result is never empty.  Use the codepage instead.  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier > dot
                       && (size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
              /* Empty or overlong encoding name: fall through.  */
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* An empty suffix (as in "en_US.") is not an encoding name;
                 returning it would violate the rule that the result is
                 never empty.  Resolve the whole locale name instead.  */
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier > dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
          /* Empty or overlong encoding name: fall through.  */
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] has type ULONG; convert it explicitly so that the
             argument matches the %u conversion and the output is bounded
             by the size of buf.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a list of ALIAS '\0' CANONICAL '\0' pairs
     ended by an empty string; each iteration steps over one pair.  An alias
     consisting of just "*" matches every codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
500