1/* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19/* Written by Bruno Haible <bruno@clisp.org>. */ 20 21#include <config.h> 22 23/* Specification. */ 24#include "localcharset.h" 25 26#include <stddef.h> 27#include <stdio.h> 28#include <string.h> 29#include <stdlib.h> 30 31#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 32# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */ 33#endif 34 35#if defined _WIN32 || defined __WIN32__ 36# define WIN32_NATIVE 37#endif 38 39#if defined __EMX__ 40/* Assume EMX program runs on OS/2, even if compiled under DOS. */ 41# ifndef OS2 42# define OS2 43# endif 44#endif 45 46#if !defined WIN32_NATIVE 47# if HAVE_LANGINFO_CODESET 48# include <langinfo.h> 49# else 50# if 0 /* see comment below */ 51# include <locale.h> 52# endif 53# endif 54# ifdef __CYGWIN__ 55# define WIN32_LEAN_AND_MEAN 56# include <windows.h> 57# endif 58#elif defined WIN32_NATIVE 59# define WIN32_LEAN_AND_MEAN 60# include <windows.h> 61#endif 62#if defined OS2 63# define INCL_DOS 64# include <os2.h> 65#endif 66 67#if ENABLE_RELOCATABLE 68# include "relocatable.h" 69#else 70# define relocate(pathname) (pathname) 71#endif 72 73/* Get LIBDIR. */ 74#ifndef LIBDIR 75# include "configmake.h" 76#endif 77 78#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 79 /* Win32, Cygwin, OS/2, DOS */ 80# define ISSLASH(C) ((C) == '/' || (C) == '\\') 81#endif 82 83#ifndef DIRECTORY_SEPARATOR 84# define DIRECTORY_SEPARATOR '/' 85#endif 86 87#ifndef ISSLASH 88# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 89#endif 90 91#if HAVE_DECL_GETC_UNLOCKED 92# undef getc 93# define getc getc_unlocked 94#endif 95 96/* The following static variable is declared 'volatile' to avoid a 97 possible multithread problem in the function get_charset_aliases. If we 98 are running in a threaded environment, and if two threads initialize 99 'charset_aliases' simultaneously, both will produce the same value, 100 and everything will be ok if the two assignments to 'charset_aliases' 101 are atomic. But I don't know what will happen if the two assignments mix. */ 102#if __STDC__ != 1 103# define volatile /* empty */ 104#endif 105/* Pointer to the contents of the charset.alias file, if it has already been 106 read, else NULL. Its format is: 107 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 108static const char * volatile charset_aliases; 109 110/* Return a pointer to the contents of the charset.alias file. */ 111static const char * 112get_charset_aliases (void) 113{ 114 const char *cp; 115 116 cp = charset_aliases; 117 if (cp == NULL) 118 { 119#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 120 FILE *fp; 121 const char *dir; 122 const char *base = "charset.alias"; 123 char *file_name; 124 125 /* Make it possible to override the charset.alias location. This is 126 necessary for running the testsuite before "make install". */ 127 dir = getenv ("CHARSETALIASDIR"); 128 if (dir == NULL || dir[0] == '\0') 129 dir = relocate (LIBDIR); 130 131 /* Concatenate dir and base into freshly allocated file_name. */ 132 { 133 size_t dir_len = strlen (dir); 134 size_t base_len = strlen (base); 135 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 136 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 137 if (file_name != NULL) 138 { 139 memcpy (file_name, dir, dir_len); 140 if (add_slash) 141 file_name[dir_len] = DIRECTORY_SEPARATOR; 142 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 143 } 144 } 145 146 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 147 /* Out of memory or file not found, treat it as empty. */ 148 cp = ""; 149 else 150 { 151 /* Parse the file's contents. */ 152 char *res_ptr = NULL; 153 size_t res_size = 0; 154 155 for (;;) 156 { 157 int c; 158 char buf1[50+1]; 159 char buf2[50+1]; 160 size_t l1, l2; 161 char *old_res_ptr; 162 163 c = getc (fp); 164 if (c == EOF) 165 break; 166 if (c == '\n' || c == ' ' || c == '\t') 167 continue; 168 if (c == '#') 169 { 170 /* Skip comment, to end of line. */ 171 do 172 c = getc (fp); 173 while (!(c == EOF || c == '\n')); 174 if (c == EOF) 175 break; 176 continue; 177 } 178 ungetc (c, fp); 179 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 180 break; 181 l1 = strlen (buf1); 182 l2 = strlen (buf2); 183 old_res_ptr = res_ptr; 184 if (res_size == 0) 185 { 186 res_size = l1 + 1 + l2 + 1; 187 res_ptr = (char *) malloc (res_size + 1); 188 } 189 else 190 { 191 res_size += l1 + 1 + l2 + 1; 192 res_ptr = (char *) realloc (res_ptr, res_size + 1); 193 } 194 if (res_ptr == NULL) 195 { 196 /* Out of memory. */ 197 res_size = 0; 198 if (old_res_ptr != NULL) 199 free (old_res_ptr); 200 break; 201 } 202 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 203 strcpy (res_ptr + res_size - (l2 + 1), buf2); 204 } 205 fclose (fp); 206 if (res_size == 0) 207 cp = ""; 208 else 209 { 210 *(res_ptr + res_size) = '\0'; 211 cp = res_ptr; 212 } 213 } 214 215 if (file_name != NULL) 216 free (file_name); 217 218#else 219 220# if defined DARWIN7 221 /* To avoid the trouble of installing a file that is shared by many 222 GNU packages -- many packaging systems have problems with this --, 223 simply inline the aliases here. */ 224 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 225 "ISO8859-2" "\0" "ISO-8859-2" "\0" 226 "ISO8859-4" "\0" "ISO-8859-4" "\0" 227 "ISO8859-5" "\0" "ISO-8859-5" "\0" 228 "ISO8859-7" "\0" "ISO-8859-7" "\0" 229 "ISO8859-9" "\0" "ISO-8859-9" "\0" 230 "ISO8859-13" "\0" "ISO-8859-13" "\0" 231 "ISO8859-15" "\0" "ISO-8859-15" "\0" 232 "KOI8-R" "\0" "KOI8-R" "\0" 233 "KOI8-U" "\0" "KOI8-U" "\0" 234 "CP866" "\0" "CP866" "\0" 235 "CP949" "\0" "CP949" "\0" 236 "CP1131" "\0" "CP1131" "\0" 237 "CP1251" "\0" "CP1251" "\0" 238 "eucCN" "\0" "GB2312" "\0" 239 "GB2312" "\0" "GB2312" "\0" 240 "eucJP" "\0" "EUC-JP" "\0" 241 "eucKR" "\0" "EUC-KR" "\0" 242 "Big5" "\0" "BIG5" "\0" 243 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 244 "GBK" "\0" "GBK" "\0" 245 "GB18030" "\0" "GB18030" "\0" 246 "SJIS" "\0" "SHIFT_JIS" "\0" 247 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 248 "PT154" "\0" "PT154" "\0" 249 /*"ISCII-DEV" "\0" "?" "\0"*/ 250 "*" "\0" "UTF-8" "\0"; 251# endif 252 253# if defined VMS 254 /* To avoid the troubles of an extra file charset.alias_vms in the 255 sources of many GNU packages, simply inline the aliases here. */ 256 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 257 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 258 section 10.7 "Handling Different Character Sets". */ 259 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 260 "ISO8859-2" "\0" "ISO-8859-2" "\0" 261 "ISO8859-5" "\0" "ISO-8859-5" "\0" 262 "ISO8859-7" "\0" "ISO-8859-7" "\0" 263 "ISO8859-8" "\0" "ISO-8859-8" "\0" 264 "ISO8859-9" "\0" "ISO-8859-9" "\0" 265 /* Japanese */ 266 "eucJP" "\0" "EUC-JP" "\0" 267 "SJIS" "\0" "SHIFT_JIS" "\0" 268 "DECKANJI" "\0" "DEC-KANJI" "\0" 269 "SDECKANJI" "\0" "EUC-JP" "\0" 270 /* Chinese */ 271 "eucTW" "\0" "EUC-TW" "\0" 272 "DECHANYU" "\0" "DEC-HANYU" "\0" 273 "DECHANZI" "\0" "GB2312" "\0" 274 /* Korean */ 275 "DECKOREAN" "\0" "EUC-KR" "\0"; 276# endif 277 278# if defined WIN32_NATIVE || defined __CYGWIN__ 279 /* To avoid the troubles of installing a separate file in the same 280 directory as the DLL and of retrieving the DLL's directory at 281 runtime, simply inline the aliases here. */ 282 283 cp = "CP936" "\0" "GBK" "\0" 284 "CP1361" "\0" "JOHAB" "\0" 285 "CP20127" "\0" "ASCII" "\0" 286 "CP20866" "\0" "KOI8-R" "\0" 287 "CP20936" "\0" "GB2312" "\0" 288 "CP21866" "\0" "KOI8-RU" "\0" 289 "CP28591" "\0" "ISO-8859-1" "\0" 290 "CP28592" "\0" "ISO-8859-2" "\0" 291 "CP28593" "\0" "ISO-8859-3" "\0" 292 "CP28594" "\0" "ISO-8859-4" "\0" 293 "CP28595" "\0" "ISO-8859-5" "\0" 294 "CP28596" "\0" "ISO-8859-6" "\0" 295 "CP28597" "\0" "ISO-8859-7" "\0" 296 "CP28598" "\0" "ISO-8859-8" "\0" 297 "CP28599" "\0" "ISO-8859-9" "\0" 298 "CP28605" "\0" "ISO-8859-15" "\0" 299 "CP38598" "\0" "ISO-8859-8" "\0" 300 "CP51932" "\0" "EUC-JP" "\0" 301 "CP51936" "\0" "GB2312" "\0" 302 "CP51949" "\0" "EUC-KR" "\0" 303 "CP51950" "\0" "EUC-TW" "\0" 304 "CP54936" "\0" "GB18030" "\0" 305 "CP65001" "\0" "UTF-8" "\0"; 306# endif 307#endif 308 309 charset_aliases = cp; 310 } 311 312 return cp; 313} 314 315/* Determine the current locale's character encoding, and canonicalize it 316 into one of the canonical names listed in config.charset. 317 The result must not be freed; it is statically allocated. 318 If the canonical name cannot be determined, the result is a non-canonical 319 name. */ 320 321#ifdef STATIC 322STATIC 323#endif 324const char * 325locale_charset (void) 326{ 327 const char *codeset; 328 const char *aliases; 329 330#if !(defined WIN32_NATIVE || defined OS2) 331 332# if HAVE_LANGINFO_CODESET 333 334 /* Most systems support nl_langinfo (CODESET) nowadays. */ 335 codeset = nl_langinfo (CODESET); 336 337# ifdef __CYGWIN__ 338 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always 339 returns "US-ASCII". As long as this is not fixed, return the suffix 340 of the locale name from the environment variables (if present) or 341 the codepage as a number. */ 342 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 343 { 344 const char *locale; 345 static char buf[2 + 10 + 1]; 346 347 locale = getenv ("LC_ALL"); 348 if (locale == NULL || locale[0] == '\0') 349 { 350 locale = getenv ("LC_CTYPE"); 351 if (locale == NULL || locale[0] == '\0') 352 locale = getenv ("LANG"); 353 } 354 if (locale != NULL && locale[0] != '\0') 355 { 356 /* If the locale name contains an encoding after the dot, return 357 it. */ 358 const char *dot = strchr (locale, '.'); 359 360 if (dot != NULL) 361 { 362 const char *modifier; 363 364 dot++; 365 /* Look for the possible @... trailer and remove it, if any. */ 366 modifier = strchr (dot, '@'); 367 if (modifier == NULL) 368 return dot; 369 if (modifier - dot < sizeof (buf)) 370 { 371 memcpy (buf, dot, modifier - dot); 372 buf [modifier - dot] = '\0'; 373 return buf; 374 } 375 } 376 } 377 378 /* Woe32 has a function returning the locale's codepage as a number. */ 379 sprintf (buf, "CP%u", GetACP ()); 380 codeset = buf; 381 } 382# endif 383 384# else 385 386 /* On old systems which lack it, use setlocale or getenv. */ 387 const char *locale = NULL; 388 389 /* But most old systems don't have a complete set of locales. Some 390 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 391 use setlocale here; it would return "C" when it doesn't support the 392 locale name the user has set. */ 393# if 0 394 locale = setlocale (LC_CTYPE, NULL); 395# endif 396 if (locale == NULL || locale[0] == '\0') 397 { 398 locale = getenv ("LC_ALL"); 399 if (locale == NULL || locale[0] == '\0') 400 { 401 locale = getenv ("LC_CTYPE"); 402 if (locale == NULL || locale[0] == '\0') 403 locale = getenv ("LANG"); 404 } 405 } 406 407 /* On some old systems, one used to set locale = "iso8859_1". On others, 408 you set it to "language_COUNTRY.charset". In any case, we resolve it 409 through the charset.alias file. */ 410 codeset = locale; 411 412# endif 413 414#elif defined WIN32_NATIVE 415 416 static char buf[2 + 10 + 1]; 417 418 /* Woe32 has a function returning the locale's codepage as a number. */ 419 sprintf (buf, "CP%u", GetACP ()); 420 codeset = buf; 421 422#elif defined OS2 423 424 const char *locale; 425 static char buf[2 + 10 + 1]; 426 ULONG cp[3]; 427 ULONG cplen; 428 429 /* Allow user to override the codeset, as set in the operating system, 430 with standard language environment variables. */ 431 locale = getenv ("LC_ALL"); 432 if (locale == NULL || locale[0] == '\0') 433 { 434 locale = getenv ("LC_CTYPE"); 435 if (locale == NULL || locale[0] == '\0') 436 locale = getenv ("LANG"); 437 } 438 if (locale != NULL && locale[0] != '\0') 439 { 440 /* If the locale name contains an encoding after the dot, return it. */ 441 const char *dot = strchr (locale, '.'); 442 443 if (dot != NULL) 444 { 445 const char *modifier; 446 447 dot++; 448 /* Look for the possible @... trailer and remove it, if any. */ 449 modifier = strchr (dot, '@'); 450 if (modifier == NULL) 451 return dot; 452 if (modifier - dot < sizeof (buf)) 453 { 454 memcpy (buf, dot, modifier - dot); 455 buf [modifier - dot] = '\0'; 456 return buf; 457 } 458 } 459 460 /* Resolve through the charset.alias file. */ 461 codeset = locale; 462 } 463 else 464 { 465 /* OS/2 has a function returning the locale's codepage as a number. */ 466 if (DosQueryCp (sizeof (cp), cp, &cplen)) 467 codeset = ""; 468 else 469 { 470 sprintf (buf, "CP%u", cp[0]); 471 codeset = buf; 472 } 473 } 474 475#endif 476 477 if (codeset == NULL) 478 /* The canonical name cannot be determined. */ 479 codeset = ""; 480 481 /* Resolve alias. */ 482 for (aliases = get_charset_aliases (); 483 *aliases != '\0'; 484 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 485 if (strcmp (codeset, aliases) == 0 486 || (aliases[0] == '*' && aliases[1] == '\0')) 487 { 488 codeset = aliases + strlen (aliases) + 1; 489 break; 490 } 491 492 /* Don't return an empty string. GNU libc and GNU libiconv interpret 493 the empty string as denoting "the locale's character encoding", 494 thus GNU libiconv would call this function a second time. */ 495 if (codeset[0] == '\0') 496 codeset = "ASCII"; 497 498 return codeset; 499} 500