1/* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 18 USA. */ 19 20/* Written by Bruno Haible <bruno@clisp.org>. */ 21 22#ifdef HAVE_CONFIG_H 23# include <config.h> 24#endif 25 26/* Specification. */ 27#include "localcharset.h" 28 29#if HAVE_STDDEF_H 30# include <stddef.h> 31#endif 32 33#include <stdio.h> 34#if HAVE_STRING_H 35# include <string.h> 36#else 37# include <strings.h> 38#endif 39#if HAVE_STDLIB_H 40# include <stdlib.h> 41#endif 42 43#if defined _WIN32 || defined __WIN32__ 44# define WIN32_NATIVE 45#endif 46 47#if defined __EMX__ 48/* Assume EMX program runs on OS/2, even if compiled under DOS. */ 49# define OS2 50#endif 51 52#if !defined WIN32_NATIVE 53# if HAVE_LANGINFO_CODESET 54# include <langinfo.h> 55# else 56# if HAVE_SETLOCALE 57# include <locale.h> 58# endif 59# endif 60# ifdef __CYGWIN__ 61# define WIN32_LEAN_AND_MEAN 62# include <windows.h> 63# endif 64#elif defined WIN32_NATIVE 65# define WIN32_LEAN_AND_MEAN 66# include <windows.h> 67#endif 68#if defined OS2 69# define INCL_DOS 70# include <os2.h> 71#endif 72 73#if ENABLE_RELOCATABLE 74# include "relocatable.h" 75#else 76# define relocate(pathname) (pathname) 77#endif 78 79#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 80 /* Win32, Cygwin, OS/2, DOS */ 81# define ISSLASH(C) ((C) == '/' || (C) == '\\') 82#endif 83 84#ifndef DIRECTORY_SEPARATOR 85# define DIRECTORY_SEPARATOR '/' 86#endif 87 88#ifndef ISSLASH 89# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 90#endif 91 92#if HAVE_DECL_GETC_UNLOCKED 93# undef getc 94# define getc getc_unlocked 95#endif 96 97/* The following static variable is declared 'volatile' to avoid a 98 possible multithread problem in the function get_charset_aliases. If we 99 are running in a threaded environment, and if two threads initialize 100 'charset_aliases' simultaneously, both will produce the same value, 101 and everything will be ok if the two assignments to 'charset_aliases' 102 are atomic. But I don't know what will happen if the two assignments mix. */ 103#if __STDC__ != 1 104# define volatile /* empty */ 105#endif 106/* Pointer to the contents of the charset.alias file, if it has already been 107 read, else NULL. Its format is: 108 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 109static const char * volatile charset_aliases; 110 111/* Return a pointer to the contents of the charset.alias file. */ 112static const char * 113get_charset_aliases (void) 114{ 115 const char *cp; 116 117 cp = charset_aliases; 118 if (cp == NULL) 119 { 120#if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 121 FILE *fp; 122 const char *dir; 123 const char *base = "charset.alias"; 124 char *file_name; 125 126 /* Make it possible to override the charset.alias location. This is 127 necessary for running the testsuite before "make install". */ 128 dir = getenv ("CHARSETALIASDIR"); 129 if (dir == NULL || dir[0] == '\0') 130 dir = relocate (LIBDIR); 131 132 /* Concatenate dir and base into freshly allocated file_name. */ 133 { 134 size_t dir_len = strlen (dir); 135 size_t base_len = strlen (base); 136 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 137 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 138 if (file_name != NULL) 139 { 140 memcpy (file_name, dir, dir_len); 141 if (add_slash) 142 file_name[dir_len] = DIRECTORY_SEPARATOR; 143 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 144 } 145 } 146 147 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 148 /* Out of memory or file not found, treat it as empty. */ 149 cp = ""; 150 else 151 { 152 /* Parse the file's contents. */ 153 char *res_ptr = NULL; 154 size_t res_size = 0; 155 156 for (;;) 157 { 158 int c; 159 char buf1[50+1]; 160 char buf2[50+1]; 161 size_t l1, l2; 162 char *old_res_ptr; 163 164 c = getc (fp); 165 if (c == EOF) 166 break; 167 if (c == '\n' || c == ' ' || c == '\t') 168 continue; 169 if (c == '#') 170 { 171 /* Skip comment, to end of line. */ 172 do 173 c = getc (fp); 174 while (!(c == EOF || c == '\n')); 175 if (c == EOF) 176 break; 177 continue; 178 } 179 ungetc (c, fp); 180 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 181 break; 182 l1 = strlen (buf1); 183 l2 = strlen (buf2); 184 old_res_ptr = res_ptr; 185 if (res_size == 0) 186 { 187 res_size = l1 + 1 + l2 + 1; 188 res_ptr = (char *) malloc (res_size + 1); 189 } 190 else 191 { 192 res_size += l1 + 1 + l2 + 1; 193 res_ptr = (char *) realloc (res_ptr, res_size + 1); 194 } 195 if (res_ptr == NULL) 196 { 197 /* Out of memory. */ 198 res_size = 0; 199 if (old_res_ptr != NULL) 200 free (old_res_ptr); 201 break; 202 } 203 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 204 strcpy (res_ptr + res_size - (l2 + 1), buf2); 205 } 206 fclose (fp); 207 if (res_size == 0) 208 cp = ""; 209 else 210 { 211 *(res_ptr + res_size) = '\0'; 212 cp = res_ptr; 213 } 214 } 215 216 if (file_name != NULL) 217 free (file_name); 218 219#else 220 221# if defined VMS 222 /* To avoid the troubles of an extra file charset.alias_vms in the 223 sources of many GNU packages, simply inline the aliases here. */ 224 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 225 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 226 section 10.7 "Handling Different Character Sets". */ 227 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 228 "ISO8859-2" "\0" "ISO-8859-2" "\0" 229 "ISO8859-5" "\0" "ISO-8859-5" "\0" 230 "ISO8859-7" "\0" "ISO-8859-7" "\0" 231 "ISO8859-8" "\0" "ISO-8859-8" "\0" 232 "ISO8859-9" "\0" "ISO-8859-9" "\0" 233 /* Japanese */ 234 "eucJP" "\0" "EUC-JP" "\0" 235 "SJIS" "\0" "SHIFT_JIS" "\0" 236 "DECKANJI" "\0" "DEC-KANJI" "\0" 237 "SDECKANJI" "\0" "EUC-JP" "\0" 238 /* Chinese */ 239 "eucTW" "\0" "EUC-TW" "\0" 240 "DECHANYU" "\0" "DEC-HANYU" "\0" 241 "DECHANZI" "\0" "GB2312" "\0" 242 /* Korean */ 243 "DECKOREAN" "\0" "EUC-KR" "\0"; 244# endif 245 246# if defined WIN32_NATIVE || defined __CYGWIN__ 247 /* To avoid the troubles of installing a separate file in the same 248 directory as the DLL and of retrieving the DLL's directory at 249 runtime, simply inline the aliases here. */ 250 251 cp = "CP936" "\0" "GBK" "\0" 252 "CP1361" "\0" "JOHAB" "\0" 253 "CP20127" "\0" "ASCII" "\0" 254 "CP20866" "\0" "KOI8-R" "\0" 255 "CP20936" "\0" "GB2312" "\0" 256 "CP21866" "\0" "KOI8-RU" "\0" 257 "CP28591" "\0" "ISO-8859-1" "\0" 258 "CP28592" "\0" "ISO-8859-2" "\0" 259 "CP28593" "\0" "ISO-8859-3" "\0" 260 "CP28594" "\0" "ISO-8859-4" "\0" 261 "CP28595" "\0" "ISO-8859-5" "\0" 262 "CP28596" "\0" "ISO-8859-6" "\0" 263 "CP28597" "\0" "ISO-8859-7" "\0" 264 "CP28598" "\0" "ISO-8859-8" "\0" 265 "CP28599" "\0" "ISO-8859-9" "\0" 266 "CP28605" "\0" "ISO-8859-15" "\0" 267 "CP38598" "\0" "ISO-8859-8" "\0" 268 "CP51932" "\0" "EUC-JP" "\0" 269 "CP51936" "\0" "GB2312" "\0" 270 "CP51949" "\0" "EUC-KR" "\0" 271 "CP51950" "\0" "EUC-TW" "\0" 272 "CP54936" "\0" "GB18030" "\0" 273 "CP65001" "\0" "UTF-8" "\0"; 274# endif 275#endif 276 277 charset_aliases = cp; 278 } 279 280 return cp; 281} 282 283/* Determine the current locale's character encoding, and canonicalize it 284 into one of the canonical names listed in config.charset. 285 The result must not be freed; it is statically allocated. 286 If the canonical name cannot be determined, the result is a non-canonical 287 name. */ 288 289#ifdef STATIC 290STATIC 291#endif 292const char * 293locale_charset (void) 294{ 295 const char *codeset; 296 const char *aliases; 297 298#if !(defined WIN32_NATIVE || defined OS2) 299 300# if HAVE_LANGINFO_CODESET 301 302 /* Most systems support nl_langinfo (CODESET) nowadays. */ 303 codeset = nl_langinfo (CODESET); 304 305# ifdef __CYGWIN__ 306 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always 307 returns "US-ASCII". As long as this is not fixed, return the suffix 308 of the locale name from the environment variables (if present) or 309 the codepage as a number. */ 310 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 311 { 312 const char *locale; 313 static char buf[2 + 10 + 1]; 314 315 locale = getenv ("LC_ALL"); 316 if (locale == NULL || locale[0] == '\0') 317 { 318 locale = getenv ("LC_CTYPE"); 319 if (locale == NULL || locale[0] == '\0') 320 locale = getenv ("LANG"); 321 } 322 if (locale != NULL && locale[0] != '\0') 323 { 324 /* If the locale name contains an encoding after the dot, return 325 it. */ 326 const char *dot = strchr (locale, '.'); 327 328 if (dot != NULL) 329 { 330 const char *modifier; 331 332 dot++; 333 /* Look for the possible @... trailer and remove it, if any. */ 334 modifier = strchr (dot, '@'); 335 if (modifier == NULL) 336 return dot; 337 if (modifier - dot < sizeof (buf)) 338 { 339 memcpy (buf, dot, modifier - dot); 340 buf [modifier - dot] = '\0'; 341 return buf; 342 } 343 } 344 } 345 346 /* Woe32 has a function returning the locale's codepage as a number. */ 347 sprintf (buf, "CP%u", GetACP ()); 348 codeset = buf; 349 } 350# endif 351 352# else 353 354 /* On old systems which lack it, use setlocale or getenv. */ 355 const char *locale = NULL; 356 357 /* But most old systems don't have a complete set of locales. Some 358 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 359 use setlocale here; it would return "C" when it doesn't support the 360 locale name the user has set. */ 361# if HAVE_SETLOCALE && 0 362 locale = setlocale (LC_CTYPE, NULL); 363# endif 364 if (locale == NULL || locale[0] == '\0') 365 { 366 locale = getenv ("LC_ALL"); 367 if (locale == NULL || locale[0] == '\0') 368 { 369 locale = getenv ("LC_CTYPE"); 370 if (locale == NULL || locale[0] == '\0') 371 locale = getenv ("LANG"); 372 } 373 } 374 375 /* On some old systems, one used to set locale = "iso8859_1". On others, 376 you set it to "language_COUNTRY.charset". In any case, we resolve it 377 through the charset.alias file. */ 378 codeset = locale; 379 380# endif 381 382#elif defined WIN32_NATIVE 383 384 static char buf[2 + 10 + 1]; 385 386 /* Woe32 has a function returning the locale's codepage as a number. */ 387 sprintf (buf, "CP%u", GetACP ()); 388 codeset = buf; 389 390#elif defined OS2 391 392 const char *locale; 393 static char buf[2 + 10 + 1]; 394 ULONG cp[3]; 395 ULONG cplen; 396 397 /* Allow user to override the codeset, as set in the operating system, 398 with standard language environment variables. */ 399 locale = getenv ("LC_ALL"); 400 if (locale == NULL || locale[0] == '\0') 401 { 402 locale = getenv ("LC_CTYPE"); 403 if (locale == NULL || locale[0] == '\0') 404 locale = getenv ("LANG"); 405 } 406 if (locale != NULL && locale[0] != '\0') 407 { 408 /* If the locale name contains an encoding after the dot, return it. */ 409 const char *dot = strchr (locale, '.'); 410 411 if (dot != NULL) 412 { 413 const char *modifier; 414 415 dot++; 416 /* Look for the possible @... trailer and remove it, if any. */ 417 modifier = strchr (dot, '@'); 418 if (modifier == NULL) 419 return dot; 420 if (modifier - dot < sizeof (buf)) 421 { 422 memcpy (buf, dot, modifier - dot); 423 buf [modifier - dot] = '\0'; 424 return buf; 425 } 426 } 427 428 /* Resolve through the charset.alias file. */ 429 codeset = locale; 430 } 431 else 432 { 433 /* OS/2 has a function returning the locale's codepage as a number. */ 434 if (DosQueryCp (sizeof (cp), cp, &cplen)) 435 codeset = ""; 436 else 437 { 438 sprintf (buf, "CP%u", cp[0]); 439 codeset = buf; 440 } 441 } 442 443#endif 444 445 if (codeset == NULL) 446 /* The canonical name cannot be determined. */ 447 codeset = ""; 448 449 /* Resolve alias. */ 450 for (aliases = get_charset_aliases (); 451 *aliases != '\0'; 452 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 453 if (strcmp (codeset, aliases) == 0 454 || (aliases[0] == '*' && aliases[1] == '\0')) 455 { 456 codeset = aliases + strlen (aliases) + 1; 457 break; 458 } 459 460 /* Don't return an empty string. GNU libc and GNU libiconv interpret 461 the empty string as denoting "the locale's character encoding", 462 thus GNU libiconv would call this function a second time. */ 463 if (codeset[0] == '\0') 464 codeset = "ASCII"; 465 466 return codeset; 467} 468