1/* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2004 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 18 USA. */ 19 20/* Written by Bruno Haible <bruno@clisp.org>. */ 21 22#ifdef HAVE_CONFIG_H 23# include <config.h> 24#endif 25 26/* Specification. */ 27#include "localcharset.h" 28 29#if HAVE_STDDEF_H 30# include <stddef.h> 31#endif 32 33#include <stdio.h> 34#if HAVE_STRING_H 35# include <string.h> 36#else 37# include <strings.h> 38#endif 39#if HAVE_STDLIB_H 40# include <stdlib.h> 41#endif 42 43#if defined _WIN32 || defined __WIN32__ 44# undef WIN32 /* avoid warning on mingw32 */ 45# define WIN32 46#endif 47 48#if defined __EMX__ 49/* Assume EMX program runs on OS/2, even if compiled under DOS. */ 50# define OS2 51#endif 52 53#if !defined WIN32 54# if HAVE_LANGINFO_CODESET 55# include <langinfo.h> 56# else 57# if HAVE_SETLOCALE 58# include <locale.h> 59# endif 60# endif 61#elif defined WIN32 62# define WIN32_LEAN_AND_MEAN 63# include <windows.h> 64#endif 65#if defined OS2 66# define INCL_DOS 67# include <os2.h> 68#endif 69 70#if ENABLE_RELOCATABLE 71# include "relocatable.h" 72#else 73# define relocate(pathname) (pathname) 74#endif 75 76#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 77 /* Win32, Cygwin, OS/2, DOS */ 78# define ISSLASH(C) ((C) == '/' || (C) == '\\') 79#endif 80 81#ifndef DIRECTORY_SEPARATOR 82# define DIRECTORY_SEPARATOR '/' 83#endif 84 85#ifndef ISSLASH 86# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 87#endif 88 89#if HAVE_DECL_GETC_UNLOCKED 90# undef getc 91# define getc getc_unlocked 92#endif 93 94/* The following static variable is declared 'volatile' to avoid a 95 possible multithread problem in the function get_charset_aliases. If we 96 are running in a threaded environment, and if two threads initialize 97 'charset_aliases' simultaneously, both will produce the same value, 98 and everything will be ok if the two assignments to 'charset_aliases' 99 are atomic. But I don't know what will happen if the two assignments mix. */ 100#if __STDC__ != 1 101# define volatile /* empty */ 102#endif 103/* Pointer to the contents of the charset.alias file, if it has already been 104 read, else NULL. Its format is: 105 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 106static const char * volatile charset_aliases; 107 108/* Return a pointer to the contents of the charset.alias file. */ 109static const char * 110get_charset_aliases () 111{ 112 const char *cp; 113 114 cp = charset_aliases; 115 if (cp == NULL) 116 { 117#if !(defined VMS || defined WIN32) 118 FILE *fp; 119 const char *dir; 120 const char *base = "charset.alias"; 121 char *file_name; 122 123 /* Make it possible to override the charset.alias location. This is 124 necessary for running the testsuite before "make install". */ 125 dir = getenv ("CHARSETALIASDIR"); 126 if (dir == NULL || dir[0] == '\0') 127 dir = relocate (LIBDIR); 128 129 /* Concatenate dir and base into freshly allocated file_name. */ 130 { 131 size_t dir_len = strlen (dir); 132 size_t base_len = strlen (base); 133 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 134 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 135 if (file_name != NULL) 136 { 137 memcpy (file_name, dir, dir_len); 138 if (add_slash) 139 file_name[dir_len] = DIRECTORY_SEPARATOR; 140 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 141 } 142 } 143 144 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 145 /* Out of memory or file not found, treat it as empty. */ 146 cp = ""; 147 else 148 { 149 /* Parse the file's contents. */ 150 char *res_ptr = NULL; 151 size_t res_size = 0; 152 153 for (;;) 154 { 155 int c; 156 char buf1[50+1]; 157 char buf2[50+1]; 158 size_t l1, l2; 159 char *old_res_ptr; 160 161 c = getc (fp); 162 if (c == EOF) 163 break; 164 if (c == '\n' || c == ' ' || c == '\t') 165 continue; 166 if (c == '#') 167 { 168 /* Skip comment, to end of line. */ 169 do 170 c = getc (fp); 171 while (!(c == EOF || c == '\n')); 172 if (c == EOF) 173 break; 174 continue; 175 } 176 ungetc (c, fp); 177 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 178 break; 179 l1 = strlen (buf1); 180 l2 = strlen (buf2); 181 old_res_ptr = res_ptr; 182 if (res_size == 0) 183 { 184 res_size = l1 + 1 + l2 + 1; 185 res_ptr = (char *) malloc (res_size + 1); 186 } 187 else 188 { 189 res_size += l1 + 1 + l2 + 1; 190 res_ptr = (char *) realloc (res_ptr, res_size + 1); 191 } 192 if (res_ptr == NULL) 193 { 194 /* Out of memory. */ 195 res_size = 0; 196 if (old_res_ptr != NULL) 197 free (old_res_ptr); 198 break; 199 } 200 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 201 strcpy (res_ptr + res_size - (l2 + 1), buf2); 202 } 203 fclose (fp); 204 if (res_size == 0) 205 cp = ""; 206 else 207 { 208 *(res_ptr + res_size) = '\0'; 209 cp = res_ptr; 210 } 211 } 212 213 if (file_name != NULL) 214 free (file_name); 215 216#else 217 218# if defined VMS 219 /* To avoid the troubles of an extra file charset.alias_vms in the 220 sources of many GNU packages, simply inline the aliases here. */ 221 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 222 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 223 section 10.7 "Handling Different Character Sets". */ 224 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 225 "ISO8859-2" "\0" "ISO-8859-2" "\0" 226 "ISO8859-5" "\0" "ISO-8859-5" "\0" 227 "ISO8859-7" "\0" "ISO-8859-7" "\0" 228 "ISO8859-8" "\0" "ISO-8859-8" "\0" 229 "ISO8859-9" "\0" "ISO-8859-9" "\0" 230 /* Japanese */ 231 "eucJP" "\0" "EUC-JP" "\0" 232 "SJIS" "\0" "SHIFT_JIS" "\0" 233 "DECKANJI" "\0" "DEC-KANJI" "\0" 234 "SDECKANJI" "\0" "EUC-JP" "\0" 235 /* Chinese */ 236 "eucTW" "\0" "EUC-TW" "\0" 237 "DECHANYU" "\0" "DEC-HANYU" "\0" 238 "DECHANZI" "\0" "GB2312" "\0" 239 /* Korean */ 240 "DECKOREAN" "\0" "EUC-KR" "\0"; 241# endif 242 243# if defined WIN32 244 /* To avoid the troubles of installing a separate file in the same 245 directory as the DLL and of retrieving the DLL's directory at 246 runtime, simply inline the aliases here. */ 247 248 cp = "CP936" "\0" "GBK" "\0" 249 "CP1361" "\0" "JOHAB" "\0" 250 "CP20127" "\0" "ASCII" "\0" 251 "CP20866" "\0" "KOI8-R" "\0" 252 "CP21866" "\0" "KOI8-RU" "\0" 253 "CP28591" "\0" "ISO-8859-1" "\0" 254 "CP28592" "\0" "ISO-8859-2" "\0" 255 "CP28593" "\0" "ISO-8859-3" "\0" 256 "CP28594" "\0" "ISO-8859-4" "\0" 257 "CP28595" "\0" "ISO-8859-5" "\0" 258 "CP28596" "\0" "ISO-8859-6" "\0" 259 "CP28597" "\0" "ISO-8859-7" "\0" 260 "CP28598" "\0" "ISO-8859-8" "\0" 261 "CP28599" "\0" "ISO-8859-9" "\0" 262 "CP28605" "\0" "ISO-8859-15" "\0"; 263# endif 264#endif 265 266 charset_aliases = cp; 267 } 268 269 return cp; 270} 271 272/* Determine the current locale's character encoding, and canonicalize it 273 into one of the canonical names listed in config.charset. 274 The result must not be freed; it is statically allocated. 275 If the canonical name cannot be determined, the result is a non-canonical 276 name. */ 277 278#ifdef STATIC 279STATIC 280#endif 281const char * 282locale_charset () 283{ 284 const char *codeset; 285 const char *aliases; 286 287#if !(defined WIN32 || defined OS2) 288 289# if HAVE_LANGINFO_CODESET 290 291 /* Most systems support nl_langinfo (CODESET) nowadays. */ 292 codeset = nl_langinfo (CODESET); 293 294# else 295 296 /* On old systems which lack it, use setlocale or getenv. */ 297 const char *locale = NULL; 298 299 /* But most old systems don't have a complete set of locales. Some 300 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 301 use setlocale here; it would return "C" when it doesn't support the 302 locale name the user has set. */ 303# if HAVE_SETLOCALE && 0 304 locale = setlocale (LC_CTYPE, NULL); 305# endif 306 if (locale == NULL || locale[0] == '\0') 307 { 308 locale = getenv ("LC_ALL"); 309 if (locale == NULL || locale[0] == '\0') 310 { 311 locale = getenv ("LC_CTYPE"); 312 if (locale == NULL || locale[0] == '\0') 313 locale = getenv ("LANG"); 314 } 315 } 316 317 /* On some old systems, one used to set locale = "iso8859_1". On others, 318 you set it to "language_COUNTRY.charset". In any case, we resolve it 319 through the charset.alias file. */ 320 codeset = locale; 321 322# endif 323 324#elif defined WIN32 325 326 static char buf[2 + 10 + 1]; 327 328 /* Woe32 has a function returning the locale's codepage as a number. */ 329 sprintf (buf, "CP%u", GetACP ()); 330 codeset = buf; 331 332#elif defined OS2 333 334 const char *locale; 335 static char buf[2 + 10 + 1]; 336 ULONG cp[3]; 337 ULONG cplen; 338 339 /* Allow user to override the codeset, as set in the operating system, 340 with standard language environment variables. */ 341 locale = getenv ("LC_ALL"); 342 if (locale == NULL || locale[0] == '\0') 343 { 344 locale = getenv ("LC_CTYPE"); 345 if (locale == NULL || locale[0] == '\0') 346 locale = getenv ("LANG"); 347 } 348 if (locale != NULL && locale[0] != '\0') 349 { 350 /* If the locale name contains an encoding after the dot, return it. */ 351 const char *dot = strchr (locale, '.'); 352 353 if (dot != NULL) 354 { 355 const char *modifier; 356 357 dot++; 358 /* Look for the possible @... trailer and remove it, if any. */ 359 modifier = strchr (dot, '@'); 360 if (modifier == NULL) 361 return dot; 362 if (modifier - dot < sizeof (buf)) 363 { 364 memcpy (buf, dot, modifier - dot); 365 buf [modifier - dot] = '\0'; 366 return buf; 367 } 368 } 369 370 /* Resolve through the charset.alias file. */ 371 codeset = locale; 372 } 373 else 374 { 375 /* OS/2 has a function returning the locale's codepage as a number. */ 376 if (DosQueryCp (sizeof (cp), cp, &cplen)) 377 codeset = ""; 378 else 379 { 380 sprintf (buf, "CP%u", cp[0]); 381 codeset = buf; 382 } 383 } 384 385#endif 386 387 if (codeset == NULL) 388 /* The canonical name cannot be determined. */ 389 codeset = ""; 390 391 /* Resolve alias. */ 392 for (aliases = get_charset_aliases (); 393 *aliases != '\0'; 394 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 395 if (strcmp (codeset, aliases) == 0 396 || (aliases[0] == '*' && aliases[1] == '\0')) 397 { 398 codeset = aliases + strlen (aliases) + 1; 399 break; 400 } 401 402 /* Don't return an empty string. GNU libc and GNU libiconv interpret 403 the empty string as denoting "the locale's character encoding", 404 thus GNU libiconv would call this function a second time. */ 405 if (codeset[0] == '\0') 406 codeset = "ASCII"; 407 408 return codeset; 409} 410