1/* localcharset.c - Determine a canonical name for the current locale's character encoding. */ 2 3/* Copyright (C) 2000-2003, 2005-2009 Free Software Foundation, Inc. 4 5 This file is part of GNU Bash. 6 7 Bash is free software: you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation, either version 3 of the License, or 10 (at your option) any later version. 11 12 Bash is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with Bash. If not, see <http://www.gnu.org/licenses/>. 19*/ 20 21/* Written by Bruno Haible <bruno@clisp.org>. */ 22 23#ifdef HAVE_CONFIG_H 24# include <config.h> 25#endif 26 27/* Specification. */ 28#include "localcharset.h" 29 30#if HAVE_STDDEF_H 31# include <stddef.h> 32#endif 33 34#include <stdio.h> 35#if HAVE_STRING_H 36# include <string.h> 37#else 38# include <strings.h> 39#endif 40#if HAVE_STDLIB_H 41# include <stdlib.h> 42#endif 43 44#if defined _WIN32 || defined __WIN32__ 45# undef WIN32 /* avoid warning on mingw32 */ 46# define WIN32 47#endif 48 49#if defined __EMX__ 50/* Assume EMX program runs on OS/2, even if compiled under DOS. */ 51# define OS2 52#endif 53 54#if !defined WIN32 55# if HAVE_LANGINFO_CODESET 56# include <langinfo.h> 57# else 58# if HAVE_SETLOCALE 59# include <locale.h> 60# endif 61# endif 62#elif defined WIN32 63# define WIN32_LEAN_AND_MEAN 64# include <windows.h> 65#endif 66#if defined OS2 67# define INCL_DOS 68# include <os2.h> 69#endif 70 71#if ENABLE_RELOCATABLE 72# include "relocatable.h" 73#else 74# define relocate(pathname) (pathname) 75#endif 76 77#if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__ 78 /* Win32, OS/2, DOS */ 79# define ISSLASH(C) ((C) == '/' || (C) == '\\') 80#endif 81 82#ifndef DIRECTORY_SEPARATOR 83# define DIRECTORY_SEPARATOR '/' 84#endif 85 86#ifndef ISSLASH 87# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 88#endif 89 90#ifdef HAVE_GETC_UNLOCKED 91# undef getc 92# define getc getc_unlocked 93#endif 94 95/* The following static variable is declared 'volatile' to avoid a 96 possible multithread problem in the function get_charset_aliases. If we 97 are running in a threaded environment, and if two threads initialize 98 'charset_aliases' simultaneously, both will produce the same value, 99 and everything will be ok if the two assignments to 'charset_aliases' 100 are atomic. But I don't know what will happen if the two assignments mix. */ 101#if __STDC__ != 1 102# define volatile /* empty */ 103#endif 104/* Pointer to the contents of the charset.alias file, if it has already been 105 read, else NULL. Its format is: 106 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 107static const char * volatile charset_aliases; 108 109/* Return a pointer to the contents of the charset.alias file. */ 110static const char * 111get_charset_aliases () 112{ 113 const char *cp; 114 115 cp = charset_aliases; 116 if (cp == NULL) 117 { 118#if !(defined VMS || defined WIN32) 119 FILE *fp; 120 const char *dir = relocate (LIBDIR); 121 const char *base = "charset.alias"; 122 char *file_name; 123 124 /* Concatenate dir and base into freshly allocated file_name. */ 125 { 126 size_t dir_len = strlen (dir); 127 size_t base_len = strlen (base); 128 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 129 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 130 if (file_name != NULL) 131 { 132 memcpy (file_name, dir, dir_len); 133 if (add_slash) 134 file_name[dir_len] = DIRECTORY_SEPARATOR; 135 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 136 } 137 } 138 139 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 140 /* Out of memory or file not found, treat it as empty. */ 141 cp = ""; 142 else 143 { 144 /* Parse the file's contents. */ 145 int c; 146 char buf1[50+1]; 147 char buf2[50+1]; 148 char *res_ptr = NULL; 149 size_t res_size = 0; 150 size_t l1, l2; 151 152 for (;;) 153 { 154 c = getc (fp); 155 if (c == EOF) 156 break; 157 if (c == '\n' || c == ' ' || c == '\t') 158 continue; 159 if (c == '#') 160 { 161 /* Skip comment, to end of line. */ 162 do 163 c = getc (fp); 164 while (!(c == EOF || c == '\n')); 165 if (c == EOF) 166 break; 167 continue; 168 } 169 ungetc (c, fp); 170 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 171 break; 172 l1 = strlen (buf1); 173 l2 = strlen (buf2); 174 if (res_size == 0) 175 { 176 res_size = l1 + 1 + l2 + 1; 177 res_ptr = (char *) malloc (res_size + 1); 178 } 179 else 180 { 181 res_size += l1 + 1 + l2 + 1; 182 res_ptr = (char *) realloc (res_ptr, res_size + 1); 183 } 184 if (res_ptr == NULL) 185 { 186 /* Out of memory. */ 187 res_size = 0; 188 break; 189 } 190 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 191 strcpy (res_ptr + res_size - (l2 + 1), buf2); 192 } 193 fclose (fp); 194 if (res_size == 0) 195 cp = ""; 196 else 197 { 198 *(res_ptr + res_size) = '\0'; 199 cp = res_ptr; 200 } 201 } 202 203 if (file_name != NULL) 204 free (file_name); 205 206#else 207 208# if defined VMS 209 /* To avoid the troubles of an extra file charset.alias_vms in the 210 sources of many GNU packages, simply inline the aliases here. */ 211 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 212 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 213 section 10.7 "Handling Different Character Sets". */ 214 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 215 "ISO8859-2" "\0" "ISO-8859-2" "\0" 216 "ISO8859-5" "\0" "ISO-8859-5" "\0" 217 "ISO8859-7" "\0" "ISO-8859-7" "\0" 218 "ISO8859-8" "\0" "ISO-8859-8" "\0" 219 "ISO8859-9" "\0" "ISO-8859-9" "\0" 220 /* Japanese */ 221 "eucJP" "\0" "EUC-JP" "\0" 222 "SJIS" "\0" "SHIFT_JIS" "\0" 223 "DECKANJI" "\0" "DEC-KANJI" "\0" 224 "SDECKANJI" "\0" "EUC-JP" "\0" 225 /* Chinese */ 226 "eucTW" "\0" "EUC-TW" "\0" 227 "DECHANYU" "\0" "DEC-HANYU" "\0" 228 "DECHANZI" "\0" "GB2312" "\0" 229 /* Korean */ 230 "DECKOREAN" "\0" "EUC-KR" "\0"; 231# endif 232 233# if defined WIN32 234 /* To avoid the troubles of installing a separate file in the same 235 directory as the DLL and of retrieving the DLL's directory at 236 runtime, simply inline the aliases here. */ 237 238 cp = "CP936" "\0" "GBK" "\0" 239 "CP1361" "\0" "JOHAB" "\0" 240 "CP20127" "\0" "ASCII" "\0" 241 "CP20866" "\0" "KOI8-R" "\0" 242 "CP21866" "\0" "KOI8-RU" "\0" 243 "CP28591" "\0" "ISO-8859-1" "\0" 244 "CP28592" "\0" "ISO-8859-2" "\0" 245 "CP28593" "\0" "ISO-8859-3" "\0" 246 "CP28594" "\0" "ISO-8859-4" "\0" 247 "CP28595" "\0" "ISO-8859-5" "\0" 248 "CP28596" "\0" "ISO-8859-6" "\0" 249 "CP28597" "\0" "ISO-8859-7" "\0" 250 "CP28598" "\0" "ISO-8859-8" "\0" 251 "CP28599" "\0" "ISO-8859-9" "\0" 252 "CP28605" "\0" "ISO-8859-15" "\0"; 253# endif 254#endif 255 256 charset_aliases = cp; 257 } 258 259 return cp; 260} 261 262/* Determine the current locale's character encoding, and canonicalize it 263 into one of the canonical names listed in config.charset. 264 The result must not be freed; it is statically allocated. 265 If the canonical name cannot be determined, the result is a non-canonical 266 name. */ 267 268#ifdef STATIC 269STATIC 270#endif 271const char * 272locale_charset () 273{ 274 const char *codeset; 275 const char *aliases; 276 277#if !(defined WIN32 || defined OS2) 278 279# if HAVE_LANGINFO_CODESET 280 281 /* Most systems support nl_langinfo (CODESET) nowadays. */ 282 codeset = nl_langinfo (CODESET); 283 284# else 285 286 /* On old systems which lack it, use setlocale or getenv. */ 287 const char *locale = NULL; 288 289 /* But most old systems don't have a complete set of locales. Some 290 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 291 use setlocale here; it would return "C" when it doesn't support the 292 locale name the user has set. */ 293# if HAVE_SETLOCALE && 0 294 locale = setlocale (LC_CTYPE, NULL); 295# endif 296 if (locale == NULL || locale[0] == '\0') 297 { 298 locale = getenv ("LC_ALL"); 299 if (locale == NULL || locale[0] == '\0') 300 { 301 locale = getenv ("LC_CTYPE"); 302 if (locale == NULL || locale[0] == '\0') 303 locale = getenv ("LANG"); 304 } 305 } 306 307 /* On some old systems, one used to set locale = "iso8859_1". On others, 308 you set it to "language_COUNTRY.charset". In any case, we resolve it 309 through the charset.alias file. */ 310 codeset = locale; 311 312# endif 313 314#elif defined WIN32 315 316 static char buf[2 + 10 + 1]; 317 318 /* Woe32 has a function returning the locale's codepage as a number. */ 319 sprintf (buf, "CP%u", GetACP ()); 320 codeset = buf; 321 322#elif defined OS2 323 324 const char *locale; 325 static char buf[2 + 10 + 1]; 326 ULONG cp[3]; 327 ULONG cplen; 328 329 /* Allow user to override the codeset, as set in the operating system, 330 with standard language environment variables. */ 331 locale = getenv ("LC_ALL"); 332 if (locale == NULL || locale[0] == '\0') 333 { 334 locale = getenv ("LC_CTYPE"); 335 if (locale == NULL || locale[0] == '\0') 336 locale = getenv ("LANG"); 337 } 338 if (locale != NULL && locale[0] != '\0') 339 { 340 /* If the locale name contains an encoding after the dot, return it. */ 341 const char *dot = strchr (locale, '.'); 342 343 if (dot != NULL) 344 { 345 const char *modifier; 346 347 dot++; 348 /* Look for the possible @... trailer and remove it, if any. */ 349 modifier = strchr (dot, '@'); 350 if (modifier == NULL) 351 return dot; 352 if (modifier - dot < sizeof (buf)) 353 { 354 memcpy (buf, dot, modifier - dot); 355 buf [modifier - dot] = '\0'; 356 return buf; 357 } 358 } 359 360 /* Resolve through the charset.alias file. */ 361 codeset = locale; 362 } 363 else 364 { 365 /* OS/2 has a function returning the locale's codepage as a number. */ 366 if (DosQueryCp (sizeof (cp), cp, &cplen)) 367 codeset = ""; 368 else 369 { 370 sprintf (buf, "CP%u", cp[0]); 371 codeset = buf; 372 } 373 } 374 375#endif 376 377 if (codeset == NULL) 378 /* The canonical name cannot be determined. */ 379 codeset = ""; 380 381 /* Resolve alias. */ 382 for (aliases = get_charset_aliases (); 383 *aliases != '\0'; 384 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 385 if (strcmp (codeset, aliases) == 0 386 || (aliases[0] == '*' && aliases[1] == '\0')) 387 { 388 codeset = aliases + strlen (aliases) + 1; 389 break; 390 } 391 392 /* Don't return an empty string. GNU libc and GNU libiconv interpret 393 the empty string as denoting "the locale's character encoding", 394 thus GNU libiconv would call this function a second time. */ 395 if (codeset[0] == '\0') 396 codeset = "ASCII"; 397 398 return codeset; 399} 400