1/* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2002 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 18 USA. */ 19 20/* Written by Bruno Haible <bruno@clisp.org>. */ 21 22#ifdef HAVE_CONFIG_H 23# include <config.h> 24#endif 25 26#if HAVE_STDDEF_H 27# include <stddef.h> 28#endif 29 30#include <stdio.h> 31#if HAVE_STRING_H 32# include <string.h> 33#else 34# include <strings.h> 35#endif 36#if HAVE_STDLIB_H 37# include <stdlib.h> 38#endif 39 40#if defined _WIN32 || defined __WIN32__ 41# undef WIN32 /* avoid warning on mingw32 */ 42# define WIN32 43#endif 44 45#if defined __EMX__ 46/* Assume EMX program runs on OS/2, even if compiled under DOS. */ 47# define OS2 48#endif 49 50#if !defined WIN32 51# if HAVE_LANGINFO_CODESET 52# include <langinfo.h> 53# else 54# if HAVE_SETLOCALE 55# include <locale.h> 56# endif 57# endif 58#elif defined WIN32 59# define WIN32_LEAN_AND_MEAN 60# include <windows.h> 61#endif 62#if defined OS2 63# define INCL_DOS 64# include <os2.h> 65#endif 66 67#if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__ 68 /* Win32, OS/2, DOS */ 69# define ISSLASH(C) ((C) == '/' || (C) == '\\') 70#endif 71 72#ifndef DIRECTORY_SEPARATOR 73# define DIRECTORY_SEPARATOR '/' 74#endif 75 76#ifndef ISSLASH 77# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 78#endif 79 80#ifdef HAVE_GETC_UNLOCKED 81# undef getc 82# define getc getc_unlocked 83#endif 84 85#ifdef __cplusplus 86/* When compiling with "gcc -x c++", produce a function with C linkage. */ 87extern "C" const char * locale_charset (void); 88#endif 89 90/* The following static variable is declared 'volatile' to avoid a 91 possible multithread problem in the function get_charset_aliases. If we 92 are running in a threaded environment, and if two threads initialize 93 'charset_aliases' simultaneously, both will produce the same value, 94 and everything will be ok if the two assignments to 'charset_aliases' 95 are atomic. But I don't know what will happen if the two assignments mix. */ 96#if __STDC__ != 1 97# define volatile /* empty */ 98#endif 99/* Pointer to the contents of the charset.alias file, if it has already been 100 read, else NULL. Its format is: 101 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 102static const char * volatile charset_aliases; 103 104/* Return a pointer to the contents of the charset.alias file. */ 105static const char * 106get_charset_aliases () 107{ 108 const char *cp; 109 110 cp = charset_aliases; 111 if (cp == NULL) 112 { 113#if !defined WIN32 114 FILE *fp; 115 const char *dir = LIBDIR; 116 const char *base = "charset.alias"; 117 char *file_name; 118 119 /* Concatenate dir and base into freshly allocated file_name. */ 120 { 121 size_t dir_len = strlen (dir); 122 size_t base_len = strlen (base); 123 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 124 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 125 if (file_name != NULL) 126 { 127 memcpy (file_name, dir, dir_len); 128 if (add_slash) 129 file_name[dir_len] = DIRECTORY_SEPARATOR; 130 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 131 } 132 } 133 134 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 135 /* Out of memory or file not found, treat it as empty. */ 136 cp = ""; 137 else 138 { 139 /* Parse the file's contents. */ 140 int c; 141 char buf1[50+1]; 142 char buf2[50+1]; 143 char *res_ptr = NULL; 144 size_t res_size = 0; 145 size_t l1, l2; 146 147 for (;;) 148 { 149 c = getc (fp); 150 if (c == EOF) 151 break; 152 if (c == '\n' || c == ' ' || c == '\t') 153 continue; 154 if (c == '#') 155 { 156 /* Skip comment, to end of line. */ 157 do 158 c = getc (fp); 159 while (!(c == EOF || c == '\n')); 160 if (c == EOF) 161 break; 162 continue; 163 } 164 ungetc (c, fp); 165 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 166 break; 167 l1 = strlen (buf1); 168 l2 = strlen (buf2); 169 if (res_size == 0) 170 { 171 res_size = l1 + 1 + l2 + 1; 172 res_ptr = (char *) malloc (res_size + 1); 173 } 174 else 175 { 176 res_size += l1 + 1 + l2 + 1; 177 res_ptr = (char *) realloc (res_ptr, res_size + 1); 178 } 179 if (res_ptr == NULL) 180 { 181 /* Out of memory. */ 182 res_size = 0; 183 break; 184 } 185 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 186 strcpy (res_ptr + res_size - (l2 + 1), buf2); 187 } 188 fclose (fp); 189 if (res_size == 0) 190 cp = ""; 191 else 192 { 193 *(res_ptr + res_size) = '\0'; 194 cp = res_ptr; 195 } 196 } 197 198 if (file_name != NULL) 199 free (file_name); 200 201#else 202 203 /* To avoid the troubles of installing a separate file in the same 204 directory as the DLL and of retrieving the DLL's directory at 205 runtime, simply inline the aliases here. */ 206 207# if defined WIN32 208 cp = "CP936" "\0" "GBK" "\0" 209 "CP1361" "\0" "JOHAB" "\0" 210 "CP20127" "\0" "ASCII" "\0" 211 "CP20866" "\0" "KOI8-R" "\0" 212 "CP21866" "\0" "KOI8-RU" "\0" 213 "CP28591" "\0" "ISO-8859-1" "\0" 214 "CP28592" "\0" "ISO-8859-2" "\0" 215 "CP28593" "\0" "ISO-8859-3" "\0" 216 "CP28594" "\0" "ISO-8859-4" "\0" 217 "CP28595" "\0" "ISO-8859-5" "\0" 218 "CP28596" "\0" "ISO-8859-6" "\0" 219 "CP28597" "\0" "ISO-8859-7" "\0" 220 "CP28598" "\0" "ISO-8859-8" "\0" 221 "CP28599" "\0" "ISO-8859-9" "\0" 222 "CP28605" "\0" "ISO-8859-15" "\0"; 223# endif 224#endif 225 226 charset_aliases = cp; 227 } 228 229 return cp; 230} 231 232/* Determine the current locale's character encoding, and canonicalize it 233 into one of the canonical names listed in config.charset. 234 The result must not be freed; it is statically allocated. 235 If the canonical name cannot be determined, the result is a non-canonical 236 name. */ 237 238#ifdef STATIC 239STATIC 240#endif 241const char * 242locale_charset () 243{ 244 const char *codeset; 245 const char *aliases; 246 247#if !(defined WIN32 || defined OS2) 248 249# if HAVE_LANGINFO_CODESET 250 251 /* Most systems support nl_langinfo (CODESET) nowadays. */ 252 codeset = nl_langinfo (CODESET); 253 254# else 255 256 /* On old systems which lack it, use setlocale or getenv. */ 257 const char *locale = NULL; 258 259 /* But most old systems don't have a complete set of locales. Some 260 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 261 use setlocale here; it would return "C" when it doesn't support the 262 locale name the user has set. */ 263# if HAVE_SETLOCALE && 0 264 locale = setlocale (LC_CTYPE, NULL); 265# endif 266 if (locale == NULL || locale[0] == '\0') 267 { 268 locale = getenv ("LC_ALL"); 269 if (locale == NULL || locale[0] == '\0') 270 { 271 locale = getenv ("LC_CTYPE"); 272 if (locale == NULL || locale[0] == '\0') 273 locale = getenv ("LANG"); 274 } 275 } 276 277 /* On some old systems, one used to set locale = "iso8859_1". On others, 278 you set it to "language_COUNTRY.charset". In any case, we resolve it 279 through the charset.alias file. */ 280 codeset = locale; 281 282# endif 283 284#elif defined WIN32 285 286 static char buf[2 + 10 + 1]; 287 288 /* Woe32 has a function returning the locale's codepage as a number. */ 289 sprintf (buf, "CP%u", GetACP ()); 290 codeset = buf; 291 292#elif defined OS2 293 294 const char *locale; 295 static char buf[2 + 10 + 1]; 296 ULONG cp[3]; 297 ULONG cplen; 298 299 /* Allow user to override the codeset, as set in the operating system, 300 with standard language environment variables. */ 301 locale = getenv ("LC_ALL"); 302 if (locale == NULL || locale[0] == '\0') 303 { 304 locale = getenv ("LC_CTYPE"); 305 if (locale == NULL || locale[0] == '\0') 306 locale = getenv ("LANG"); 307 } 308 if (locale != NULL && locale[0] != '\0') 309 { 310 /* If the locale name contains an encoding after the dot, return it. */ 311 const char *dot = strchr (locale, '.'); 312 313 if (dot != NULL) 314 { 315 const char *modifier; 316 317 dot++; 318 /* Look for the possible @... trailer and remove it, if any. */ 319 modifier = strchr (dot, '@'); 320 if (modifier == NULL) 321 return dot; 322 if (modifier - dot < sizeof (buf)) 323 { 324 memcpy (buf, dot, modifier - dot); 325 buf [modifier - dot] = '\0'; 326 return buf; 327 } 328 } 329 330 /* Resolve through the charset.alias file. */ 331 codeset = locale; 332 } 333 else 334 { 335 /* OS/2 has a function returning the locale's codepage as a number. */ 336 if (DosQueryCp (sizeof (cp), cp, &cplen)) 337 codeset = ""; 338 else 339 { 340 sprintf (buf, "CP%u", cp[0]); 341 codeset = buf; 342 } 343 } 344 345#endif 346 347 if (codeset == NULL) 348 /* The canonical name cannot be determined. */ 349 codeset = ""; 350 351 /* Resolve alias. */ 352 for (aliases = get_charset_aliases (); 353 *aliases != '\0'; 354 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 355 if (strcmp (codeset, aliases) == 0 356 || (aliases[0] == '*' && aliases[1] == '\0')) 357 { 358 codeset = aliases + strlen (aliases) + 1; 359 break; 360 } 361 362 /* Don't return an empty string. GNU libc and GNU libiconv interpret 363 the empty string as denoting "the locale's character encoding", 364 thus GNU libiconv would call this function a second time. */ 365 if (codeset[0] == '\0') 366 codeset = "ASCII"; 367 368 return codeset; 369} 370