1/* Charset handling while reading PO files. 2 Copyright (C) 2001-2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <haible@clisp.cons.org>, 2001. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 20#ifdef HAVE_CONFIG_H 21# include "config.h" 22#endif 23#include <alloca.h> 24 25/* Specification. */ 26#include "po-charset.h" 27 28#include <stdlib.h> 29#include <string.h> 30 31#include "xallocsa.h" 32#include "xvasprintf.h" 33#include "po-xerror.h" 34#include "basename.h" 35#include "progname.h" 36#include "c-strstr.h" 37#include "c-strcase.h" 38#include "gettext.h" 39 40#define _(str) gettext (str) 41 42#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 43 44static const char ascii[] = "ASCII"; 45 46/* The canonicalized encoding name for ASCII. */ 47const char *po_charset_ascii = ascii; 48 49static const char utf8[] = "UTF-8"; 50 51/* The canonicalized encoding name for UTF-8. */ 52const char *po_charset_utf8 = utf8; 53 54/* Canonicalize an encoding name. */ 55const char * 56po_charset_canonicalize (const char *charset) 57{ 58 /* The list of charsets supported by glibc's iconv() and by the portable 59 iconv() across platforms. Taken from intl/config.charset. 
*/ 60 static const char *standard_charsets[] = 61 { 62 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */ 63 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */ 64 "ISO-8859-2", "ISO_8859-2", 65 "ISO-8859-3", "ISO_8859-3", 66 "ISO-8859-4", "ISO_8859-4", 67 "ISO-8859-5", "ISO_8859-5", 68 "ISO-8859-6", "ISO_8859-6", 69 "ISO-8859-7", "ISO_8859-7", 70 "ISO-8859-8", "ISO_8859-8", 71 "ISO-8859-9", "ISO_8859-9", 72 "ISO-8859-13", "ISO_8859-13", 73 "ISO-8859-14", "ISO_8859-14", 74 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */ 75 "KOI8-R", 76 "KOI8-U", 77 "KOI8-T", 78 "CP850", 79 "CP866", 80 "CP874", 81 "CP932", 82 "CP949", 83 "CP950", 84 "CP1250", 85 "CP1251", 86 "CP1252", 87 "CP1253", 88 "CP1254", 89 "CP1255", 90 "CP1256", 91 "CP1257", 92 "GB2312", 93 "EUC-JP", 94 "EUC-KR", 95 "EUC-TW", 96 "BIG5", 97 "BIG5-HKSCS", 98 "GBK", 99 "GB18030", 100 "SHIFT_JIS", 101 "JOHAB", 102 "TIS-620", 103 "VISCII", 104 "GEORGIAN-PS", 105 utf8 106 }; 107 size_t i; 108 109 for (i = 0; i < SIZEOF (standard_charsets); i++) 110 if (c_strcasecmp (charset, standard_charsets[i]) == 0) 111 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i]; 112 return NULL; 113} 114 115/* Test for ASCII compatibility. */ 116bool 117po_charset_ascii_compatible (const char *canon_charset) 118{ 119 /* There are only a few exceptions to ASCII compatibility. */ 120 if (strcmp (canon_charset, "SHIFT_JIS") == 0 121 || strcmp (canon_charset, "JOHAB") == 0 122 || strcmp (canon_charset, "VISCII") == 0) 123 return false; 124 else 125 return true; 126} 127 128/* Test for a weird encoding, i.e. an encoding which has double-byte 129 characters ending in 0x5C. 
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is compared with strcmp, so it must already be in
   canonical spelling.  */
bool
po_is_charset_weird (const char *canon_charset)
{
  static const char *weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const char **entry = weird_charsets;
  const char **table_end =
    weird_charsets + sizeof (weird_charsets) / sizeof (weird_charsets[0]);

  while (entry < table_end)
    {
      if (strcmp (canon_charset, *entry) == 0)
        return true;
      entry++;
    }
  return false;
}

/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is compared with strcmp, so it must already be in
   canonical spelling.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const char **entry = weird_cjk_charsets;
  const char **table_end =
    weird_cjk_charsets
    + sizeof (weird_cjk_charsets) / sizeof (weird_cjk_charsets[0]);

  while (entry < table_end)
    {
      if (strcmp (canon_charset, *entry) == 0)
        return true;
      entry++;
    }
  return false;
}

/* Hardcoded iterator functions for all kinds of encodings.
   We could also implement a general iterator function with iconv(),
   but we need a fast one.  */

/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so S is not inspected.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}

/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 when both the first and the second byte of S lie in
   0xA1..0xFE, and 1 otherwise.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;

  /* The second byte is only looked at after the first one has been
     accepted, so the string terminator is never stepped over.  */
  if (bytes[0] >= 0xa1 && bytes[0] <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;

  return 1;
}
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized shapes (anything else counts as a single byte):
     0x{A1..FE}{A1..FE}         2 bytes
     0x8E{A1..DF}               2 bytes
     0x8F{A1..FE}{A1..FE}       3 bytes  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead == 0x8e)
    return (p[1] >= 0xa1 && p[1] <= 0xdf) ? 2 : 1;

  if (lead == 0x8f)
    /* p[2] is only read once p[1] has been accepted.  */
    return (p[1] >= 0xa1 && p[1] <= 0xfe
            && p[2] >= 0xa1 && p[2] <= 0xfe) ? 3 : 1;

  if (lead >= 0xa1 && lead <= 0xfe)
    return (p[1] >= 0xa1 && p[1] <= 0xfe) ? 2 : 1;

  return 1;
}

/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized shapes (anything else counts as a single byte):
     0x{A1..FE}{A1..FE}                 2 bytes
     0x8E{A1..B0}{A1..FE}{A1..FE}       4 bytes  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead == 0x8e)
    /* Each further byte is only read once the previous one has been
       accepted.  */
    return (p[1] >= 0xa1 && p[1] <= 0xb0
            && p[2] >= 0xa1 && p[2] <= 0xfe
            && p[3] >= 0xa1 && p[3] <= 0xfe) ? 4 : 1;

  if (lead >= 0xa1 && lead <= 0xfe)
    return (p[1] >= 0xa1 && p[1] <= 0xfe) ? 2 : 1;

  return 1;
}

/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for 0x{A1..FE}{40..7E,A1..FE}, and 1 otherwise.  */
static size_t
big5_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] < 0xa1 || p[0] > 0xfe)
    return 1;

  if ((p[1] >= 0x40 && p[1] <= 0x7e) || (p[1] >= 0xa1 && p[1] <= 0xfe))
    return 2;

  return 1;
}

/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 for 0x{88..FE}{40..7E,A1..FE}, and 1 otherwise.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] < 0x88 || p[0] > 0xfe)
    return 1;

  if ((p[1] >= 0x40 && p[1] <= 0x7e) || (p[1] >= 0xa1 && p[1] <= 0xfe))
    return 2;

  return 1;
}
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for 0x{81..FE}{40..7E,80..FE}, and 1 otherwise.  */
static size_t
gbk_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] >= 0x81 && p[0] <= 0xfe)
    {
      /* The trail byte is only read after the lead byte was accepted, so
         the string terminator is never stepped over.  */
      unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e) || (trail >= 0x80 && trail <= 0xfe))
        return 2;
    }
  return 1;
}

/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized shapes (anything else counts as a single byte):
     0x{81..FE}{40..7E,80..FE}                  2 bytes
     0x{81..84}{30..39}{81..FE}{30..39}         4 bytes
     0x{90..E3}{30..39}{81..FE}{30..39}         4 bytes
   The last shape is the one GB18030 uses for the supplementary planes
   U+10000..U+10FFFF.  Without it such a character would be reported as
   four separate one-byte characters.  */
static size_t
gb18030_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead >= 0x81 && lead <= 0xfe)
    {
      unsigned char second = p[1];

      if ((second >= 0x40 && second <= 0x7e)
          || (second >= 0x80 && second <= 0xfe))
        return 2;

      /* The second byte of a four-byte sequence is a digit byte, which
         never overlaps with the two-byte trail ranges tested above.  Each
         further byte is only read once the previous one was accepted.  */
      if (((lead >= 0x81 && lead <= 0x84) || (lead >= 0x90 && lead <= 0xe3))
          && second >= 0x30 && second <= 0x39
          && p[2] >= 0x81 && p[2] <= 0xfe
          && p[3] >= 0x30 && p[3] <= 0x39)
        return 4;
    }
  return 1;
}

/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for 0x{81..9F,E0..F9}{40..7E,80..FC}, and 1 otherwise.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if ((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9))
    {
      unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e) || (trail >= 0x80 && trail <= 0xfc))
        return 2;
    }
  return 1;
}

/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized shapes (anything else counts as a single byte):
     0x{84..D3}{41..7E,81..FE}          2 bytes
     0x{D9..F9}{31..7E,91..FE}          2 bytes  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char lead = p[0];

  if (lead >= 0x84 && lead <= 0xd3)
    {
      unsigned char trail = p[1];

      if ((trail >= 0x41 && trail <= 0x7e) || (trail >= 0x81 && trail <= 0xfe))
        return 2;
    }
  else if (lead >= 0xd9 && lead <= 0xf9)
    {
      unsigned char trail = p[1];

      if ((trail >= 0x31 && trail <= 0x7e) || (trail >= 0x91 && trail <= 0xfe))
        return 2;
    }
  return 1;
}
*/ 363static size_t 364utf8_character_iterator (const char *s) 365{ 366 unsigned char c = *s; 367 if (c >= 0xc2) 368 { 369 if (c < 0xe0) 370 { 371 unsigned char c2 = s[1]; 372 if (c2 >= 0x80 && c2 < 0xc0) 373 return 2; 374 } 375 else if (c < 0xf0) 376 { 377 unsigned char c2 = s[1]; 378 if (c2 >= 0x80 && c2 < 0xc0) 379 { 380 unsigned char c3 = s[2]; 381 if (c3 >= 0x80 && c3 < 0xc0) 382 return 3; 383 } 384 } 385 else if (c < 0xf8) 386 { 387 unsigned char c2 = s[1]; 388 if (c2 >= 0x80 && c2 < 0xc0) 389 { 390 unsigned char c3 = s[2]; 391 if (c3 >= 0x80 && c3 < 0xc0) 392 { 393 unsigned char c4 = s[3]; 394 if (c4 >= 0x80 && c4 < 0xc0) 395 return 4; 396 } 397 } 398 } 399 } 400 return 1; 401} 402 403/* Returns a character iterator for a given encoding. 404 Given a pointer into a string, it returns the number occupied by the next 405 single character. If the piece of string is not valid or if the *s == '\0', 406 it returns 1. */ 407character_iterator_t 408po_charset_character_iterator (const char *canon_charset) 409{ 410 if (canon_charset == utf8) 411 return utf8_character_iterator; 412 if (strcmp (canon_charset, "GB2312") == 0 413 || strcmp (canon_charset, "EUC-KR") == 0) 414 return euc_character_iterator; 415 if (strcmp (canon_charset, "EUC-JP") == 0) 416 return euc_jp_character_iterator; 417 if (strcmp (canon_charset, "EUC-TW") == 0) 418 return euc_tw_character_iterator; 419 if (strcmp (canon_charset, "BIG5") == 0) 420 return big5_character_iterator; 421 if (strcmp (canon_charset, "BIG5-HKSCS") == 0) 422 return big5hkscs_character_iterator; 423 if (strcmp (canon_charset, "GBK") == 0) 424 return gbk_character_iterator; 425 if (strcmp (canon_charset, "GB18030") == 0) 426 return gb18030_character_iterator; 427 if (strcmp (canon_charset, "SHIFT_JIS") == 0) 428 return shift_jis_character_iterator; 429 if (strcmp (canon_charset, "JOHAB") == 0) 430 return johab_character_iterator; 431 return char_iterator; 432} 433 434 435/* The PO file's encoding, as specified in the header 
/* The PO file's encoding, as specified in the header entry.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;

/* Reset the lexer's charset state: no charset known, no open converter,
   and po_lex_weird_cjk cleared.
   The converter variable is overwritten without being closed, so this is
   presumably meant to be called before any converter has been opened
   (NOTE(review): confirm against the callers).  */
void
po_lex_charset_init ()
{
  po_lex_charset = NULL;
#if HAVE_ICONV
  po_lex_iconv = (iconv_t)(-1);
#endif
  po_lex_weird_cjk = false;
}

/* Extract the "charset=" value from HEADER_ENTRY and set up the lexer's
   charset state from it.
   FILENAME is used in the warnings and to recognize POT files by their
   ".pot" suffix.
   - If no "charset=" is present, a warning is issued unless FILENAME ends
     in ".pot"; the charset state is left untouched.
   - If the charset name cannot be canonicalized, a warning is issued
     (unless FILENAME ends in ".pot" and the name is the literal
     "CHARSET"); the charset state is left untouched.
   - Otherwise po_lex_charset is set to the canonical name and, depending
     on the OLD_PO_FILE_INPUT environment variable and on HAVE_ICONV,
     po_lex_iconv and po_lex_weird_cjk are set up as described below.
   All warnings are reported through po_xerror with PO_SEVERITY_WARNING.  */
void
po_lex_charset_set (const char *header_entry, const char *filename)
{
  /* Verify the validity of CHARSET.  It is necessary
     1. for the correct treatment of multibyte characters containing
        0x5C bytes in the PO lexer,
     2. so that at run time, gettext() can call iconv() to convert
        msgstr.  */
  const char *charsetstr = c_strstr (header_entry, "charset=");

  if (charsetstr != NULL)
    {
      size_t len;
      char *charset;
      const char *canon_charset;

      /* The charset name extends up to the next space, tab or newline
         (or to the end of the header entry).  Make a NUL-terminated copy
         of it; the copy is released by the freesa call at the end of this
         branch.  */
      charsetstr += strlen ("charset=");
      len = strcspn (charsetstr, " \t\n");
      charset = (char *) xallocsa (len + 1);
      memcpy (charset, charsetstr, len);
      charset[len] = '\0';

      canon_charset = po_charset_canonicalize (charset);
      if (canon_charset == NULL)
        {
          /* Don't warn for POT files, because POT files usually contain
             only ASCII msgids.  */
          size_t filenamelen = strlen (filename);

          /* The warning is only suppressed when the name is the literal
             "CHARSET" and the file name ends in ".pot".  */
          if (!(filenamelen >= 4
                && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
                && strcmp (charset, "CHARSET") == 0))
            {
              char *warning_message =
                xasprintf (_("\
Charset \"%s\" is not a portable encoding name.\n\
Message conversion to user's charset might not work.\n"),
                           charset);
              po_xerror (PO_SEVERITY_WARNING, NULL,
                         filename, (size_t)(-1), (size_t)(-1), true,
                         warning_message);
              free (warning_message);
            }
        }
      else
        {
          const char *envval;

          po_lex_charset = canon_charset;
#if HAVE_ICONV
          /* Release the converter left over from an earlier call, if any.
             po_lex_iconv is reassigned on every path below.  */
          if (po_lex_iconv != (iconv_t)(-1))
            iconv_close (po_lex_iconv);
#endif

          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
             don't know about multibyte encodings, and require a spurious
             backslash after every multibyte character whose last byte is
             0x5C.  Some programs, like vim, distribute PO files in this
             broken format.  GNU msgfmt must continue to support this old
             PO file format when the Makefile requests it.  */
          envval = getenv ("OLD_PO_FILE_INPUT");
          if (envval != NULL && *envval != '\0')
            {
              /* Assume the PO file is in old format, with extraneous
                 backslashes.  */
#if HAVE_ICONV
              po_lex_iconv = (iconv_t)(-1);
#endif
              po_lex_weird_cjk = false;
            }
          else
            {
              /* Use iconv() to parse multibyte characters.  */
#if HAVE_ICONV
              /* The two conditionally compiled "if ... else" fragments
                 below chain onto the iconv_open statement that follows
                 them: when a fragment is compiled in and its condition
                 holds, the converter is marked unavailable instead of
                 being opened.  */
              /* Avoid glibc-2.1 bug with EUC-KR.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
              if (strcmp (po_lex_charset, "EUC-KR") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
                 GBK, GB18030.  */
# if defined __sun && !defined _LIBICONV_VERSION
              if (   strcmp (po_lex_charset, "GB2312") == 0
                  || strcmp (po_lex_charset, "EUC-TW") == 0
                  || strcmp (po_lex_charset, "BIG5") == 0
                  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
                  || strcmp (po_lex_charset, "GBK") == 0
                  || strcmp (po_lex_charset, "GB18030") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
              /* NOTE(review): po_lex_weird_cjk is only assigned inside the
                 failure branch below; when a converter is obtained it
                 keeps its previous value.  */
              if (po_lex_iconv == (iconv_t)(-1))
                {
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv(),\n\
and iconv() does not support \"%s\".\n"),
                               po_lex_charset, basename (program_name),
                               po_lex_charset);

                  /* Recommending GNU libiconv only makes sense when it is
                     not the iconv implementation already in use.  */
# if !defined _LIBICONV_VERSION
                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");
# else
                  recommendation = "";
# endif

                  /* Test for a charset which has double-byte characters
                     ending in 0x5C.  For these encodings, the string parser
                     is likely to be confused if it can't see the character
                     boundaries.  */
                  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
                  if (po_is_charset_weird (po_lex_charset)
                      && !po_lex_weird_cjk)
                    note = _("Continuing anyway, expect parse errors.");
                  else
                    note = _("Continuing anyway.");

                  /* The warning is assembled from three parts: what
                     failed, how to fix it, and what happens next.  */
                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#else
              /* Test for a charset which has double-byte characters
                 ending in 0x5C.  For these encodings, the string parser
                 is likely to be confused if it can't see the character
                 boundaries.  */
              po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
              /* Without iconv, only warn for encodings that are weird but
                 do not have the CJK structure.  */
              if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
                {
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv().\n\
This version was built without iconv().\n"),
                               po_lex_charset, basename (program_name));

                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");

                  note = _("Continuing anyway, expect parse errors.");

                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#endif
            }
        }
      /* Release the temporary copy of the charset name.  */
      freesa (charset);
    }
  else
    {
      /* Don't warn for POT files, because POT files usually contain
         only ASCII msgids.  */
      size_t filenamelen = strlen (filename);

      if (!(filenamelen >= 4
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
        po_xerror (PO_SEVERITY_WARNING,
                   NULL, filename, (size_t)(-1), (size_t)(-1), true,
                   _("\
Charset missing in header.\n\
Message conversion to user's charset will not work.\n"));
    }
}

/* Forget the current PO file's charset: clear po_lex_charset, close the
   converter if one is open and mark it as unavailable, and clear
   po_lex_weird_cjk.  */
void
po_lex_charset_close ()
{
  po_lex_charset = NULL;
#if HAVE_ICONV
  if (po_lex_iconv != (iconv_t)(-1))
    {
      iconv_close (po_lex_iconv);
      po_lex_iconv = (iconv_t)(-1);
    }
#endif
  po_lex_weird_cjk = false;
}