/* Charset handling while reading PO files.
   Copyright (C) 2001-2006 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23#include <alloca.h>
24
25/* Specification.  */
26#include "po-charset.h"
27
28#include <stdlib.h>
29#include <string.h>
30
31#include "xallocsa.h"
32#include "xvasprintf.h"
33#include "po-xerror.h"
34#include "basename.h"
35#include "progname.h"
36#include "c-strstr.h"
37#include "c-strcase.h"
38#include "gettext.h"
39
/* Shorthand for translating a user-visible message with gettext.  */
#define _(str) gettext (str)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed without precedence surprises.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
43
/* Storage for the two canonical names that have a public alias.  The
   charset table in po_charset_canonicalize reuses these arrays, so the
   canonicalized names for ASCII and UTF-8 point here.  */
static const char ascii[] = "ASCII";
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
53
54/* Canonicalize an encoding name.  */
55const char *
56po_charset_canonicalize (const char *charset)
57{
58  /* The list of charsets supported by glibc's iconv() and by the portable
59     iconv() across platforms.  Taken from intl/config.charset.  */
60  static const char *standard_charsets[] =
61  {
62    ascii, "ANSI_X3.4-1968", "US-ASCII",	/* i = 0..2 */
63    "ISO-8859-1", "ISO_8859-1",			/* i = 3, 4 */
64    "ISO-8859-2", "ISO_8859-2",
65    "ISO-8859-3", "ISO_8859-3",
66    "ISO-8859-4", "ISO_8859-4",
67    "ISO-8859-5", "ISO_8859-5",
68    "ISO-8859-6", "ISO_8859-6",
69    "ISO-8859-7", "ISO_8859-7",
70    "ISO-8859-8", "ISO_8859-8",
71    "ISO-8859-9", "ISO_8859-9",
72    "ISO-8859-13", "ISO_8859-13",
73    "ISO-8859-14", "ISO_8859-14",
74    "ISO-8859-15", "ISO_8859-15",		/* i = 25, 26 */
75    "KOI8-R",
76    "KOI8-U",
77    "KOI8-T",
78    "CP850",
79    "CP866",
80    "CP874",
81    "CP932",
82    "CP949",
83    "CP950",
84    "CP1250",
85    "CP1251",
86    "CP1252",
87    "CP1253",
88    "CP1254",
89    "CP1255",
90    "CP1256",
91    "CP1257",
92    "GB2312",
93    "EUC-JP",
94    "EUC-KR",
95    "EUC-TW",
96    "BIG5",
97    "BIG5-HKSCS",
98    "GBK",
99    "GB18030",
100    "SHIFT_JIS",
101    "JOHAB",
102    "TIS-620",
103    "VISCII",
104    "GEORGIAN-PS",
105    utf8
106  };
107  size_t i;
108
109  for (i = 0; i < SIZEOF (standard_charsets); i++)
110    if (c_strcasecmp (charset, standard_charsets[i]) == 0)
111      return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
112  return NULL;
113}
114
/* Test for ASCII compatibility.
   CANON_CHARSET is a canonical encoding name.  Returns false for the three
   known exceptions (SHIFT_JIS, JOHAB, VISCII) and true for every other
   name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  bool is_exception =
    (strcmp (canon_charset, "SHIFT_JIS") == 0
     || strcmp (canon_charset, "JOHAB") == 0
     || strcmp (canon_charset, "VISCII") == 0);

  return !is_exception;
}
127
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is a canonical encoding name; the comparison is exact
   (case-sensitive).  */
bool
po_is_charset_weird (const char *canon_charset)
{
  /* NULL-terminated list of the encodings concerned.  */
  static const char *const weird_names[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB",
    NULL
  };
  const char *const *entry;

  for (entry = weird_names; *entry != NULL; entry++)
    {
      if (strcmp (canon_charset, *entry) == 0)
        return true;
    }
  return false;
}
148
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is a canonical encoding name; the comparison is exact
   (case-sensitive).  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *const cjk_names[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  size_t remaining = sizeof cjk_names / sizeof cjk_names[0];

  /* The order of the comparisons does not matter: the names are distinct.  */
  while (remaining-- > 0)
    {
      if (strcmp (canon_charset, cjk_names[remaining]) == 0)
        return true;
    }
  return false;
}
171
/* Hardcoded iterator functions for all kinds of encodings.
   We could also implement a general iterator function with iconv(),
   but we need a fast one.  */
175
/* Character iterator for 8-bit encodings: every character occupies exactly
   one byte, so the contents of S are never inspected.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}
182
/* Character iterator for GB2312 (see libiconv/lib/euc_cn.h) and for
   EUC-KR (see libiconv/lib/euc_kr.h).
   Returns 2 if S starts with two bytes that are both in 0xA1..0xFE,
   otherwise 1.  The second byte is only read when the first one
   qualifies.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;

  if (bytes[0] < 0xa1 || bytes[0] > 0xfe)
    return 1;
  if (bytes[1] < 0xa1 || bytes[1] > 0xfe)
    return 1;
  return 2;
}
197
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized forms:
     - lead 0xA1..0xFE followed by 0xA1..0xFE              -> 2 bytes
     - lead 0x8E followed by 0xA1..0xDF                    -> 2 bytes
     - lead 0x8F followed by two bytes in 0xA1..0xFE       -> 3 bytes
   Anything else counts as a single byte.  A byte is only read after all
   preceding bytes have been accepted.  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];

  if (lead == 0x8e)
    return (bytes[1] >= 0xa1 && bytes[1] <= 0xdf) ? 2 : 1;

  if (lead == 0x8f)
    {
      if (bytes[1] >= 0xa1 && bytes[1] <= 0xfe
          && bytes[2] >= 0xa1 && bytes[2] <= 0xfe)
        return 3;
      return 1;
    }

  if (lead >= 0xa1 && lead <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;

  return 1;
}
227
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized forms:
     - lead 0xA1..0xFE followed by 0xA1..0xFE                      -> 2 bytes
     - lead 0x8E, then 0xA1..0xB0, then two bytes in 0xA1..0xFE    -> 4 bytes
   Anything else counts as a single byte.  A byte is only read after all
   preceding bytes have been accepted.  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;

  if (bytes[0] == 0x8e)
    {
      if (bytes[1] >= 0xa1 && bytes[1] <= 0xb0
          && bytes[2] >= 0xa1 && bytes[2] <= 0xfe
          && bytes[3] >= 0xa1 && bytes[3] <= 0xfe)
        return 4;
      return 1;
    }

  if (bytes[0] >= 0xa1 && bytes[0] <= 0xfe
      && bytes[1] >= 0xa1 && bytes[1] <= 0xfe)
    return 2;

  return 1;
}
255
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 if S starts with a lead byte in 0xA1..0xFE followed by a trail
   byte in 0x40..0x7E or 0xA1..0xFE, otherwise 1.  The trail byte is only
   read when the lead byte qualifies.  */
static size_t
big5_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0xa1 || bytes[0] > 0xfe)
    return 1;

  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
269
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 if S starts with a lead byte in 0x88..0xFE followed by a trail
   byte in 0x40..0x7E or 0xA1..0xFE, otherwise 1.  The trail byte is only
   read when the lead byte qualifies.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0x88 || bytes[0] > 0xfe)
    return 1;

  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0xa1 && trail <= 0xfe)
    return 2;
  return 1;
}
283
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 if S starts with a lead byte in 0x81..0xFE followed by a trail
   byte in 0x40..0xFE other than 0x7F, otherwise 1.  The trail byte is only
   read when the lead byte qualifies.  */
static size_t
gbk_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char trail;

  if (bytes[0] < 0x81 || bytes[0] > 0xfe)
    return 1;

  trail = bytes[1];
  if (trail < 0x40 || trail == 0x7f || trail == 0xff)
    return 1;
  return 2;
}
298
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized forms, for a lead byte in 0x81..0xFE:
     - trail byte in 0x40..0x7E or 0x80..0xFE                      -> 2 bytes
     - 0x30..0x39, then 0x81..0xFE, then 0x30..0x39                -> 4 bytes
   Anything else counts as a single byte.
   The four-byte form is accepted for every lead byte in 0x81..0xFE, not
   only 0x81..0x84, so that characters outside the BMP (lead bytes
   0x90..0xE3, e.g. 90 30 81 30 = U+10000) are not split into single bytes.
   A byte is only read after all preceding bytes have been accepted, so the
   function never reads past a terminating NUL.  */
static size_t
gb18030_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      /* Two-byte form.  */
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
      /* Four-byte form: a digit in second and fourth position.  */
      if (c2 >= 0x30 && c2 <= 0x39)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0x81 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0x30 && c4 <= 0x39)
                return 4;
            }
        }
    }
  return 1;
}
326
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 if S starts with a lead byte in 0x81..0x9F or 0xE0..0xF9
   followed by a trail byte in 0x40..0x7E or 0x80..0xFC, otherwise 1.
   The trail byte is only read when the lead byte qualifies.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];
  unsigned char trail;

  if (!((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9)))
    return 1;

  trail = bytes[1];
  if (trail >= 0x40 && trail <= 0x7e)
    return 2;
  if (trail >= 0x80 && trail <= 0xfc)
    return 2;
  return 1;
}
340
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized two-byte forms:
     - lead 0x84..0xD3, trail in 0x41..0x7E or 0x81..0xFE
     - lead 0xD9..0xF9, trail in 0x31..0x7E or 0x91..0xFE
   Anything else counts as a single byte.  The trail byte is only read when
   the lead byte qualifies.  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];

  if (lead >= 0x84 && lead <= 0xd3)
    {
      unsigned char trail = bytes[1];

      if ((trail >= 0x41 && trail <= 0x7e) || (trail >= 0x81 && trail <= 0xfe))
        return 2;
      return 1;
    }

  if (lead >= 0xd9 && lead <= 0xf9)
    {
      unsigned char trail = bytes[1];

      if ((trail >= 0x31 && trail <= 0x7e) || (trail >= 0x91 && trail <= 0xfe))
        return 2;
      return 1;
    }

  return 1;
}
361
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte determines the expected length: 0xC2..0xDF -> 2,
   0xE0..0xEF -> 3, 0xF0..0xF7 -> 4.  That length is returned only if all
   following bytes of the sequence are continuation bytes (0x80..0xBF);
   in every other case the result is 1.  Continuation bytes are examined
   in order and the scan stops at the first one that does not qualify.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned char lead = bytes[0];
  size_t expected;
  size_t k;

  if (lead < 0xc2)
    return 1;
  else if (lead < 0xe0)
    expected = 2;
  else if (lead < 0xf0)
    expected = 3;
  else if (lead < 0xf8)
    expected = 4;
  else
    return 1;

  for (k = 1; k < expected; k++)
    {
      if (bytes[k] < 0x80 || bytes[k] >= 0xc0)
        return 1;
    }
  return expected;
}
402
403/* Returns a character iterator for a given encoding.
404   Given a pointer into a string, it returns the number occupied by the next
405   single character.  If the piece of string is not valid or if the *s == '\0',
406   it returns 1.  */
407character_iterator_t
408po_charset_character_iterator (const char *canon_charset)
409{
410  if (canon_charset == utf8)
411    return utf8_character_iterator;
412  if (strcmp (canon_charset, "GB2312") == 0
413      || strcmp (canon_charset, "EUC-KR") == 0)
414    return euc_character_iterator;
415  if (strcmp (canon_charset, "EUC-JP") == 0)
416    return euc_jp_character_iterator;
417  if (strcmp (canon_charset, "EUC-TW") == 0)
418    return euc_tw_character_iterator;
419  if (strcmp (canon_charset, "BIG5") == 0)
420    return big5_character_iterator;
421  if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
422    return big5hkscs_character_iterator;
423  if (strcmp (canon_charset, "GBK") == 0)
424    return gbk_character_iterator;
425  if (strcmp (canon_charset, "GB18030") == 0)
426    return gb18030_character_iterator;
427  if (strcmp (canon_charset, "SHIFT_JIS") == 0)
428    return shift_jis_character_iterator;
429  if (strcmp (canon_charset, "JOHAB") == 0)
430    return johab_character_iterator;
431  return char_iterator;
432}
433
434
/* The PO file's encoding, as specified in the header entry.  NULL as long
   as no encoding has been established.  */
const char *po_lex_charset;

/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8, or (iconv_t)(-1) if there
   is none.  */
iconv_t po_lex_iconv;
#endif
445
446void
447po_lex_charset_init ()
448{
449  po_lex_charset = NULL;
450#if HAVE_ICONV
451  po_lex_iconv = (iconv_t)(-1);
452#endif
453  po_lex_weird_cjk = false;
454}
455
456void
457po_lex_charset_set (const char *header_entry, const char *filename)
458{
459  /* Verify the validity of CHARSET.  It is necessary
460     1. for the correct treatment of multibyte characters containing
461	0x5C bytes in the PO lexer,
462     2. so that at run time, gettext() can call iconv() to convert
463	msgstr.  */
464  const char *charsetstr = c_strstr (header_entry, "charset=");
465
466  if (charsetstr != NULL)
467    {
468      size_t len;
469      char *charset;
470      const char *canon_charset;
471
472      charsetstr += strlen ("charset=");
473      len = strcspn (charsetstr, " \t\n");
474      charset = (char *) xallocsa (len + 1);
475      memcpy (charset, charsetstr, len);
476      charset[len] = '\0';
477
478      canon_charset = po_charset_canonicalize (charset);
479      if (canon_charset == NULL)
480	{
481	  /* Don't warn for POT files, because POT files usually contain
482	     only ASCII msgids.  */
483	  size_t filenamelen = strlen (filename);
484
485	  if (!(filenamelen >= 4
486		&& memcmp (filename + filenamelen - 4, ".pot", 4) == 0
487		&& strcmp (charset, "CHARSET") == 0))
488	    {
489	      char *warning_message =
490		xasprintf (_("\
491Charset \"%s\" is not a portable encoding name.\n\
492Message conversion to user's charset might not work.\n"),
493			   charset);
494	      po_xerror (PO_SEVERITY_WARNING, NULL,
495			 filename, (size_t)(-1), (size_t)(-1), true,
496			 warning_message);
497	      free (warning_message);
498	    }
499	}
500      else
501	{
502	  const char *envval;
503
504	  po_lex_charset = canon_charset;
505#if HAVE_ICONV
506	  if (po_lex_iconv != (iconv_t)(-1))
507	    iconv_close (po_lex_iconv);
508#endif
509
510	  /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
511	     don't know about multibyte encodings, and require a spurious
512	     backslash after every multibyte character whose last byte is
513	     0x5C.  Some programs, like vim, distribute PO files in this
514	     broken format.  GNU msgfmt must continue to support this old
515	     PO file format when the Makefile requests it.  */
516	  envval = getenv ("OLD_PO_FILE_INPUT");
517	  if (envval != NULL && *envval != '\0')
518	    {
519	      /* Assume the PO file is in old format, with extraneous
520		 backslashes.  */
521#if HAVE_ICONV
522	      po_lex_iconv = (iconv_t)(-1);
523#endif
524	      po_lex_weird_cjk = false;
525	    }
526	  else
527	    {
528	      /* Use iconv() to parse multibyte characters.  */
529#if HAVE_ICONV
530	      /* Avoid glibc-2.1 bug with EUC-KR.  */
531# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
532	      if (strcmp (po_lex_charset, "EUC-KR") == 0)
533		po_lex_iconv = (iconv_t)(-1);
534	      else
535# endif
536	      /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
537		 GBK, GB18030.  */
538# if defined __sun && !defined _LIBICONV_VERSION
539	      if (   strcmp (po_lex_charset, "GB2312") == 0
540		  || strcmp (po_lex_charset, "EUC-TW") == 0
541		  || strcmp (po_lex_charset, "BIG5") == 0
542		  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
543		  || strcmp (po_lex_charset, "GBK") == 0
544		  || strcmp (po_lex_charset, "GB18030") == 0)
545		po_lex_iconv = (iconv_t)(-1);
546	      else
547# endif
548	      po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
549	      if (po_lex_iconv == (iconv_t)(-1))
550		{
551		  char *warning_message;
552		  const char *recommendation;
553		  const char *note;
554		  char *whole_message;
555
556		  warning_message =
557		    xasprintf (_("\
558Charset \"%s\" is not supported. %s relies on iconv(),\n\
559and iconv() does not support \"%s\".\n"),
560			       po_lex_charset, basename (program_name),
561			       po_lex_charset);
562
563# if !defined _LIBICONV_VERSION
564		  recommendation = _("\
565Installing GNU libiconv and then reinstalling GNU gettext\n\
566would fix this problem.\n");
567# else
568		  recommendation = "";
569# endif
570
571		  /* Test for a charset which has double-byte characters
572		     ending in 0x5C.  For these encodings, the string parser
573		     is likely to be confused if it can't see the character
574		     boundaries.  */
575		  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
576		  if (po_is_charset_weird (po_lex_charset)
577		      && !po_lex_weird_cjk)
578		    note = _("Continuing anyway, expect parse errors.");
579		  else
580		    note = _("Continuing anyway.");
581
582		  whole_message =
583		    xasprintf ("%s%s%s\n",
584			       warning_message, recommendation, note);
585
586		  po_xerror (PO_SEVERITY_WARNING, NULL,
587			     filename, (size_t)(-1), (size_t)(-1), true,
588			     whole_message);
589
590		  free (whole_message);
591		  free (warning_message);
592		}
593#else
594	      /* Test for a charset which has double-byte characters
595		 ending in 0x5C.  For these encodings, the string parser
596		 is likely to be confused if it can't see the character
597		 boundaries.  */
598	      po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
599	      if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
600		{
601		  char *warning_message;
602		  const char *recommendation;
603		  const char *note;
604		  char *whole_message;
605
606		  warning_message =
607		    xasprintf (_("\
608Charset \"%s\" is not supported. %s relies on iconv().\n\
609This version was built without iconv().\n"),
610			       po_lex_charset, basename (program_name));
611
612		  recommendation = _("\
613Installing GNU libiconv and then reinstalling GNU gettext\n\
614would fix this problem.\n");
615
616		  note = _("Continuing anyway, expect parse errors.");
617
618		  whole_message =
619		    xasprintf ("%s%s%s\n",
620			       warning_message, recommendation, note);
621
622		  po_xerror (PO_SEVERITY_WARNING, NULL,
623			     filename, (size_t)(-1), (size_t)(-1), true,
624			     whole_message);
625
626		  free (whole_message);
627		  free (warning_message);
628		}
629#endif
630	    }
631	}
632      freesa (charset);
633    }
634  else
635    {
636      /* Don't warn for POT files, because POT files usually contain
637	 only ASCII msgids.  */
638      size_t filenamelen = strlen (filename);
639
640      if (!(filenamelen >= 4
641	    && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
642	po_xerror (PO_SEVERITY_WARNING,
643		   NULL, filename, (size_t)(-1), (size_t)(-1), true,
644		   _("\
645Charset missing in header.\n\
646Message conversion to user's charset will not work.\n"));
647    }
648}
649
650void
651po_lex_charset_close ()
652{
653  po_lex_charset = NULL;
654#if HAVE_ICONV
655  if (po_lex_iconv != (iconv_t)(-1))
656    {
657      iconv_close (po_lex_iconv);
658      po_lex_iconv = (iconv_t)(-1);
659    }
660#endif
661  po_lex_weird_cjk = false;
662}
663