1/* Writing Qt .qm files.
2   Copyright (C) 2003, 2005 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program; if not, write to the Free Software Foundation,
17   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
18
19#ifdef HAVE_CONFIG_H
20# include <config.h>
21#endif
22
23/* Specification.  */
24#include "write-qt.h"
25
26#include <assert.h>
27#include <errno.h>
28#include <stdbool.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33#include "error.h"
34#include "xerror.h"
35#include "message.h"
36#include "po-charset.h"
37#include "msgl-iconv.h"
38#include "hash-string.h"
39#include "utf8-ucs4.h"
40#include "xalloc.h"
41#include "obstack.h"
42#include "binary-io.h"
43#include "fwriteerror.h"
44#include "exit.h"
45#include "gettext.h"
46
47#define _(str) gettext (str)
48
49/* Qt .qm files are read by the QTranslator::load() function and written
50   by the Qt QTranslator::save() function.
51
52   The Qt tool 'msg2qm' uses the latter function and can convert PO files
53   to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
54   i18n.html documentation and therefore likely to disappear, we provide the
55   same functionality here.
56
57   The format of .qm files, as reverse engineered from the functions
58     QTranslator::save(const QString& filename, SaveMode mode)
59     QTranslator::squeeze(SaveMode mode)
60     QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
61     elfHash(const char* name)
62   in qt-3.0.5, is as follows:
63
64     It's a binary data format. Elements are u8 (byte), u16, u32. They are
65     written in big-endian order.
66
67     The file starts with a magic string of 16 bytes:
68       3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD
69
70     Then come three sections. Each of the three sections is optional. Each
71     has this structure:
72       struct {
73         u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
74         u32 length; // number of bytes of the data
75         u8 data[length];
76       };
77
78     In the first section, the hashes section, the data has the following
79     structure:
80       It's a sorted array of
81         struct {
82           u32 hashcode; // elfHash of the concatenation of msgid and
83                         // disambiguating-comment
84           u32 offset; // offset within the data[] of the messages section
85         };
       It's sorted in ascending order by hashcode as primary sorting criterion
       and - when the hashcodes are the same - by offset as secondary criterion.
88
89     In the second section, the messages section, the data has the following
90     structure:
91       It's a sequence of records, each representing a message, in no
92       particular order. Each record is a sequence of subsections, each
93       introduced by a particular subsection tag. The possible subsection tags
94       are (and they usually occur in this order):
95         - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format:
96               struct {
97                 u32 length;
98                 u16 chars[length/2];
99               };
100         - 08: Disambiguating-comment. Followed by the NUL-terminated,
101               ISO-8859-1 encoded, disambiguating-comment string:
102               struct {
103                 u32 length;    // number of bytes including the NUL at the end
104                 u8 chars[length];
105               };
106         - 06: SourceText, i.e. msgid. Followed by the NUL-terminated,
107               ISO-8859-1 encoded, msgid:
108               struct {
109                 u32 length;    // number of bytes including the NUL at the end
110                 u8 chars[length];
111               };
112         - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually
113               be ISO-8859-1.
114               struct {
115                 u32 length;
116                 u16 chars[length/2];
117               };
118               This subsection tag is obsoleted by SourceText.
119         - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded,
120               context string (usually a C++ class name or empty):
121               struct {
122                 u32 length;    // number of bytes including the NUL at the end
123                 u8 chars[length];
124               };
125         - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1.
126               struct {
127                 u32 length;
128                 u16 chars[length/2];
129               };
130               This subsection tag is obsoleted by Context.
131         - 05: Hash. Followed by
132               struct {
133                 u32 hashcode; // elfHash of the concatenation of msgid and
134                               // disambiguating-comment
135               };
136         - 01: End. Designates the end of the record. No further data.
137       Usually the following subsections are written, but some of them are
138       optional:
139         - 03: Translation.
140         - 08: Disambiguating-comment (optional).
141         - 06: SourceText (optional).
142         - 07: Context (optional).
143         - 05: Hash.
144         - 01: End.
145       A subsection can be omitted if the value to be output is the same as
146       for the previous record.
147
148     In the third section, the contexts section, the data contains a hash
149     table. Quite complicated.
150
151     The elfHash function is the same as our hash_string function, except that
152     at the end it maps a hash code of 0x00000000 to 0x00000001.
153
154   When we convert from PO file format, all disambiguating-comments and
155   contexts are empty, and therefore the contexts section can be omitted.  */
156
157
/* Emit VALUE as a single byte (u8) on OUTPUT_FILE.
   Write errors are not reported here; they stay recorded on the stream.  */
static inline void
write_u8 (FILE *output_file, unsigned char value)
{
  fputc (value, output_file);
}
164
/* Emit VALUE as a u16 on OUTPUT_FILE: two bytes, most significant first
   (the .qm format is big-endian).  */
static inline void
write_u16 (FILE *output_file, unsigned short value)
{
  unsigned char bytes[2];
  int i;

  /* Fill the buffer from its end, peeling off the low byte each round.  */
  for (i = 1; i >= 0; i--)
    {
      bytes[i] = value & 0xff;
      value >>= 8;
    }

  fwrite (bytes, 2, 1, output_file);
}
176
/* Emit VALUE as a u32 on OUTPUT_FILE: four bytes, most significant first
   (the .qm format is big-endian).  */
static inline void
write_u32 (FILE *output_file, unsigned int value)
{
  unsigned char bytes[4];
  int i;

  /* Fill the buffer from its end, peeling off the low byte each round.  */
  for (i = 3; i >= 0; i--)
    {
      bytes[i] = value & 0xff;
      value >>= 8;
    }

  fwrite (bytes, 4, 1, output_file);
}
190
191
192#define obstack_chunk_alloc xmalloc
193#define obstack_chunk_free free
194
/* Append VALUE as a single byte (u8) to the object currently being grown
   in MEMPOOL.  */
static void
append_u8 (struct obstack *mempool, unsigned char value)
{
  obstack_grow (mempool, &value, 1);
}
205
/* Append VALUE as a u16 (two bytes, most significant first) to the object
   currently being grown in MEMPOOL.  */
static void
append_u16 (struct obstack *mempool, unsigned short value)
{
  unsigned char bytes[2];
  int i;

  /* Fill the buffer from its end, peeling off the low byte each round.  */
  for (i = 1; i >= 0; i--)
    {
      bytes[i] = value & 0xff;
      value >>= 8;
    }

  obstack_grow (mempool, bytes, 2);
}
217
/* Append VALUE as a u32 (four bytes, most significant first) to the object
   currently being grown in MEMPOOL.  */
static void
append_u32 (struct obstack *mempool, unsigned int value)
{
  unsigned char bytes[4];
  int i;

  /* Fill the buffer from its end, peeling off the low byte each round.  */
  for (i = 3; i >= 0; i--)
    {
      bytes[i] = value & 0xff;
      value >>= 8;
    }

  obstack_grow (mempool, bytes, 4);
}
231
/* Append the NUL-terminated, ISO-8859-1 encoded STRING to MEMPOOL in the
   .qm representation: a u32 byte count followed by the bytes themselves.
   Both the count and the data include the terminating NUL.  */
static void
append_base_string (struct obstack *mempool, const char *string)
{
  size_t nbytes = strlen (string) + 1;

  append_u32 (mempool, nbytes);
  obstack_grow (mempool, string, nbytes);
}
240
/* Append a UTF-16 encoded string to MEMPOOL in the .qm representation:
   a u32 byte count (twice the number of 16-bit units) followed by the
   units themselves, each written as a big-endian u16.
   STRING points to LENGTH 16-bit units; no terminator is written.  */
static void
append_unicode_string (struct obstack *mempool, const unsigned short *string,
                       size_t length)
{
  size_t i;

  append_u32 (mempool, length * 2);
  for (i = 0; i < length; i++)
    append_u16 (mempool, string[i]);
}
250
/* Retrieve a 4-byte big-endian integer from the memory at P.
   Each byte is converted to unsigned int before it is shifted: shifting
   the int-promoted byte directly would move a set top bit of P[0] into
   the sign bit, which is undefined behaviour.  */
static inline unsigned int
peek_u32 (const unsigned char *p)
{
  return ((unsigned int) p[0] << 24)
         | ((unsigned int) p[1] << 16)
         | ((unsigned int) p[2] << 8)
         | (unsigned int) p[3];
}
257
/* Convert the UTF-8 encoded STRING to ISO-8859-1, without error checking.
   Returns a freshly allocated, NUL-terminated string that the caller must
   free.  Aborts if a decoded character lies outside ISO-8859-1.  */
static char *
conv_to_iso_8859_1 (const char *string)
{
  size_t length = strlen (string);
  const unsigned char *in = (const unsigned char *) string;
  const unsigned char *in_end = in + length;
  /* Conversion to ISO-8859-1 can only reduce the number of bytes, so the
     input length (plus the terminator) is a sufficient buffer size.  */
  char *result = (char *) xmalloc (length + 1);
  char *out = result;

  while (in < in_end)
    {
      unsigned int uc;

      in += u8_mbtouc (&uc, in, in_end - in);
      /* It has already been verified that the string fits in
         ISO-8859-1.  */
      if (uc >= 0x100)
        abort ();
      /* Store as ISO-8859-1.  */
      *out = (unsigned char) uc;
      out++;
    }
  *out = '\0';
  assert (out - result <= length);

  return result;
}
284
/* Convert the UTF-8 encoded STRING to UTF-16.
   Returns a freshly allocated array of 16-bit code units that the caller
   must free, and stores the number of code units (not characters: a
   character outside the BMP occupies two units) in *SIZEP.  The result is
   not NUL-terminated.  */
static unsigned short *
conv_to_utf16 (const char *string, size_t *sizep)
{
  size_t length = strlen (string);
  const char *str = string;
  const char *str_limit = string + length;
  /* The buffer has room for LENGTH code units, i.e. one unit per input
     byte.  Conversion to UTF-16 can at most double the number of bytes.  */
  unsigned short *result =
    (unsigned short *) xmalloc (length * sizeof (unsigned short));
  unsigned short *q = result;

  while (str < str_limit)
    {
      unsigned int uc;

      str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
      if (uc < 0x10000)
        /* UCS-2 character.  */
        *q++ = (unsigned short) uc;
      else
        {
          /* UTF-16 surrogate pair: high surrogate first, then low.  */
          *q++ = 0xd800 + ((uc - 0x10000) >> 10);
          *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff);
        }
    }
  /* Check against the real capacity of the buffer, which is LENGTH units
     (not 2 * LENGTH).  */
  assert ((size_t) (q - result) <= length);

  *sizep = q - result;
  return result;
}
316
/* Return the Qt hash code of STR: the value of hash_string, except that
   a result of 0 is replaced by 1.  */
static unsigned int
string_hashcode (const char *str)
{
  unsigned int h = hash_string (str);

  return (h != 0 ? h : 1);
}
328
/* Compare two entries of the hashes section, for use with qsort.
   VA and VB each point to an 8-byte entry: a big-endian u32 hash code
   followed by a big-endian u32 offset.  Entries are ordered by hash code
   first and, for equal hash codes, by offset.  Returns -1, 0 or 1.  */
static int
cmp_hashes (const void *va, const void *vb)
{
  const unsigned char *entry_a = (const unsigned char *) va;
  const unsigned char *entry_b = (const unsigned char *) vb;
  unsigned int key_a = peek_u32 (entry_a);
  unsigned int key_b = peek_u32 (entry_b);

  if (key_a == key_b)
    {
      /* Same hash code: fall back to comparing the offsets.  */
      key_a = peek_u32 (entry_a + 4);
      key_b = peek_u32 (entry_b + 4);
    }

  if (key_a == key_b)
    return 0;
  return (key_a > key_b ? 1 : -1);
}
351
352
/* Write one section to OUTPUT_FILE: the one-byte TAG identifying the
   section type, the u32 SIZE, and then SIZE bytes taken from DATA.
   Nothing at all is written when SIZE is zero.  */
static void
write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
{
  /* A section can be omitted if it is empty.  */
  if (size == 0)
    return;

  write_u8 (output_file, tag);
  write_u32 (output_file, size);
  fwrite (data, size, 1, output_file);
}
365
366
367/* Write an entire .qm file.  */
368static void
369write_qm (FILE *output_file, message_list_ty *mlp)
370{
371  static unsigned char magic[16] =
372    {
373      0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
374      0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
375    };
376  struct obstack hashes_pool;
377  struct obstack messages_pool;
378  size_t j;
379
380  obstack_init (&hashes_pool);
381  obstack_init (&messages_pool);
382
383  /* Prepare the hashes section and the messages section.  */
384  for (j = 0; j < mlp->nitems; j++)
385    {
386      message_ty *mp = mlp->item[j];
387
388      /* No need to emit the header entry, it's not needed at runtime.  */
389      if (mp->msgid[0] != '\0')
390	{
391	  char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid);
392	  size_t msgstr_len;
393	  unsigned short *msgstr_as_utf16 =
394	    conv_to_utf16 (mp->msgstr, &msgstr_len);
395	  unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1);
396	  unsigned int offset = obstack_object_size (&messages_pool);
397
398	  /* Add a record to the hashes section.  */
399	  append_u32 (&hashes_pool, hashcode);
400	  append_u32 (&hashes_pool, offset);
401
402	  /* Add a record to the messages section.  */
403
404	  append_u8 (&messages_pool, 0x03);
405	  append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len);
406
407	  append_u8 (&messages_pool, 0x08);
408	  append_base_string (&messages_pool, "");
409
410	  append_u8 (&messages_pool, 0x06);
411	  append_base_string (&messages_pool, msgid_as_iso_8859_1);
412
413	  append_u8 (&messages_pool, 0x07);
414	  append_base_string (&messages_pool, "");
415
416	  append_u8 (&messages_pool, 0x05);
417	  append_u32 (&messages_pool, hashcode);
418
419	  append_u8 (&messages_pool, 0x01);
420
421	  free (msgstr_as_utf16);
422	  free (msgid_as_iso_8859_1);
423	}
424    }
425
426  /* Sort the hashes section.  */
427  {
428    size_t nstrings = obstack_object_size (&hashes_pool) / 8;
429    if (nstrings > 0)
430      qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes);
431  }
432
433  /* Write the magic number.  */
434  fwrite (magic, sizeof (magic), 1, output_file);
435
436  /* Write the hashes section.  */
437  write_section (output_file, 0x42, obstack_base (&hashes_pool),
438		 obstack_object_size (&hashes_pool));
439
440  /* Write the messages section.  */
441  write_section (output_file, 0x69, obstack_base (&messages_pool),
442		 obstack_object_size (&messages_pool));
443
444  /* Omit the contexts section.  */
445#if 0
446  write_section (output_file, 0x2f, ...);
447#endif
448
449  obstack_free (&messages_pool, NULL);
450  obstack_free (&hashes_pool, NULL);
451}
452
453
454int
455msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding,
456		    const char *domain_name, const char *file_name)
457{
458  FILE *output_file;
459
460  /* If no entry for this domain don't even create the file.  */
461  if (mlp->nitems != 0)
462    {
463      /* Determine whether mlp has plural entries.  */
464      {
465	bool has_plural;
466	size_t j;
467
468	has_plural = false;
469	for (j = 0; j < mlp->nitems; j++)
470	  if (mlp->item[j]->msgid_plural != NULL)
471	    has_plural = true;
472	if (has_plural)
473	  {
474	    multiline_error (xstrdup (""),
475			     xstrdup (_("\
476message catalog has plural form translations\n\
477but the Qt message catalog format doesn't support plural handling\n")));
478	    return 1;
479	  }
480      }
481
482      /* Convert the messages to Unicode.  */
483      iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL);
484
485      /* Determine whether mlp has non-ISO-8859-1 msgid entries.  */
486      {
487	size_t j;
488
489	for (j = 0; j < mlp->nitems; j++)
490	  {
491	    const char *string = mlp->item[j]->msgid;
492
493	    /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
494	       its bytes are < 0xc4.  */
495	    for (; *string; string++)
496	      if ((unsigned char) *string >= 0xc4)
497		{
498		  multiline_error (xstrdup (""),
499				   xstrdup (_("\
500message catalog has msgid strings containing characters outside ISO-8859-1\n\
501but the Qt message catalog format supports Unicode only in the translated\n\
502strings, not in the untranslated strings\n")));
503		  return 1;
504		}
505	  }
506      }
507
508      if (strcmp (domain_name, "-") == 0)
509	{
510	  output_file = stdout;
511	  SET_BINARY (fileno (output_file));
512	}
513      else
514	{
515	  output_file = fopen (file_name, "wb");
516	  if (output_file == NULL)
517	    {
518	      error (0, errno, _("error while opening \"%s\" for writing"),
519		     file_name);
520	      return 1;
521	    }
522	}
523
524      if (output_file != NULL)
525	{
526	  write_qm (output_file, mlp);
527
528	  /* Make sure nothing went wrong.  */
529	  if (fwriteerror (output_file))
530	    error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),
531		   file_name);
532	}
533    }
534
535  return 0;
536}
537