1/* Writing Qt .qm files. 2 Copyright (C) 2003, 2005 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 18 19#ifdef HAVE_CONFIG_H 20# include <config.h> 21#endif 22 23/* Specification. */ 24#include "write-qt.h" 25 26#include <assert.h> 27#include <errno.h> 28#include <stdbool.h> 29#include <stdio.h> 30#include <stdlib.h> 31#include <string.h> 32 33#include "error.h" 34#include "xerror.h" 35#include "message.h" 36#include "po-charset.h" 37#include "msgl-iconv.h" 38#include "hash-string.h" 39#include "utf8-ucs4.h" 40#include "xalloc.h" 41#include "obstack.h" 42#include "binary-io.h" 43#include "fwriteerror.h" 44#include "exit.h" 45#include "gettext.h" 46 47#define _(str) gettext (str) 48 49/* Qt .qm files are read by the QTranslator::load() function and written 50 by the Qt QTranslator::save() function. 51 52 The Qt tool 'msg2qm' uses the latter function and can convert PO files 53 to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's 54 i18n.html documentation and therefore likely to disappear, we provide the 55 same functionality here. 56 57 The format of .qm files, as reverse engineered from the functions 58 QTranslator::save(const QString& filename, SaveMode mode) 59 QTranslator::squeeze(SaveMode mode) 60 QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix) 61 elfHash(const char* name) 62 in qt-3.0.5, is as follows: 63 64 It's a binary data format. Elements are u8 (byte), u16, u32. They are 65 written in big-endian order. 66 67 The file starts with a magic string of 16 bytes: 68 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD 69 70 Then come three sections. Each of the three sections is optional. Each 71 has this structure: 72 struct { 73 u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts 74 u32 length; // number of bytes of the data 75 u8 data[length]; 76 }; 77 78 In the first section, the hashes section, the data has the following 79 structure: 80 It's a sorted array of 81 struct { 82 u32 hashcode; // elfHash of the concatenation of msgid and 83 // disambiguating-comment 84 u32 offset; // offset within the data[] of the messages section 85 }; 86 It's sorted in ascending order by hashcode as primary sorting criteria 87 and - when the hashcodes are the same - by offset as secondary criteria. 88 89 In the second section, the messages section, the data has the following 90 structure: 91 It's a sequence of records, each representing a message, in no 92 particular order. Each record is a sequence of subsections, each 93 introduced by a particular subsection tag. The possible subsection tags 94 are (and they usually occur in this order): 95 - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format: 96 struct { 97 u32 length; 98 u16 chars[length/2]; 99 }; 100 - 08: Disambiguating-comment. Followed by the NUL-terminated, 101 ISO-8859-1 encoded, disambiguating-comment string: 102 struct { 103 u32 length; // number of bytes including the NUL at the end 104 u8 chars[length]; 105 }; 106 - 06: SourceText, i.e. msgid. Followed by the NUL-terminated, 107 ISO-8859-1 encoded, msgid: 108 struct { 109 u32 length; // number of bytes including the NUL at the end 110 u8 chars[length]; 111 }; 112 - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually 113 be ISO-8859-1. 114 struct { 115 u32 length; 116 u16 chars[length/2]; 117 }; 118 This subsection tag is obsoleted by SourceText. 119 - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded, 120 context string (usually a C++ class name or empty): 121 struct { 122 u32 length; // number of bytes including the NUL at the end 123 u8 chars[length]; 124 }; 125 - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1. 126 struct { 127 u32 length; 128 u16 chars[length/2]; 129 }; 130 This subsection tag is obsoleted by Context. 131 - 05: Hash. Followed by 132 struct { 133 u32 hashcode; // elfHash of the concatenation of msgid and 134 // disambiguating-comment 135 }; 136 - 01: End. Designates the end of the record. No further data. 137 Usually the following subsections are written, but some of them are 138 optional: 139 - 03: Translation. 140 - 08: Disambiguating-comment (optional). 141 - 06: SourceText (optional). 142 - 07: Context (optional). 143 - 05: Hash. 144 - 01: End. 145 A subsection can be omitted if the value to be output is the same as 146 for the previous record. 147 148 In the third section, the contexts section, the data contains a hash 149 table. Quite complicated. 150 151 The elfHash function is the same as our hash_string function, except that 152 at the end it maps a hash code of 0x00000000 to 0x00000001. 153 154 When we convert from PO file format, all disambiguating-comments and 155 contexts are empty, and therefore the contexts section can be omitted. */ 156 157 158/* Write a u8 (a single byte) to the output stream. */ 159static inline void 160write_u8 (FILE *output_file, unsigned char value) 161{ 162 putc (value, output_file); 163} 164 165/* Write a u16 (two bytes) to the output stream. */ 166static inline void 167write_u16 (FILE *output_file, unsigned short value) 168{ 169 unsigned char data[2]; 170 171 data[0] = (value >> 8) & 0xff; 172 data[1] = value & 0xff; 173 174 fwrite (data, 2, 1, output_file); 175} 176 177/* Write a u32 (four bytes) to the output stream. */ 178static inline void 179write_u32 (FILE *output_file, unsigned int value) 180{ 181 unsigned char data[4]; 182 183 data[0] = (value >> 24) & 0xff; 184 data[1] = (value >> 16) & 0xff; 185 data[2] = (value >> 8) & 0xff; 186 data[3] = value & 0xff; 187 188 fwrite (data, 4, 1, output_file); 189} 190 191 192#define obstack_chunk_alloc xmalloc 193#define obstack_chunk_free free 194 195/* Add a u8 (a single byte) to an obstack. */ 196static void 197append_u8 (struct obstack *mempool, unsigned char value) 198{ 199 unsigned char data[1]; 200 201 data[0] = value; 202 203 obstack_grow (mempool, data, 1); 204} 205 206/* Add a u16 (two bytes) to an obstack. */ 207static void 208append_u16 (struct obstack *mempool, unsigned short value) 209{ 210 unsigned char data[2]; 211 212 data[0] = (value >> 8) & 0xff; 213 data[1] = value & 0xff; 214 215 obstack_grow (mempool, data, 2); 216} 217 218/* Add a u32 (four bytes) to an obstack. */ 219static void 220append_u32 (struct obstack *mempool, unsigned int value) 221{ 222 unsigned char data[4]; 223 224 data[0] = (value >> 24) & 0xff; 225 data[1] = (value >> 16) & 0xff; 226 data[2] = (value >> 8) & 0xff; 227 data[3] = value & 0xff; 228 229 obstack_grow (mempool, data, 4); 230} 231 232/* Add an ISO-8859-1 encoded string to an obstack. */ 233static void 234append_base_string (struct obstack *mempool, const char *string) 235{ 236 size_t length = strlen (string) + 1; 237 append_u32 (mempool, length); 238 obstack_grow (mempool, string, length); 239} 240 241/* Add an UTF-16 encoded string to an obstack. */ 242static void 243append_unicode_string (struct obstack *mempool, const unsigned short *string, 244 size_t length) 245{ 246 append_u32 (mempool, length * 2); 247 for (; length > 0; string++, length--) 248 append_u16 (mempool, *string); 249} 250 251/* Retrieve a 4-byte integer from memory. */ 252static inline unsigned int 253peek_u32 (const unsigned char *p) 254{ 255 return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; 256} 257 258/* Convert an UTF-8 string to ISO-8859-1, without error checking. */ 259static char * 260conv_to_iso_8859_1 (const char *string) 261{ 262 size_t length = strlen (string); 263 const char *str = string; 264 const char *str_limit = string + length; 265 /* Conversion to ISO-8859-1 can only reduce the number of bytes. */ 266 char *result = (char *) xmalloc (length + 1); 267 char *q = result; 268 269 while (str < str_limit) 270 { 271 unsigned int uc; 272 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); 273 /* It has already been verified that the string its in ISO-8859-1. */ 274 if (!(uc < 0x100)) 275 abort (); 276 /* Store as ISO-8859-1. */ 277 *q++ = (unsigned char) uc; 278 } 279 *q = '\0'; 280 assert (q - result <= length); 281 282 return result; 283} 284 285/* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16 286 codepoints) in *SIZEP. */ 287static unsigned short * 288conv_to_utf16 (const char *string, size_t *sizep) 289{ 290 size_t length = strlen (string); 291 const char *str = string; 292 const char *str_limit = string + length; 293 /* Conversion to UTF-16 can at most double the number of bytes. */ 294 unsigned short *result = (unsigned short *) xmalloc (2 * length); 295 unsigned short *q = result; 296 297 while (str < str_limit) 298 { 299 unsigned int uc; 300 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); 301 if (uc < 0x10000) 302 /* UCS-2 character. */ 303 *q++ = (unsigned short) uc; 304 else 305 { 306 /* UTF-16 surrogate. */ 307 *q++ = 0xd800 + ((uc - 0x10000) >> 10); 308 *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff); 309 } 310 } 311 assert (q - result <= 2 * length); 312 313 *sizep = q - result; 314 return result; 315} 316 317/* Return the Qt hash code of a string. */ 318static unsigned int 319string_hashcode (const char *str) 320{ 321 unsigned int h; 322 323 h = hash_string (str); 324 if (h == 0) 325 h = 1; 326 return h; 327} 328 329/* Compare two entries of the hashes section. */ 330static int 331cmp_hashes (const void *va, const void *vb) 332{ 333 const unsigned char *a = (const unsigned char *) va; 334 const unsigned char *b = (const unsigned char *) vb; 335 unsigned int a_hashcode = peek_u32 (a); 336 unsigned int b_hashcode = peek_u32 (b); 337 338 if (a_hashcode != b_hashcode) 339 return (a_hashcode >= b_hashcode ? 1 : -1); 340 else 341 { 342 unsigned int a_offset = peek_u32 (a + 4); 343 unsigned int b_offset = peek_u32 (b + 4); 344 345 if (a_offset != b_offset) 346 return (a_offset >= b_offset ? 1 : -1); 347 else 348 return 0; 349 } 350} 351 352 353/* Write a section to the output stream. */ 354static void 355write_section (FILE *output_file, unsigned char tag, void *data, size_t size) 356{ 357 /* A section can be omitted if it is empty. */ 358 if (size > 0) 359 { 360 write_u8 (output_file, tag); 361 write_u32 (output_file, size); 362 fwrite (data, size, 1, output_file); 363 } 364} 365 366 367/* Write an entire .qm file. */ 368static void 369write_qm (FILE *output_file, message_list_ty *mlp) 370{ 371 static unsigned char magic[16] = 372 { 373 0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95, 374 0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD 375 }; 376 struct obstack hashes_pool; 377 struct obstack messages_pool; 378 size_t j; 379 380 obstack_init (&hashes_pool); 381 obstack_init (&messages_pool); 382 383 /* Prepare the hashes section and the messages section. */ 384 for (j = 0; j < mlp->nitems; j++) 385 { 386 message_ty *mp = mlp->item[j]; 387 388 /* No need to emit the header entry, it's not needed at runtime. */ 389 if (mp->msgid[0] != '\0') 390 { 391 char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid); 392 size_t msgstr_len; 393 unsigned short *msgstr_as_utf16 = 394 conv_to_utf16 (mp->msgstr, &msgstr_len); 395 unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1); 396 unsigned int offset = obstack_object_size (&messages_pool); 397 398 /* Add a record to the hashes section. */ 399 append_u32 (&hashes_pool, hashcode); 400 append_u32 (&hashes_pool, offset); 401 402 /* Add a record to the messages section. */ 403 404 append_u8 (&messages_pool, 0x03); 405 append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len); 406 407 append_u8 (&messages_pool, 0x08); 408 append_base_string (&messages_pool, ""); 409 410 append_u8 (&messages_pool, 0x06); 411 append_base_string (&messages_pool, msgid_as_iso_8859_1); 412 413 append_u8 (&messages_pool, 0x07); 414 append_base_string (&messages_pool, ""); 415 416 append_u8 (&messages_pool, 0x05); 417 append_u32 (&messages_pool, hashcode); 418 419 append_u8 (&messages_pool, 0x01); 420 421 free (msgstr_as_utf16); 422 free (msgid_as_iso_8859_1); 423 } 424 } 425 426 /* Sort the hashes section. */ 427 { 428 size_t nstrings = obstack_object_size (&hashes_pool) / 8; 429 if (nstrings > 0) 430 qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes); 431 } 432 433 /* Write the magic number. */ 434 fwrite (magic, sizeof (magic), 1, output_file); 435 436 /* Write the hashes section. */ 437 write_section (output_file, 0x42, obstack_base (&hashes_pool), 438 obstack_object_size (&hashes_pool)); 439 440 /* Write the messages section. */ 441 write_section (output_file, 0x69, obstack_base (&messages_pool), 442 obstack_object_size (&messages_pool)); 443 444 /* Omit the contexts section. */ 445#if 0 446 write_section (output_file, 0x2f, ...); 447#endif 448 449 obstack_free (&messages_pool, NULL); 450 obstack_free (&hashes_pool, NULL); 451} 452 453 454int 455msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding, 456 const char *domain_name, const char *file_name) 457{ 458 FILE *output_file; 459 460 /* If no entry for this domain don't even create the file. */ 461 if (mlp->nitems != 0) 462 { 463 /* Determine whether mlp has plural entries. */ 464 { 465 bool has_plural; 466 size_t j; 467 468 has_plural = false; 469 for (j = 0; j < mlp->nitems; j++) 470 if (mlp->item[j]->msgid_plural != NULL) 471 has_plural = true; 472 if (has_plural) 473 { 474 multiline_error (xstrdup (""), 475 xstrdup (_("\ 476message catalog has plural form translations\n\ 477but the Qt message catalog format doesn't support plural handling\n"))); 478 return 1; 479 } 480 } 481 482 /* Convert the messages to Unicode. */ 483 iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL); 484 485 /* Determine whether mlp has non-ISO-8859-1 msgid entries. */ 486 { 487 size_t j; 488 489 for (j = 0; j < mlp->nitems; j++) 490 { 491 const char *string = mlp->item[j]->msgid; 492 493 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all 494 its bytes are < 0xc4. */ 495 for (; *string; string++) 496 if ((unsigned char) *string >= 0xc4) 497 { 498 multiline_error (xstrdup (""), 499 xstrdup (_("\ 500message catalog has msgid strings containing characters outside ISO-8859-1\n\ 501but the Qt message catalog format supports Unicode only in the translated\n\ 502strings, not in the untranslated strings\n"))); 503 return 1; 504 } 505 } 506 } 507 508 if (strcmp (domain_name, "-") == 0) 509 { 510 output_file = stdout; 511 SET_BINARY (fileno (output_file)); 512 } 513 else 514 { 515 output_file = fopen (file_name, "wb"); 516 if (output_file == NULL) 517 { 518 error (0, errno, _("error while opening \"%s\" for writing"), 519 file_name); 520 return 1; 521 } 522 } 523 524 if (output_file != NULL) 525 { 526 write_qm (output_file, mlp); 527 528 /* Make sure nothing went wrong. */ 529 if (fwriteerror (output_file)) 530 error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"), 531 file_name); 532 } 533 } 534 535 return 0; 536} 537