1/* Writing Qt .qm files. 2 Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#ifdef HAVE_CONFIG_H 19# include <config.h> 20#endif 21 22/* Specification. */ 23#include "write-qt.h" 24 25#include <assert.h> 26#include <errno.h> 27#include <stdbool.h> 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31 32#include "error.h" 33#include "xerror.h" 34#include "message.h" 35#include "po-charset.h" 36#include "msgl-iconv.h" 37#include "hash-string.h" 38#include "unistr.h" 39#include "xalloc.h" 40#include "obstack.h" 41#include "hash.h" 42#include "binary-io.h" 43#include "fwriteerror.h" 44#include "gettext.h" 45 46#define _(str) gettext (str) 47 48/* Qt .qm files are read by the QTranslator::load() function and written 49 by the Qt QTranslator::save() function. 50 51 The Qt tool 'msg2qm' uses the latter function and can convert PO files 52 to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's 53 i18n.html documentation and therefore likely to disappear, we provide the 54 same functionality here. 55 56 The format of .qm files, as reverse engineered from the functions 57 QTranslator::save(const QString& filename, SaveMode mode) 58 QTranslator::squeeze(SaveMode mode) 59 QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix) 60 elfHash(const char* name) 61 in qt-3.0.5, is as follows: 62 63 It's a binary data format. Elements are u8 (byte), u16, u32. They are 64 written in big-endian order. 65 66 The file starts with a magic string of 16 bytes: 67 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD 68 69 Then come three sections. Each of the three sections is optional. Each 70 has this structure: 71 struct { 72 u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts 73 u32 length; // number of bytes of the data 74 u8 data[length]; 75 }; 76 77 In the first section, the hashes section, the data has the following 78 structure: 79 It's a sorted array of 80 struct { 81 u32 hashcode; // elfHash of the concatenation of msgid and 82 // disambiguating-comment 83 u32 offset; // offset within the data[] of the messages section 84 }; 85 It's sorted in ascending order by hashcode as primary sorting criteria 86 and - when the hashcodes are the same - by offset as secondary criteria. 87 88 In the second section, the messages section, the data has the following 89 structure: 90 It's a sequence of records, each representing a message, in no 91 particular order. Each record is a sequence of subsections, each 92 introduced by a particular subsection tag. The possible subsection tags 93 are (and they usually occur in this order): 94 - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format: 95 struct { 96 u32 length; 97 u16 chars[length/2]; 98 }; 99 - 08: Disambiguating-comment. Followed by the NUL-terminated, 100 ISO-8859-1 encoded, disambiguating-comment string: 101 struct { 102 u32 length; // number of bytes including the NUL at the end 103 u8 chars[length]; 104 }; 105 - 06: SourceText, i.e. msgid. Followed by the NUL-terminated, 106 ISO-8859-1 encoded, msgid: 107 struct { 108 u32 length; // number of bytes including the NUL at the end 109 u8 chars[length]; 110 }; 111 - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually 112 be ISO-8859-1. 113 struct { 114 u32 length; 115 u16 chars[length/2]; 116 }; 117 This subsection tag is obsoleted by SourceText. 118 - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded, 119 context string (usually a C++ class name or empty): 120 struct { 121 u32 length; // number of bytes including the NUL at the end 122 u8 chars[length]; 123 }; 124 - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1. 125 struct { 126 u32 length; 127 u16 chars[length/2]; 128 }; 129 This subsection tag is obsoleted by Context. 130 - 05: Hash. Followed by 131 struct { 132 u32 hashcode; // elfHash of the concatenation of msgid and 133 // disambiguating-comment 134 }; 135 - 01: End. Designates the end of the record. No further data. 136 Usually the following subsections are written, but some of them are 137 optional: 138 - 03: Translation. 139 - 08: Disambiguating-comment (optional). 140 - 06: SourceText (optional). 141 - 07: Context (optional). 142 - 05: Hash. 143 - 01: End. 144 A subsection can be omitted if the value to be output is the same as 145 for the previous record. 146 147 The third section, the contexts section, contains the set of all occurring 148 context strings. This section is optional; it is used to speed up the 149 search. The data is a hash table with the following structure: 150 struct { 151 u16 table_size; 152 u16 buckets[table_size]; 153 u8 pool[...]; 154 }; 155 pool[...] contains: 156 u16 zero; 157 for i = 0, ..., table_size: 158 if there are context strings with elfHash(context)%table_size == i: 159 for all context strings with elfHash(context)%table_size == i: 160 len := min(length(context),255); // truncated to length 255 161 struct { 162 u8 len; 163 u8 chars[len]; 164 }; 165 struct { 166 u8 zero[1]; // signals the end of this bucket 167 u8 padding[0 or 1]; // padding for even number of bytes 168 }; 169 buckets[i] is 0 for an empty bucket, or the offset in pool[] where 170 the context strings for this bucket start, divided by 2. 171 This context section must not be used 172 - if the empty context is used, or 173 - if a context of length > 255 is used, or 174 - if the context pool's size would be > 2^17. 175 176 The elfHash function is the same as our hash_string function, except that 177 at the end it maps a hash code of 0x00000000 to 0x00000001. 178 179 When we convert from PO file format, all disambiguating-comments and 180 contexts are empty, and therefore the contexts section can be omitted. */ 181 182 183/* Write a u8 (a single byte) to the output stream. */ 184static inline void 185write_u8 (FILE *output_file, unsigned char value) 186{ 187 putc (value, output_file); 188} 189 190/* Write a u16 (two bytes) to the output stream. */ 191static inline void 192write_u16 (FILE *output_file, unsigned short value) 193{ 194 unsigned char data[2]; 195 196 data[0] = (value >> 8) & 0xff; 197 data[1] = value & 0xff; 198 199 fwrite (data, 2, 1, output_file); 200} 201 202/* Write a u32 (four bytes) to the output stream. */ 203static inline void 204write_u32 (FILE *output_file, unsigned int value) 205{ 206 unsigned char data[4]; 207 208 data[0] = (value >> 24) & 0xff; 209 data[1] = (value >> 16) & 0xff; 210 data[2] = (value >> 8) & 0xff; 211 data[3] = value & 0xff; 212 213 fwrite (data, 4, 1, output_file); 214} 215 216 217#define obstack_chunk_alloc xmalloc 218#define obstack_chunk_free free 219 220/* Add a u8 (a single byte) to an obstack. */ 221static void 222append_u8 (struct obstack *mempool, unsigned char value) 223{ 224 unsigned char data[1]; 225 226 data[0] = value; 227 228 obstack_grow (mempool, data, 1); 229} 230 231/* Add a u16 (two bytes) to an obstack. */ 232static void 233append_u16 (struct obstack *mempool, unsigned short value) 234{ 235 unsigned char data[2]; 236 237 data[0] = (value >> 8) & 0xff; 238 data[1] = value & 0xff; 239 240 obstack_grow (mempool, data, 2); 241} 242 243/* Add a u32 (four bytes) to an obstack. */ 244static void 245append_u32 (struct obstack *mempool, unsigned int value) 246{ 247 unsigned char data[4]; 248 249 data[0] = (value >> 24) & 0xff; 250 data[1] = (value >> 16) & 0xff; 251 data[2] = (value >> 8) & 0xff; 252 data[3] = value & 0xff; 253 254 obstack_grow (mempool, data, 4); 255} 256 257/* Add an ISO-8859-1 encoded string to an obstack. */ 258static void 259append_base_string (struct obstack *mempool, const char *string) 260{ 261 size_t length = strlen (string) + 1; 262 append_u32 (mempool, length); 263 obstack_grow (mempool, string, length); 264} 265 266/* Add an UTF-16 encoded string to an obstack. */ 267static void 268append_unicode_string (struct obstack *mempool, const unsigned short *string, 269 size_t length) 270{ 271 append_u32 (mempool, length * 2); 272 for (; length > 0; string++, length--) 273 append_u16 (mempool, *string); 274} 275 276/* Retrieve a 4-byte integer from memory. */ 277static inline unsigned int 278peek_u32 (const unsigned char *p) 279{ 280 return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; 281} 282 283/* Convert an UTF-8 string to ISO-8859-1, without error checking. */ 284static char * 285conv_to_iso_8859_1 (const char *string) 286{ 287 size_t length = strlen (string); 288 const char *str = string; 289 const char *str_limit = string + length; 290 /* Conversion to ISO-8859-1 can only reduce the number of bytes. */ 291 char *result = XNMALLOC (length + 1, char); 292 char *q = result; 293 294 while (str < str_limit) 295 { 296 unsigned int uc; 297 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); 298 /* It has already been verified that the string fits in ISO-8859-1. */ 299 if (!(uc < 0x100)) 300 abort (); 301 /* Store as ISO-8859-1. */ 302 *q++ = (unsigned char) uc; 303 } 304 *q = '\0'; 305 assert (q - result <= length); 306 307 return result; 308} 309 310/* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16 311 codepoints) in *SIZEP. */ 312static unsigned short * 313conv_to_utf16 (const char *string, size_t *sizep) 314{ 315 size_t length = strlen (string); 316 const char *str = string; 317 const char *str_limit = string + length; 318 /* Conversion to UTF-16 can at most double the number of bytes. */ 319 unsigned short *result = XNMALLOC (length, unsigned short); 320 unsigned short *q = result; 321 322 while (str < str_limit) 323 { 324 unsigned int uc; 325 str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); 326 if (uc < 0x10000) 327 /* UCS-2 character. */ 328 *q++ = (unsigned short) uc; 329 else 330 { 331 /* UTF-16 surrogate. */ 332 *q++ = 0xd800 + ((uc - 0x10000) >> 10); 333 *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff); 334 } 335 } 336 assert (q - result <= 2 * length); 337 338 *sizep = q - result; 339 return result; 340} 341 342/* Return the Qt hash code of a string. */ 343static unsigned int 344string_hashcode (const char *str) 345{ 346 unsigned int h; 347 348 h = hash_string (str); 349 if (h == 0) 350 h = 1; 351 return h; 352} 353 354/* Compare two entries of the hashes section. */ 355static int 356cmp_hashes (const void *va, const void *vb) 357{ 358 const unsigned char *a = (const unsigned char *) va; 359 const unsigned char *b = (const unsigned char *) vb; 360 unsigned int a_hashcode = peek_u32 (a); 361 unsigned int b_hashcode = peek_u32 (b); 362 363 if (a_hashcode != b_hashcode) 364 return (a_hashcode >= b_hashcode ? 1 : -1); 365 else 366 { 367 unsigned int a_offset = peek_u32 (a + 4); 368 unsigned int b_offset = peek_u32 (b + 4); 369 370 if (a_offset != b_offset) 371 return (a_offset >= b_offset ? 1 : -1); 372 else 373 return 0; 374 } 375} 376 377 378/* Write a section to the output stream. */ 379static void 380write_section (FILE *output_file, unsigned char tag, void *data, size_t size) 381{ 382 /* A section can be omitted if it is empty. */ 383 if (size > 0) 384 { 385 write_u8 (output_file, tag); 386 write_u32 (output_file, size); 387 fwrite (data, size, 1, output_file); 388 } 389} 390 391 392/* Write an entire .qm file. */ 393static void 394write_qm (FILE *output_file, message_list_ty *mlp) 395{ 396 static unsigned char magic[16] = 397 { 398 0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95, 399 0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD 400 }; 401 struct obstack hashes_pool; 402 struct obstack messages_pool; 403 size_t j; 404 405 obstack_init (&hashes_pool); 406 obstack_init (&messages_pool); 407 408 /* Prepare the hashes section and the messages section. */ 409 for (j = 0; j < mlp->nitems; j++) 410 { 411 message_ty *mp = mlp->item[j]; 412 413 /* No need to emit the header entry, it's not needed at runtime. */ 414 if (!is_header (mp)) 415 { 416 char *msgctxt_as_iso_8859_1 = 417 conv_to_iso_8859_1 (mp->msgctxt != NULL ? mp->msgctxt : ""); 418 char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid); 419 size_t msgstr_len; 420 unsigned short *msgstr_as_utf16 = 421 conv_to_utf16 (mp->msgstr, &msgstr_len); 422 unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1); 423 unsigned int offset = obstack_object_size (&messages_pool); 424 425 /* Add a record to the hashes section. */ 426 append_u32 (&hashes_pool, hashcode); 427 append_u32 (&hashes_pool, offset); 428 429 /* Add a record to the messages section. */ 430 431 append_u8 (&messages_pool, 0x03); 432 append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len); 433 434 append_u8 (&messages_pool, 0x08); 435 append_base_string (&messages_pool, ""); 436 437 append_u8 (&messages_pool, 0x06); 438 append_base_string (&messages_pool, msgid_as_iso_8859_1); 439 440 append_u8 (&messages_pool, 0x07); 441 append_base_string (&messages_pool, msgctxt_as_iso_8859_1); 442 443 append_u8 (&messages_pool, 0x05); 444 append_u32 (&messages_pool, hashcode); 445 446 append_u8 (&messages_pool, 0x01); 447 448 free (msgstr_as_utf16); 449 free (msgid_as_iso_8859_1); 450 free (msgctxt_as_iso_8859_1); 451 } 452 } 453 454 /* Sort the hashes section. */ 455 { 456 size_t nstrings = obstack_object_size (&hashes_pool) / 8; 457 if (nstrings > 0) 458 qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes); 459 } 460 461 /* Write the magic number. */ 462 fwrite (magic, sizeof (magic), 1, output_file); 463 464 /* Write the hashes section. */ 465 write_section (output_file, 0x42, obstack_base (&hashes_pool), 466 obstack_object_size (&hashes_pool)); 467 468 /* Write the messages section. */ 469 write_section (output_file, 0x69, obstack_base (&messages_pool), 470 obstack_object_size (&messages_pool)); 471 472 /* Decide whether to write a contexts section. */ 473 { 474 bool can_write_contexts = true; 475 476 for (j = 0; j < mlp->nitems; j++) 477 { 478 message_ty *mp = mlp->item[j]; 479 480 if (!is_header (mp)) 481 if (mp->msgctxt == NULL || mp->msgctxt[0] == '\0' 482 || strlen (mp->msgctxt) > 255) 483 { 484 can_write_contexts = false; 485 break; 486 } 487 } 488 489 if (can_write_contexts) 490 { 491 hash_table all_contexts; 492 size_t num_contexts; 493 unsigned long table_size; 494 495 /* Collect the contexts, removing duplicates. */ 496 hash_init (&all_contexts, 10); 497 for (j = 0; j < mlp->nitems; j++) 498 { 499 message_ty *mp = mlp->item[j]; 500 501 if (!is_header (mp)) 502 hash_insert_entry (&all_contexts, 503 mp->msgctxt, strlen (mp->msgctxt) + 1, 504 NULL); 505 } 506 507 /* Compute the number of different contexts. */ 508 num_contexts = all_contexts.size; 509 510 /* Compute a suitable hash table size. */ 511 table_size = next_prime (num_contexts * 1.7); 512 if (table_size >= 0x10000) 513 table_size = 65521; 514 515 /* Put the contexts into a hash table of size table_size. */ 516 { 517 struct list_cell { const char *context; struct list_cell *next; }; 518 struct list_cell *list_memory = 519 XNMALLOC (table_size, struct list_cell); 520 struct list_cell *freelist; 521 struct bucket { struct list_cell *head; struct list_cell **tail; }; 522 struct bucket *buckets = XNMALLOC (table_size, struct bucket); 523 size_t i; 524 525 freelist = list_memory; 526 527 for (i = 0; i < table_size; i++) 528 { 529 buckets[i].head = NULL; 530 buckets[i].tail = &buckets[i].head; 531 } 532 533 { 534 void *iter; 535 const void *key; 536 size_t keylen; 537 void *null; 538 539 iter = NULL; 540 while (hash_iterate (&all_contexts, &iter, &key, &keylen, &null) 541 == 0) 542 { 543 const char *context = (const char *)key; 544 i = string_hashcode (context) % table_size; 545 freelist->context = context; 546 freelist->next = NULL; 547 *buckets[i].tail = freelist; 548 buckets[i].tail = &freelist->next; 549 freelist++; 550 } 551 } 552 553 /* Determine the total context pool size. */ 554 { 555 size_t pool_size; 556 557 pool_size = 2; 558 for (i = 0; i < table_size; i++) 559 if (buckets[i].head != NULL) 560 { 561 const struct list_cell *p; 562 563 for (p = buckets[i].head; p != NULL; p = p->next) 564 pool_size += 1 + strlen (p->context); 565 pool_size++; 566 if ((pool_size % 2) != 0) 567 pool_size++; 568 } 569 if (pool_size <= 0x20000) 570 { 571 /* Prepare the contexts section. */ 572 struct obstack contexts_pool; 573 size_t pool_offset; 574 575 obstack_init (&contexts_pool); 576 577 append_u16 (&contexts_pool, table_size); 578 pool_offset = 2; 579 for (i = 0; i < table_size; i++) 580 if (buckets[i].head != NULL) 581 { 582 const struct list_cell *p; 583 584 append_u16 (&contexts_pool, pool_offset / 2); 585 for (p = buckets[i].head; p != NULL; p = p->next) 586 pool_offset += 1 + strlen (p->context); 587 pool_offset++; 588 if ((pool_offset % 2) != 0) 589 pool_offset++; 590 } 591 else 592 append_u16 (&contexts_pool, 0); 593 if (!(pool_offset == pool_size)) 594 abort (); 595 596 append_u16 (&contexts_pool, 0); 597 pool_offset = 2; 598 for (i = 0; i < table_size; i++) 599 if (buckets[i].head != NULL) 600 { 601 const struct list_cell *p; 602 603 for (p = buckets[i].head; p != NULL; p = p->next) 604 { 605 append_u8 (&contexts_pool, strlen (p->context)); 606 obstack_grow (&contexts_pool, 607 p->context, strlen (p->context)); 608 pool_offset += 1 + strlen (p->context); 609 } 610 append_u8 (&contexts_pool, 0); 611 pool_offset++; 612 if ((pool_offset % 2) != 0) 613 { 614 append_u8 (&contexts_pool, 0); 615 pool_offset++; 616 } 617 } 618 if (!(pool_offset == pool_size)) 619 abort (); 620 621 if (!(obstack_object_size (&contexts_pool) 622 == 2 + 2 * table_size + pool_size)) 623 abort (); 624 625 /* Write the contexts section. */ 626 write_section (output_file, 0x2f, obstack_base (&contexts_pool), 627 obstack_object_size (&contexts_pool)); 628 629 obstack_free (&contexts_pool, NULL); 630 } 631 } 632 633 free (buckets); 634 free (list_memory); 635 } 636 637 hash_destroy (&all_contexts); 638 } 639 } 640 641 obstack_free (&messages_pool, NULL); 642 obstack_free (&hashes_pool, NULL); 643} 644 645 646int 647msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding, 648 const char *domain_name, const char *file_name) 649{ 650 FILE *output_file; 651 652 /* If no entry for this domain don't even create the file. */ 653 if (mlp->nitems != 0) 654 { 655 /* Determine whether mlp has plural entries. */ 656 { 657 bool has_plural; 658 size_t j; 659 660 has_plural = false; 661 for (j = 0; j < mlp->nitems; j++) 662 if (mlp->item[j]->msgid_plural != NULL) 663 has_plural = true; 664 if (has_plural) 665 { 666 multiline_error (xstrdup (""), 667 xstrdup (_("\ 668message catalog has plural form translations\n\ 669but the Qt message catalog format doesn't support plural handling\n"))); 670 return 1; 671 } 672 } 673 674 /* Convert the messages to Unicode. */ 675 iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL); 676 677 /* Determine whether mlp has non-ISO-8859-1 msgctxt entries. */ 678 { 679 size_t j; 680 681 for (j = 0; j < mlp->nitems; j++) 682 { 683 const char *string = mlp->item[j]->msgctxt; 684 685 if (string != NULL) 686 { 687 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if 688 all its bytes are < 0xc4. */ 689 for (; *string; string++) 690 if ((unsigned char) *string >= 0xc4) 691 { 692 multiline_error (xstrdup (""), 693 xstrdup (_("\ 694message catalog has msgctxt strings containing characters outside ISO-8859-1\n\ 695but the Qt message catalog format supports Unicode only in the translated\n\ 696strings, not in the context strings\n"))); 697 return 1; 698 } 699 } 700 } 701 } 702 703 /* Determine whether mlp has non-ISO-8859-1 msgid entries. */ 704 { 705 size_t j; 706 707 for (j = 0; j < mlp->nitems; j++) 708 { 709 const char *string = mlp->item[j]->msgid; 710 711 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all 712 its bytes are < 0xc4. */ 713 for (; *string; string++) 714 if ((unsigned char) *string >= 0xc4) 715 { 716 multiline_error (xstrdup (""), 717 xstrdup (_("\ 718message catalog has msgid strings containing characters outside ISO-8859-1\n\ 719but the Qt message catalog format supports Unicode only in the translated\n\ 720strings, not in the untranslated strings\n"))); 721 return 1; 722 } 723 } 724 } 725 726 if (strcmp (domain_name, "-") == 0) 727 { 728 output_file = stdout; 729 SET_BINARY (fileno (output_file)); 730 } 731 else 732 { 733 output_file = fopen (file_name, "wb"); 734 if (output_file == NULL) 735 { 736 error (0, errno, _("error while opening \"%s\" for writing"), 737 file_name); 738 return 1; 739 } 740 } 741 742 if (output_file != NULL) 743 { 744 write_qm (output_file, mlp); 745 746 /* Make sure nothing went wrong. */ 747 if (fwriteerror (output_file)) 748 error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"), 749 file_name); 750 } 751 } 752 753 return 0; 754} 755