1/* xgettext Java backend. 2 Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#ifdef HAVE_CONFIG_H 19# include "config.h" 20#endif 21 22/* Specification. */ 23#include "x-java.h" 24 25#include <errno.h> 26#include <stdbool.h> 27#include <stdio.h> 28#include <stdlib.h> 29#include <string.h> 30 31#include "message.h" 32#include "xgettext.h" 33#include "x-java.h" 34#include "error.h" 35#include "xalloc.h" 36#include "hash.h" 37#include "po-charset.h" 38#include "unistr.h" 39#include "gettext.h" 40 41#define _(s) gettext(s) 42 43#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 44 45 46/* The Java syntax is defined in the 47 Java Language Specification, Second Edition, 48 (available from http://java.sun.com/), 49 chapter 3 "Lexical Structure". */ 50 51 52/* ====================== Keyword set customization. ====================== */ 53 54/* If true extract all strings. 
*/ 55static bool extract_all = false; 56 57static hash_table keywords; 58static bool default_keywords = true; 59 60 61void 62x_java_extract_all () 63{ 64 extract_all = true; 65} 66 67 68void 69x_java_keyword (const char *name) 70{ 71 if (name == NULL) 72 default_keywords = false; 73 else 74 { 75 const char *end; 76 struct callshape shape; 77 const char *colon; 78 79 if (keywords.table == NULL) 80 hash_init (&keywords, 100); 81 82 split_keywordspec (name, &end, &shape); 83 84 /* The characters between name and end should form a valid Java 85 identifier sequence with dots. 86 A colon means an invalid parse in split_keywordspec(). */ 87 colon = strchr (name, ':'); 88 if (colon == NULL || colon >= end) 89 insert_keyword_callshape (&keywords, name, end - name, &shape); 90 } 91} 92 93/* Finish initializing the keywords hash table. 94 Called after argument processing, before each file is processed. */ 95static void 96init_keywords () 97{ 98 if (default_keywords) 99 { 100 /* When adding new keywords here, also update the documentation in 101 xgettext.texi! 
*/ 102 x_java_keyword ("GettextResource.gettext:2"); /* static method */ 103 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */ 104 x_java_keyword ("GettextResource.pgettext:2c,3"); /* static method */ 105 x_java_keyword ("GettextResource.npgettext:2c,3,4"); /* static method */ 106 x_java_keyword ("gettext"); 107 x_java_keyword ("ngettext:1,2"); 108 x_java_keyword ("pgettext:1c,2"); 109 x_java_keyword ("npgettext:1c,2,3"); 110 x_java_keyword ("getString"); /* ResourceBundle.getString */ 111 default_keywords = false; 112 } 113} 114 115void 116init_flag_table_java () 117{ 118 xgettext_record_flag ("GettextResource.gettext:2:pass-java-format"); 119 xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format"); 120 xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format"); 121 xgettext_record_flag ("GettextResource.pgettext:3:pass-java-format"); 122 xgettext_record_flag ("GettextResource.npgettext:3:pass-java-format"); 123 xgettext_record_flag ("GettextResource.npgettext:4:pass-java-format"); 124 xgettext_record_flag ("gettext:1:pass-java-format"); 125 xgettext_record_flag ("ngettext:1:pass-java-format"); 126 xgettext_record_flag ("ngettext:2:pass-java-format"); 127 xgettext_record_flag ("pgettext:2:pass-java-format"); 128 xgettext_record_flag ("npgettext:2:pass-java-format"); 129 xgettext_record_flag ("npgettext:3:pass-java-format"); 130 xgettext_record_flag ("getString:1:pass-java-format"); 131 xgettext_record_flag ("MessageFormat:1:java-format"); 132 xgettext_record_flag ("MessageFormat.format:1:java-format"); 133} 134 135 136/* ======================== Reading of characters. ======================== */ 137 138/* Real filename, used in error messages about the input file. */ 139static const char *real_file_name; 140 141/* Logical filename and line number, used to label the extracted messages. */ 142static char *logical_file_name; 143static int line_number; 144 145/* The input file stream. 
*/ 146static FILE *fp; 147 148 149/* Fetch the next single-byte character from the input file. 150 Pushback can consist of an unlimited number of 'u' followed by up to 4 151 other characters. */ 152 153/* Special coding of multiple 'u's in the pushback buffer. */ 154#define MULTIPLE_U(count) (0x1000 + (count)) 155 156static int phase1_pushback[5]; 157static unsigned int phase1_pushback_length; 158 159static int 160phase1_getc () 161{ 162 int c; 163 164 if (phase1_pushback_length) 165 { 166 c = phase1_pushback[--phase1_pushback_length]; 167 if (c >= MULTIPLE_U (0)) 168 { 169 if (c > MULTIPLE_U (1)) 170 phase1_pushback[phase1_pushback_length++] = c - 1; 171 return 'u'; 172 } 173 else 174 return c; 175 } 176 177 c = getc (fp); 178 179 if (c == EOF) 180 { 181 if (ferror (fp)) 182 error (EXIT_FAILURE, errno, _("\ 183error while reading \"%s\""), real_file_name); 184 } 185 186 return c; 187} 188 189/* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */ 190static void 191phase1_ungetc (int c) 192{ 193 if (c != EOF) 194 { 195 if (c == 'u') 196 { 197 if (phase1_pushback_length > 0 198 && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0)) 199 phase1_pushback[phase1_pushback_length - 1]++; 200 else 201 { 202 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 203 abort (); 204 phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1); 205 } 206 } 207 else 208 { 209 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 210 abort (); 211 phase1_pushback[phase1_pushback_length++] = c; 212 } 213 } 214} 215 216 217/* Fetch the next single-byte character or Unicode character from the file. 218 (Here, as in the Java Language Specification, when we say "Unicode 219 character", we actually mean "UTF-16 encoding unit".) */ 220 221/* Return value of phase 2, 3, 4 when EOF is reached. */ 222#define P2_EOF 0xffff 223 224/* Convert an UTF-16 code point to a return value that can be distinguished 225 from a single-byte return value. 
*/ 226#define UNICODE(code) (0x10000 + (code)) 227 228/* Test a return value of phase 2, 3, 4 whether it designates an UTF-16 code 229 point. */ 230#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000) 231 232/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */ 233#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000) 234 235/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit, 236 so that it can be more easily compared against an ASCII character. 237 (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')). */ 238#define RED(p2_result) ((p2_result) & 0xffff) 239 240static int phase2_pushback[1]; 241static int phase2_pushback_length; 242 243static int 244phase2_getc () 245{ 246 int c; 247 248 if (phase2_pushback_length) 249 return phase2_pushback[--phase2_pushback_length]; 250 251 c = phase1_getc (); 252 if (c == EOF) 253 return P2_EOF; 254 if (c == '\\') 255 { 256 c = phase1_getc (); 257 if (c == 'u') 258 { 259 unsigned int u_count = 1; 260 unsigned char buf[4]; 261 unsigned int n; 262 int i; 263 264 for (;;) 265 { 266 c = phase1_getc (); 267 if (c != 'u') 268 break; 269 u_count++; 270 } 271 phase1_ungetc (c); 272 273 n = 0; 274 for (i = 0; i < 4; i++) 275 { 276 c = phase1_getc (); 277 278 if (c >= '0' && c <= '9') 279 n = (n << 4) + (c - '0'); 280 else if (c >= 'A' && c <= 'F') 281 n = (n << 4) + (c - 'A' + 10); 282 else if (c >= 'a' && c <= 'f') 283 n = (n << 4) + (c - 'a' + 10); 284 else 285 { 286 phase1_ungetc (c); 287 while (--i >= 0) 288 phase1_ungetc (buf[i]); 289 for (; u_count > 0; u_count--) 290 phase1_ungetc ('u'); 291 return '\\'; 292 } 293 294 buf[i] = c; 295 } 296 return UNICODE (n); 297 } 298 phase1_ungetc (c); 299 return '\\'; 300 } 301 return c; 302} 303 304/* Supports only one pushback character. 
*/ 305static void 306phase2_ungetc (int c) 307{ 308 if (c != P2_EOF) 309 { 310 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 311 abort (); 312 phase2_pushback[phase2_pushback_length++] = c; 313 } 314} 315 316 317/* Fetch the next single-byte character or Unicode character from the file. 318 With line number handling. 319 Convert line terminators to '\n' or UNICODE ('\n'). */ 320 321static int phase3_pushback[2]; 322static int phase3_pushback_length; 323 324static int 325phase3_getc () 326{ 327 int c; 328 329 if (phase3_pushback_length) 330 { 331 c = phase3_pushback[--phase3_pushback_length]; 332 if (c == '\n') 333 ++line_number; 334 return c; 335 } 336 337 c = phase2_getc (); 338 339 /* Handle line terminators. */ 340 if (RED (c) == '\r') 341 { 342 int c1 = phase2_getc (); 343 344 if (RED (c1) != '\n') 345 phase2_ungetc (c1); 346 347 /* Seen line terminator CR or CR/LF. */ 348 if (c == '\r' || c1 == '\n') 349 { 350 ++line_number; 351 return '\n'; 352 } 353 else 354 return UNICODE ('\n'); 355 } 356 else if (RED (c) == '\n') 357 { 358 /* Seen line terminator LF. */ 359 if (c == '\n') 360 { 361 ++line_number; 362 return '\n'; 363 } 364 else 365 return UNICODE ('\n'); 366 } 367 368 return c; 369} 370 371/* Supports 2 characters of pushback. */ 372static void 373phase3_ungetc (int c) 374{ 375 if (c != P2_EOF) 376 { 377 if (c == '\n') 378 --line_number; 379 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 380 abort (); 381 phase3_pushback[phase3_pushback_length++] = c; 382 } 383} 384 385 386/* ========================= Accumulating strings. ======================== */ 387 388/* A string buffer type that allows appending bytes (in the 389 xgettext_current_source_encoding) or Unicode characters. 390 Returns the entire string in UTF-8 encoding. */ 391 392struct string_buffer 393{ 394 /* The part of the string that has already been converted to UTF-8. 
*/ 395 char *utf8_buffer; 396 size_t utf8_buflen; 397 size_t utf8_allocated; 398 /* The first half of an UTF-16 surrogate character. */ 399 unsigned short utf16_surr; 400 /* The part of the string that is still in the source encoding. */ 401 char *curr_buffer; 402 size_t curr_buflen; 403 size_t curr_allocated; 404}; 405 406/* Initialize a 'struct string_buffer' to empty. */ 407static inline void 408init_string_buffer (struct string_buffer *bp) 409{ 410 bp->utf8_buffer = NULL; 411 bp->utf8_buflen = 0; 412 bp->utf8_allocated = 0; 413 bp->utf16_surr = 0; 414 bp->curr_buffer = NULL; 415 bp->curr_buflen = 0; 416 bp->curr_allocated = 0; 417} 418 419/* Auxiliary function: Append a byte to bp->curr. */ 420static inline void 421string_buffer_append_byte (struct string_buffer *bp, unsigned char c) 422{ 423 if (bp->curr_buflen == bp->curr_allocated) 424 { 425 bp->curr_allocated = 2 * bp->curr_allocated + 10; 426 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); 427 } 428 bp->curr_buffer[bp->curr_buflen++] = c; 429} 430 431/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 432static inline void 433string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count) 434{ 435 if (bp->utf8_buflen + count > bp->utf8_allocated) 436 { 437 size_t new_allocated = 2 * bp->utf8_allocated + 10; 438 if (new_allocated < bp->utf8_buflen + count) 439 new_allocated = bp->utf8_buflen + count; 440 bp->utf8_allocated = new_allocated; 441 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 442 } 443} 444 445/* Auxiliary function: Append a Unicode character to bp->utf8. 446 uc must be < 0x110000. */ 447static inline void 448string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc) 449{ 450 unsigned char utf8buf[6]; 451 int count = u8_uctomb (utf8buf, uc, 6); 452 453 if (count < 0) 454 /* The caller should have ensured that uc is not out-of-range. 
*/ 455 abort (); 456 457 string_buffer_append_unicode_grow (bp, count); 458 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 459 bp->utf8_buflen += count; 460} 461 462/* Auxiliary function: Handle the attempt to append a lone surrogate to 463 bp->utf8. */ 464static void 465string_buffer_append_lone_surrogate (struct string_buffer *bp, unsigned int uc) 466{ 467 /* A half surrogate is invalid, therefore use U+FFFD instead. 468 It appears to be valid Java: The Java Language Specification, 469 3rd ed., says "The Java programming language represents text 470 in sequences of 16-bit code units, using the UTF-16 encoding." 471 but does not impose constraints on the use of \uxxxx escape 472 sequences for surrogates. And the JDK's javac happily groks 473 half surrogates. 474 But a half surrogate is invalid in UTF-8: 475 - RFC 3629 says 476 "The definition of UTF-8 prohibits encoding character 477 numbers between U+D800 and U+DFFF". 478 - Unicode 4.0 chapter 3 479 <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf> 480 section 3.9, p.77, says 481 "Because surrogate code points are not Unicode scalar 482 values, any UTF-8 byte sequence that would otherwise 483 map to code points D800..DFFF is ill-formed." 484 and in table 3-6, p. 78, does not mention D800..DFFF. 485 - The unicode.org FAQ question "How do I convert an unpaired 486 UTF-16 surrogate to UTF-8?" has the answer 487 "By representing such an unpaired surrogate on its own 488 as a 3-byte sequence, the resulting UTF-8 data stream 489 would become ill-formed." 490 So use U+FFFD instead. */ 491 error_with_progname = false; 492 error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"), 493 logical_file_name, line_number, uc); 494 error_with_progname = true; 495 string_buffer_append_unicode (bp, 0xfffd); 496} 497 498/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. 
*/ 499static inline void 500string_buffer_flush_utf16_surr (struct string_buffer *bp) 501{ 502 if (bp->utf16_surr != 0) 503 { 504 string_buffer_append_lone_surrogate (bp, bp->utf16_surr); 505 bp->utf16_surr = 0; 506 } 507} 508 509/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ 510static inline void 511string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno) 512{ 513 if (bp->curr_buflen > 0) 514 { 515 char *curr; 516 size_t count; 517 518 string_buffer_append_byte (bp, '\0'); 519 520 /* Convert from the source encoding to UTF-8. */ 521 curr = from_current_source_encoding (bp->curr_buffer, 522 logical_file_name, lineno); 523 524 /* Append it to bp->utf8_buffer. */ 525 count = strlen (curr); 526 string_buffer_append_unicode_grow (bp, count); 527 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); 528 bp->utf8_buflen += count; 529 530 if (curr != bp->curr_buffer) 531 free (curr); 532 bp->curr_buflen = 0; 533 } 534} 535 536/* Append a character or Unicode character to a 'struct string_buffer'. */ 537static void 538string_buffer_append (struct string_buffer *bp, int c) 539{ 540 if (IS_UNICODE (c)) 541 { 542 /* Append a Unicode character. */ 543 544 /* Switch from multibyte character mode to Unicode character mode. */ 545 string_buffer_flush_curr_buffer (bp, line_number); 546 547 /* Test whether this character and the previous one form a Unicode 548 surrogate character pair. 
*/ 549 if (bp->utf16_surr != 0 550 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) 551 { 552 unsigned short utf16buf[2]; 553 unsigned int uc; 554 555 utf16buf[0] = bp->utf16_surr; 556 utf16buf[1] = UTF16_VALUE (c); 557 if (u16_mbtouc (&uc, utf16buf, 2) != 2) 558 abort (); 559 560 string_buffer_append_unicode (bp, uc); 561 bp->utf16_surr = 0; 562 } 563 else 564 { 565 string_buffer_flush_utf16_surr (bp); 566 567 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) 568 bp->utf16_surr = UTF16_VALUE (c); 569 else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)) 570 string_buffer_append_lone_surrogate (bp, UTF16_VALUE (c)); 571 else 572 string_buffer_append_unicode (bp, UTF16_VALUE (c)); 573 } 574 } 575 else 576 { 577 /* Append a single byte. */ 578 579 /* Switch from Unicode character mode to multibyte character mode. */ 580 string_buffer_flush_utf16_surr (bp); 581 582 /* When a newline is seen, convert the accumulated multibyte sequence. 583 This ensures a correct line number in the error message in case of 584 a conversion error. The "- 1" is to account for the newline. */ 585 if (c == '\n') 586 string_buffer_flush_curr_buffer (bp, line_number - 1); 587 588 string_buffer_append_byte (bp, (unsigned char) c); 589 } 590} 591 592/* Return the string buffer's contents. */ 593static char * 594string_buffer_result (struct string_buffer *bp) 595{ 596 /* Flush all into bp->utf8_buffer. */ 597 string_buffer_flush_utf16_surr (bp); 598 string_buffer_flush_curr_buffer (bp, line_number); 599 /* NUL-terminate it. */ 600 string_buffer_append_unicode_grow (bp, 1); 601 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 602 /* Return it. */ 603 return bp->utf8_buffer; 604} 605 606/* Free the memory pointed to by a 'struct string_buffer'. */ 607static inline void 608free_string_buffer (struct string_buffer *bp) 609{ 610 free (bp->utf8_buffer); 611 free (bp->curr_buffer); 612} 613 614 615/* ======================== Accumulating comments. 
======================== */ 616 617 618/* Accumulating a single comment line. */ 619 620static struct string_buffer comment_buffer; 621 622static inline void 623comment_start () 624{ 625 comment_buffer.utf8_buflen = 0; 626 comment_buffer.utf16_surr = 0; 627 comment_buffer.curr_buflen = 0; 628} 629 630static inline bool 631comment_at_start () 632{ 633 return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0 634 && comment_buffer.curr_buflen == 0); 635} 636 637static inline void 638comment_add (int c) 639{ 640 string_buffer_append (&comment_buffer, c); 641} 642 643static inline void 644comment_line_end (size_t chars_to_remove) 645{ 646 char *buffer = string_buffer_result (&comment_buffer); 647 size_t buflen = strlen (buffer); 648 649 buflen -= chars_to_remove; 650 while (buflen >= 1 651 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 652 --buflen; 653 buffer[buflen] = '\0'; 654 savable_comment_add (buffer); 655} 656 657 658/* These are for tracking whether comments count as immediately before 659 keyword. */ 660static int last_comment_line; 661static int last_non_comment_line; 662 663 664/* Replace each comment that is not inside a character constant or string 665 literal with a space or newline character. */ 666 667static int 668phase4_getc () 669{ 670 int c0; 671 int c; 672 bool last_was_star; 673 674 c0 = phase3_getc (); 675 if (RED (c0) != '/') 676 return c0; 677 c = phase3_getc (); 678 switch (RED (c)) 679 { 680 default: 681 phase3_ungetc (c); 682 return c0; 683 684 case '*': 685 /* C style comment. */ 686 comment_start (); 687 last_was_star = false; 688 for (;;) 689 { 690 c = phase3_getc (); 691 if (c == P2_EOF) 692 break; 693 /* We skip all leading white space, but not EOLs. 
*/ 694 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t'))) 695 comment_add (c); 696 switch (RED (c)) 697 { 698 case '\n': 699 comment_line_end (1); 700 comment_start (); 701 last_was_star = false; 702 continue; 703 704 case '*': 705 last_was_star = true; 706 continue; 707 708 case '/': 709 if (last_was_star) 710 { 711 comment_line_end (2); 712 break; 713 } 714 /* FALLTHROUGH */ 715 716 default: 717 last_was_star = false; 718 continue; 719 } 720 break; 721 } 722 last_comment_line = line_number; 723 return ' '; 724 725 case '/': 726 /* C++ style comment. */ 727 last_comment_line = line_number; 728 comment_start (); 729 for (;;) 730 { 731 c = phase3_getc (); 732 if (RED (c) == '\n' || c == P2_EOF) 733 break; 734 /* We skip all leading white space, but not EOLs. */ 735 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t'))) 736 comment_add (c); 737 } 738 phase3_ungetc (c); /* push back the newline, to decrement line_number */ 739 comment_line_end (0); 740 phase3_getc (); /* read the newline again */ 741 return '\n'; 742 } 743} 744 745/* Supports only one pushback character. */ 746static void 747phase4_ungetc (int c) 748{ 749 phase3_ungetc (c); 750} 751 752 753/* ========================== Reading of tokens. ========================== */ 754 755enum token_type_ty 756{ 757 token_type_eof, 758 token_type_lparen, /* ( */ 759 token_type_rparen, /* ) */ 760 token_type_lbrace, /* { */ 761 token_type_rbrace, /* } */ 762 token_type_comma, /* , */ 763 token_type_dot, /* . */ 764 token_type_string_literal, /* "abc" */ 765 token_type_number, /* 1.23 */ 766 token_type_symbol, /* identifier, keyword, null */ 767 token_type_plus, /* + */ 768 token_type_other /* character literal, misc. 
operator */ 769}; 770typedef enum token_type_ty token_type_ty; 771 772typedef struct token_ty token_ty; 773struct token_ty 774{ 775 token_type_ty type; 776 char *string; /* for token_type_string_literal, token_type_symbol */ 777 refcounted_string_list_ty *comment; /* for token_type_string_literal */ 778 int line_number; 779}; 780 781 782/* Free the memory pointed to by a 'struct token_ty'. */ 783static inline void 784free_token (token_ty *tp) 785{ 786 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 787 free (tp->string); 788 if (tp->type == token_type_string_literal) 789 drop_reference (tp->comment); 790} 791 792 793/* Read an escape sequence inside a string literal or character literal. */ 794static inline int 795do_getc_escaped () 796{ 797 int c; 798 799 /* Use phase 3, because phase 4 elides comments. */ 800 c = phase3_getc (); 801 if (c == P2_EOF) 802 return UNICODE ('\\'); 803 switch (RED (c)) 804 { 805 case 'b': 806 return UNICODE (0x08); 807 case 't': 808 return UNICODE (0x09); 809 case 'n': 810 return UNICODE (0x0a); 811 case 'f': 812 return UNICODE (0x0c); 813 case 'r': 814 return UNICODE (0x0d); 815 case '"': 816 return UNICODE ('"'); 817 case '\'': 818 return UNICODE ('\''); 819 case '\\': 820 return UNICODE ('\\'); 821 case '0': case '1': case '2': case '3': 822 case '4': case '5': case '6': case '7': 823 { 824 int n = RED (c) - '0'; 825 bool maybe3digits = (n < 4); 826 827 c = phase3_getc (); 828 if (RED (c) >= '0' && RED (c) <= '7') 829 { 830 n = (n << 3) + (RED (c) - '0'); 831 if (maybe3digits) 832 { 833 c = phase3_getc (); 834 if (RED (c) >= '0' && RED (c) <= '7') 835 n = (n << 3) + (RED (c) - '0'); 836 else 837 phase3_ungetc (c); 838 } 839 } 840 else 841 phase3_ungetc (c); 842 843 return UNICODE (n); 844 } 845 default: 846 /* Invalid escape sequence. */ 847 phase3_ungetc (c); 848 return UNICODE ('\\'); 849 } 850} 851 852/* Read a string literal or character literal. 
*/
/* Appends the characters up to, but not including, the closing DELIMITER to
   LITERAL.  The opening delimiter has already been consumed by the caller.
   Stops at EOF as well, and at a newline, which is pushed back after a
   warning about the unterminated constant.  */
static void
accumulate_escaped (struct string_buffer *literal, int delimiter)
{
  int c;

  for (;;)
    {
      /* Use phase 3, because phase 4 elides comments.  */
      c = phase3_getc ();
      if (c == P2_EOF || RED (c) == delimiter)
        break;
      if (RED (c) == '\n')
        {
          phase3_ungetc (c);
          error_with_progname = false;
          if (delimiter == '\'')
            error (0, 0, _("%s:%d: warning: unterminated character constant"),
                   logical_file_name, line_number);
          else
            error (0, 0, _("%s:%d: warning: unterminated string constant"),
                   logical_file_name, line_number);
          error_with_progname = true;
          break;
        }
      /* Replace a backslash by the value of the escape sequence it
         introduces.  */
      if (RED (c) == '\\')
        c = do_getc_escaped ();
      string_buffer_append (literal, c);
    }
}


/* Combine characters into tokens.  Discard whitespace.  */

static token_ty phase5_pushback[3];
static int phase5_pushback_length;

/* Phase 5: Store the next token in *TP.
   A pushed-back token is returned first, if there is one.  Otherwise
   characters are read through phase 4 until a token is complete.
   tp->string is freshly allocated for symbols and string literals and NULL
   for all other freshly read tokens; tp->comment is set only for string
   literals.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      /* Record the line on which the token starts, before reading it.  */
      tp->line_number = line_number;
      c = phase4_getc ();

      if (c == P2_EOF)
        {
          tp->type = token_type_eof;
          return;
        }

      /* The 'continue' statements in this switch apply to the enclosing
         'for' loop: whitespace restarts the token search.  */
      switch (RED (c))
        {
        case '\n':
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* FALLTHROUGH */
        case ' ':
        case '\t':
        case '\f':
          /* Ignore whitespace and comments.  */
          continue;
        }

      last_non_comment_line = tp->line_number;

      switch (RED (c))
        {
        case '(':
          tp->type = token_type_lparen;
          return;

        case ')':
          tp->type = token_type_rparen;
          return;

        case '{':
          tp->type = token_type_lbrace;
          return;

        case '}':
          tp->type = token_type_rbrace;
          return;

        case ',':
          tp->type = token_type_comma;
          return;

        case '.':
          /* A dot followed by a digit starts a number; any other dot is a
             token of its own.  */
          c = phase4_getc ();
          if (!(RED (c) >= '0' && RED (c) <= '9'))
            {
              phase4_ungetc (c);
              tp->type = token_type_dot;
              return;
            }
          /* FALLTHROUGH */

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          {
            /* Don't need to verify the complicated syntax of integers and
               floating-point numbers.  We assume a valid Java input.
               The simplified syntax that we recognize as number is: any
               sequence of alphanumeric characters, additionally '+' and '-'
               immediately after 'e' or 'E' except in hexadecimal numbers.  */
            bool hexadecimal = false;

            for (;;)
              {
                c = phase4_getc ();
                if (RED (c) >= '0' && RED (c) <= '9')
                  continue;
                if ((RED (c) >= 'A' && RED (c) <= 'Z')
                    || (RED (c) >= 'a' && RED (c) <= 'z'))
                  {
                    if (RED (c) == 'X' || RED (c) == 'x')
                      hexadecimal = true;
                    if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
                      {
                        /* Consume the sign of the exponent, if present.  */
                        c = phase4_getc ();
                        if (!(RED (c) == '+' || RED (c) == '-'))
                          phase4_ungetc (c);
                      }
                    continue;
                  }
                if (RED (c) == '.')
                  continue;
                break;
              }
            phase4_ungetc (c);
            tp->type = token_type_number;
            return;
          }

        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
        case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
        case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
        case 'V': case 'W': case 'X': case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
        case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
        case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
        case 'v': case 'w': case 'x': case 'y': case 'z':
          /* Although Java allows identifiers containing many Unicode
             characters, we recognize only identifiers consisting of ASCII
             characters.  This avoids conversion hassles w.r.t. the --keyword
             arguments, and shouldn't be a big problem in practice.  */
          {
            /* Scratch buffer, kept across calls and grown on demand.  */
            static char *buffer;
            static int bufmax;
            int bufpos = 0;
            for (;;)
              {
                if (bufpos >= bufmax)
                  {
                    bufmax = 2 * bufmax + 10;
                    buffer = xrealloc (buffer, bufmax);
                  }
                buffer[bufpos++] = RED (c);
                c = phase4_getc ();
                if (!((RED (c) >= 'A' && RED (c) <= 'Z')
                      || (RED (c) >= 'a' && RED (c) <= 'z')
                      || (RED (c) >= '0' && RED (c) <= '9')
                      || RED (c) == '_'))
                  break;
              }
            phase4_ungetc (c);
            /* Make room for the terminating NUL.  */
            if (bufpos >= bufmax)
              {
                bufmax = 2 * bufmax + 10;
                buffer = xrealloc (buffer, bufmax);
              }
            buffer[bufpos] = '\0';
            tp->string = xstrdup (buffer);
            tp->type = token_type_symbol;
            return;
          }

        case '"':
          /* String literal.  */
          {
            struct string_buffer literal;

            init_string_buffer (&literal);
            accumulate_escaped (&literal, '"');
            /* The token gets its own copy, so the buffer can be freed.  */
            tp->string = xstrdup (string_buffer_result (&literal));
            free_string_buffer (&literal);
            tp->comment = add_reference (savable_comment);
            tp->type = token_type_string_literal;
            return;
          }

        case '\'':
          /* Character literal.  */
          {
            struct string_buffer literal;

            /* The contents are read only to skip over them.  */
            init_string_buffer (&literal);
            accumulate_escaped (&literal, '\'');
            free_string_buffer (&literal);
            tp->type = token_type_other;
            return;
          }

        case '+':
          c = phase4_getc ();
          if (RED (c) == '+')
            /* Operator ++ */
            tp->type = token_type_other;
          else if (RED (c) == '=')
            /* Operator += */
            tp->type = token_type_other;
          else
            {
              /* Operator + */
              phase4_ungetc (c);
              tp->type = token_type_plus;
            }
          return;

        default:
          /* Misc. operator.  */
          tp->type = token_type_other;
          return;
        }
    }
}

/* Supports 3 tokens of pushback.  
*/
/* Pushes *TP back, so that the next phase5_get returns it.
   An EOF token is not stored.  */
static void
phase5_unget (token_ty *tp)
{
  if (tp->type != token_type_eof)
    {
      if (phase5_pushback_length == SIZEOF (phase5_pushback))
        abort ();
      phase5_pushback[phase5_pushback_length++] = *tp;
    }
}


/* Compile-time optimization of string literal concatenation.
   Combine "string1" + ... + "stringN" to the concatenated string if
     - the token before this expression is not ')' (because then the first
       string could be part of a cast expression),
     - the token after this expression is not '.' (because then the last
       string could be part of a method call expression).  */

static token_ty phase6_pushback[2];
static int phase6_pushback_length;

/* Type of the token most recently read from phase 5 by phase6_get.  */
static token_type_ty phase6_last;

/* Phase 6: Store the next token in *TP, with adjacent string literals
   joined by '+' merged into a single string literal token.
   A token taken from the phase 6 pushback buffer is returned as is and
   does not update phase6_last.  */
static void
phase6_get (token_ty *tp)
{
  if (phase6_pushback_length)
    {
      *tp = phase6_pushback[--phase6_pushback_length];
      return;
    }

  phase5_get (tp);
  if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
    {
      /* The concatenation so far; it takes over tp->string.  */
      char *sum = tp->string;
      size_t sum_len = strlen (sum);

      for (;;)
        {
          token_ty token2;

          phase5_get (&token2);
          if (token2.type == token_type_plus)
            {
              token_ty token3;

              phase5_get (&token3);
              if (token3.type == token_type_string_literal)
                {
                  token_ty token_after;

                  phase5_get (&token_after);
                  if (token_after.type != token_type_dot)
                    {
                      char *addend = token3.string;
                      size_t addend_len = strlen (addend);

                      /* Append the addend, including its terminating NUL.  */
                      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
                      memcpy (sum + sum_len, addend, addend_len + 1);
                      sum_len += addend_len;

                      phase5_unget (&token_after);
                      free_token (&token3);
                      free_token (&token2);
                      continue;
                    }
                  phase5_unget (&token_after);
                }
              phase5_unget (&token3);
            }
          /* The tokens are pushed back in reverse order of reading, so
             that phase5_get returns them in their original order.  */
          phase5_unget (&token2);
          break;
        }
      tp->string = sum;
    }
  phase6_last = tp->type;
}

/* Supports 2 tokens of pushback.  An EOF token is not stored.  */
static void
phase6_unget (token_ty *tp)
{
  if (tp->type != token_type_eof)
    {
      if (phase6_pushback_length == SIZEOF (phase6_pushback))
        abort ();
      phase6_pushback[phase6_pushback_length++] = *tp;
    }
}


/* Stores the next token, as produced by phase 6, in *TP.  */
static void
x_java_lex (token_ty *tp)
{
  phase6_get (tp);
}

/* Supports 2 tokens of pushback.  */
static void
x_java_unlex (token_ty *tp)
{
  phase6_unget (tp);
}


/* ========================= Extracting strings.  ========================== */


/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;


/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */


/* Extract messages until the next balanced closing parenthesis or brace,
   depending on TERMINATOR.
   Extracted messages are added to MLP.
   Return true upon eof, false upon closing parenthesis or brace.  */
static bool
extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  
Defined only in state 1. */ 1235 const struct callshapes *next_shapes = NULL; 1236 /* Context iterator that will be used if the next token is a '('. */ 1237 flag_context_list_iterator_ty next_context_iter = 1238 passthrough_context_list_iterator; 1239 /* Current context. */ 1240 flag_context_ty inner_context = 1241 inherited_context (outer_context, 1242 flag_context_list_iterator_advance (&context_iter)); 1243 1244 /* Start state is 0. */ 1245 state = 0; 1246 1247 for (;;) 1248 { 1249 token_ty token; 1250 1251 x_java_lex (&token); 1252 switch (token.type) 1253 { 1254 case token_type_symbol: 1255 { 1256 /* Combine symbol1 . ... . symbolN to a single strings, so that 1257 we can recognize static function calls like 1258 GettextResource.gettext. The information present for 1259 symbolI.....symbolN has precedence over the information for 1260 symbolJ.....symbolN with J > I. */ 1261 char *sum = token.string; 1262 size_t sum_len = strlen (sum); 1263 const char *dottedname; 1264 flag_context_list_ty *context_list; 1265 1266 for (;;) 1267 { 1268 token_ty token2; 1269 1270 x_java_lex (&token2); 1271 if (token2.type == token_type_dot) 1272 { 1273 token_ty token3; 1274 1275 x_java_lex (&token3); 1276 if (token3.type == token_type_symbol) 1277 { 1278 char *addend = token3.string; 1279 size_t addend_len = strlen (addend); 1280 1281 sum = 1282 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1); 1283 sum[sum_len] = '.'; 1284 memcpy (sum + sum_len + 1, addend, addend_len + 1); 1285 sum_len += 1 + addend_len; 1286 1287 free_token (&token3); 1288 free_token (&token2); 1289 continue; 1290 } 1291 x_java_unlex (&token3); 1292 } 1293 x_java_unlex (&token2); 1294 break; 1295 } 1296 1297 for (dottedname = sum;;) 1298 { 1299 void *keyword_value; 1300 1301 if (hash_find_entry (&keywords, dottedname, strlen (dottedname), 1302 &keyword_value) 1303 == 0) 1304 { 1305 next_shapes = (const struct callshapes *) keyword_value; 1306 state = 1; 1307 break; 1308 } 1309 1310 dottedname = strchr 
(dottedname, '.'); 1311 if (dottedname == NULL) 1312 { 1313 state = 0; 1314 break; 1315 } 1316 dottedname++; 1317 } 1318 1319 for (dottedname = sum;;) 1320 { 1321 context_list = 1322 flag_context_list_table_lookup ( 1323 flag_context_list_table, 1324 dottedname, strlen (dottedname)); 1325 if (context_list != NULL) 1326 break; 1327 1328 dottedname = strchr (dottedname, '.'); 1329 if (dottedname == NULL) 1330 break; 1331 dottedname++; 1332 } 1333 next_context_iter = flag_context_list_iterator (context_list); 1334 1335 free (sum); 1336 continue; 1337 } 1338 1339 case token_type_lparen: 1340 if (extract_parenthesized (mlp, token_type_rparen, 1341 inner_context, next_context_iter, 1342 arglist_parser_alloc (mlp, 1343 state ? next_shapes : NULL))) 1344 { 1345 xgettext_current_source_encoding = po_charset_utf8; 1346 arglist_parser_done (argparser, arg); 1347 xgettext_current_source_encoding = xgettext_global_source_encoding; 1348 return true; 1349 } 1350 next_context_iter = null_context_list_iterator; 1351 state = 0; 1352 continue; 1353 1354 case token_type_rparen: 1355 if (terminator == token_type_rparen) 1356 { 1357 xgettext_current_source_encoding = po_charset_utf8; 1358 arglist_parser_done (argparser, arg); 1359 xgettext_current_source_encoding = xgettext_global_source_encoding; 1360 return false; 1361 } 1362 if (terminator == token_type_rbrace) 1363 { 1364 error_with_progname = false; 1365 error (0, 0, 1366 _("%s:%d: warning: ')' found where '}' was expected"), 1367 logical_file_name, token.line_number); 1368 error_with_progname = true; 1369 } 1370 next_context_iter = null_context_list_iterator; 1371 state = 0; 1372 continue; 1373 1374 case token_type_lbrace: 1375 if (extract_parenthesized (mlp, token_type_rbrace, 1376 null_context, null_context_list_iterator, 1377 arglist_parser_alloc (mlp, NULL))) 1378 { 1379 xgettext_current_source_encoding = po_charset_utf8; 1380 arglist_parser_done (argparser, arg); 1381 xgettext_current_source_encoding = 
xgettext_global_source_encoding; 1382 return true; 1383 } 1384 next_context_iter = null_context_list_iterator; 1385 state = 0; 1386 continue; 1387 1388 case token_type_rbrace: 1389 if (terminator == token_type_rbrace) 1390 { 1391 xgettext_current_source_encoding = po_charset_utf8; 1392 arglist_parser_done (argparser, arg); 1393 xgettext_current_source_encoding = xgettext_global_source_encoding; 1394 return false; 1395 } 1396 if (terminator == token_type_rparen) 1397 { 1398 error_with_progname = false; 1399 error (0, 0, 1400 _("%s:%d: warning: '}' found where ')' was expected"), 1401 logical_file_name, token.line_number); 1402 error_with_progname = true; 1403 } 1404 next_context_iter = null_context_list_iterator; 1405 state = 0; 1406 continue; 1407 1408 case token_type_comma: 1409 arg++; 1410 inner_context = 1411 inherited_context (outer_context, 1412 flag_context_list_iterator_advance ( 1413 &context_iter)); 1414 next_context_iter = passthrough_context_list_iterator; 1415 state = 0; 1416 continue; 1417 1418 case token_type_string_literal: 1419 { 1420 lex_pos_ty pos; 1421 pos.file_name = logical_file_name; 1422 pos.line_number = token.line_number; 1423 1424 xgettext_current_source_encoding = po_charset_utf8; 1425 if (extract_all) 1426 remember_a_message (mlp, NULL, token.string, inner_context, 1427 &pos, token.comment); 1428 else 1429 arglist_parser_remember (argparser, arg, token.string, 1430 inner_context, 1431 pos.file_name, pos.line_number, 1432 token.comment); 1433 xgettext_current_source_encoding = xgettext_global_source_encoding; 1434 } 1435 drop_reference (token.comment); 1436 next_context_iter = null_context_list_iterator; 1437 state = 0; 1438 continue; 1439 1440 case token_type_eof: 1441 xgettext_current_source_encoding = po_charset_utf8; 1442 arglist_parser_done (argparser, arg); 1443 xgettext_current_source_encoding = xgettext_global_source_encoding; 1444 return true; 1445 1446 case token_type_dot: 1447 case token_type_number: 1448 case token_type_plus: 
1449 case token_type_other: 1450 next_context_iter = null_context_list_iterator; 1451 state = 0; 1452 continue; 1453 1454 default: 1455 abort (); 1456 } 1457 } 1458} 1459 1460 1461void 1462extract_java (FILE *f, 1463 const char *real_filename, const char *logical_filename, 1464 flag_context_list_table_ty *flag_table, 1465 msgdomain_list_ty *mdlp) 1466{ 1467 message_list_ty *mlp = mdlp->item[0]->messages; 1468 1469 fp = f; 1470 real_file_name = real_filename; 1471 logical_file_name = xstrdup (logical_filename); 1472 line_number = 1; 1473 1474 last_comment_line = -1; 1475 last_non_comment_line = -1; 1476 1477 phase6_last = token_type_eof; 1478 1479 flag_context_list_table = flag_table; 1480 1481 init_keywords (); 1482 1483 /* Eat tokens until eof is seen. When extract_parenthesized returns 1484 due to an unbalanced closing parenthesis, just restart it. */ 1485 while (!extract_parenthesized (mlp, token_type_eof, 1486 null_context, null_context_list_iterator, 1487 arglist_parser_alloc (mlp, NULL))) 1488 ; 1489 1490 fp = NULL; 1491 real_file_name = NULL; 1492 logical_file_name = NULL; 1493 line_number = 0; 1494} 1495