1/* xgettext Java backend. 2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23#include <errno.h> 24#include <stdbool.h> 25#include <stdio.h> 26#include <stdlib.h> 27#include <string.h> 28 29#include "message.h" 30#include "xgettext.h" 31#include "x-java.h" 32#include "error.h" 33#include "xalloc.h" 34#include "exit.h" 35#include "hash.h" 36#include "po-charset.h" 37#include "utf16-ucs4.h" 38#include "ucs4-utf8.h" 39#include "gettext.h" 40 41#define _(s) gettext(s) 42 43#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 44 45 46/* The Java syntax is defined in the 47 Java Language Specification, Second Edition, 48 (available from http://java.sun.com/), 49 chapter 3 "Lexical Structure". */ 50 51 52/* ====================== Keyword set customization. ====================== */ 53 54/* If true extract all strings. */ 55static bool extract_all = false; 56 57static hash_table keywords; 58static bool default_keywords = true; 59 60 61void 62x_java_extract_all () 63{ 64 extract_all = true; 65} 66 67 68void 69x_java_keyword (const char *name) 70{ 71 if (name == NULL) 72 default_keywords = false; 73 else 74 { 75 const char *end; 76 struct callshape shape; 77 const char *colon; 78 79 if (keywords.table == NULL) 80 hash_init (&keywords, 100); 81 82 split_keywordspec (name, &end, &shape); 83 84 /* The characters between name and end should form a valid Java 85 identifier sequence with dots. 86 A colon means an invalid parse in split_keywordspec(). */ 87 colon = strchr (name, ':'); 88 if (colon == NULL || colon >= end) 89 insert_keyword_callshape (&keywords, name, end - name, &shape); 90 } 91} 92 93/* Finish initializing the keywords hash table. 94 Called after argument processing, before each file is processed. */ 95static void 96init_keywords () 97{ 98 if (default_keywords) 99 { 100 /* When adding new keywords here, also update the documentation in 101 xgettext.texi! */ 102 x_java_keyword ("GettextResource.gettext:2"); /* static method */ 103 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */ 104 x_java_keyword ("gettext"); 105 x_java_keyword ("ngettext:1,2"); 106 x_java_keyword ("getString"); /* ResourceBundle.getString */ 107 default_keywords = false; 108 } 109} 110 111void 112init_flag_table_java () 113{ 114 xgettext_record_flag ("GettextResource.gettext:2:pass-java-format"); 115 xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format"); 116 xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format"); 117 xgettext_record_flag ("gettext:1:pass-java-format"); 118 xgettext_record_flag ("ngettext:1:pass-java-format"); 119 xgettext_record_flag ("ngettext:2:pass-java-format"); 120 xgettext_record_flag ("getString:1:pass-java-format"); 121 xgettext_record_flag ("MessageFormat:1:java-format"); 122 xgettext_record_flag ("MessageFormat.format:1:java-format"); 123} 124 125 126/* ======================== Reading of characters. ======================== */ 127 128/* Real filename, used in error messages about the input file. */ 129static const char *real_file_name; 130 131/* Logical filename and line number, used to label the extracted messages. */ 132static char *logical_file_name; 133static int line_number; 134 135/* The input file stream. */ 136static FILE *fp; 137 138 139/* Fetch the next single-byte character from the input file. 140 Pushback can consist of an unlimited number of 'u' followed by up to 4 141 other characters. */ 142 143/* Special coding of multiple 'u's in the pushback buffer. */ 144#define MULTIPLE_U(count) (0x1000 + (count)) 145 146static int phase1_pushback[5]; 147static unsigned int phase1_pushback_length; 148 149static int 150phase1_getc () 151{ 152 int c; 153 154 if (phase1_pushback_length) 155 { 156 c = phase1_pushback[--phase1_pushback_length]; 157 if (c >= MULTIPLE_U (0)) 158 { 159 if (c > MULTIPLE_U (1)) 160 phase1_pushback[phase1_pushback_length++] = c - 1; 161 return 'u'; 162 } 163 else 164 return c; 165 } 166 167 c = getc (fp); 168 169 if (c == EOF) 170 { 171 if (ferror (fp)) 172 error (EXIT_FAILURE, errno, _("\ 173error while reading \"%s\""), real_file_name); 174 } 175 176 return c; 177} 178 179/* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */ 180static void 181phase1_ungetc (int c) 182{ 183 if (c != EOF) 184 { 185 if (c == 'u') 186 { 187 if (phase1_pushback_length > 0 188 && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0)) 189 phase1_pushback[phase1_pushback_length - 1]++; 190 else 191 { 192 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 193 abort (); 194 phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1); 195 } 196 } 197 else 198 { 199 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 200 abort (); 201 phase1_pushback[phase1_pushback_length++] = c; 202 } 203 } 204} 205 206 207/* Fetch the next single-byte character or Unicode character from the file. 208 (Here, as in the Java Language Specification, when we say "Unicode 209 character", we actually mean "UTF-16 encoding unit".) */ 210 211/* Return value of phase 2, 3, 4 when EOF is reached. */ 212#define P2_EOF 0xffff 213 214/* Convert an UTF-16 code point to a return value that can be distinguished 215 from a single-byte return value. */ 216#define UNICODE(code) (0x10000 + (code)) 217 218/* Test a return value of phase 2, 3, 4 whether it designates an UTF-16 code 219 point. */ 220#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000) 221 222/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */ 223#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000) 224 225/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit, 226 so that it can be more easily compared against an ASCII character. 227 (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')). */ 228#define RED(p2_result) ((p2_result) & 0xffff) 229 230static int phase2_pushback[1]; 231static int phase2_pushback_length; 232 233static int 234phase2_getc () 235{ 236 int c; 237 238 if (phase2_pushback_length) 239 return phase2_pushback[--phase2_pushback_length]; 240 241 c = phase1_getc (); 242 if (c == EOF) 243 return P2_EOF; 244 if (c == '\\') 245 { 246 c = phase1_getc (); 247 if (c == 'u') 248 { 249 unsigned int u_count = 1; 250 unsigned char buf[4]; 251 unsigned int n; 252 int i; 253 254 for (;;) 255 { 256 c = phase1_getc (); 257 if (c != 'u') 258 break; 259 u_count++; 260 } 261 phase1_ungetc (c); 262 263 n = 0; 264 for (i = 0; i < 4; i++) 265 { 266 c = phase1_getc (); 267 268 if (c >= '0' && c <= '9') 269 n = (n << 4) + (c - '0'); 270 else if (c >= 'A' && c <= 'F') 271 n = (n << 4) + (c - 'A' + 10); 272 else if (c >= 'a' && c <= 'f') 273 n = (n << 4) + (c - 'a' + 10); 274 else 275 { 276 phase1_ungetc (c); 277 while (--i >= 0) 278 phase1_ungetc (buf[i]); 279 for (; u_count > 0; u_count--) 280 phase1_ungetc ('u'); 281 return '\\'; 282 } 283 284 buf[i] = c; 285 } 286 return UNICODE (n); 287 } 288 phase1_ungetc (c); 289 return '\\'; 290 } 291 return c; 292} 293 294/* Supports only one pushback character. */ 295static void 296phase2_ungetc (int c) 297{ 298 if (c != P2_EOF) 299 { 300 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 301 abort (); 302 phase2_pushback[phase2_pushback_length++] = c; 303 } 304} 305 306 307/* Fetch the next single-byte character or Unicode character from the file. 308 With line number handling. 309 Convert line terminators to '\n' or UNICODE ('\n'). */ 310 311static int phase3_pushback[2]; 312static int phase3_pushback_length; 313 314static int 315phase3_getc () 316{ 317 int c; 318 319 if (phase3_pushback_length) 320 { 321 c = phase3_pushback[--phase3_pushback_length]; 322 if (c == '\n') 323 ++line_number; 324 return c; 325 } 326 327 c = phase2_getc (); 328 329 /* Handle line terminators. */ 330 if (RED (c) == '\r') 331 { 332 int c1 = phase2_getc (); 333 334 if (RED (c1) != '\n') 335 phase2_ungetc (c1); 336 337 /* Seen line terminator CR or CR/LF. */ 338 if (c == '\r' || c1 == '\n') 339 { 340 ++line_number; 341 return '\n'; 342 } 343 else 344 return UNICODE ('\n'); 345 } 346 else if (RED (c) == '\n') 347 { 348 /* Seen line terminator LF. */ 349 if (c == '\n') 350 { 351 ++line_number; 352 return '\n'; 353 } 354 else 355 return UNICODE ('\n'); 356 } 357 358 return c; 359} 360 361/* Supports 2 characters of pushback. */ 362static void 363phase3_ungetc (int c) 364{ 365 if (c != P2_EOF) 366 { 367 if (c == '\n') 368 --line_number; 369 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 370 abort (); 371 phase3_pushback[phase3_pushback_length++] = c; 372 } 373} 374 375 376/* ========================= Accumulating strings. ======================== */ 377 378/* A string buffer type that allows appending bytes (in the 379 xgettext_current_source_encoding) or Unicode characters. 380 Returns the entire string in UTF-8 encoding. */ 381 382struct string_buffer 383{ 384 /* The part of the string that has already been converted to UTF-8. */ 385 char *utf8_buffer; 386 size_t utf8_buflen; 387 size_t utf8_allocated; 388 /* The first half of an UTF-16 surrogate character. */ 389 unsigned short utf16_surr; 390 /* The part of the string that is still in the source encoding. */ 391 char *curr_buffer; 392 size_t curr_buflen; 393 size_t curr_allocated; 394}; 395 396/* Initialize a 'struct string_buffer' to empty. */ 397static inline void 398init_string_buffer (struct string_buffer *bp) 399{ 400 bp->utf8_buffer = NULL; 401 bp->utf8_buflen = 0; 402 bp->utf8_allocated = 0; 403 bp->utf16_surr = 0; 404 bp->curr_buffer = NULL; 405 bp->curr_buflen = 0; 406 bp->curr_allocated = 0; 407} 408 409/* Auxiliary function: Append a byte to bp->curr. */ 410static inline void 411string_buffer_append_byte (struct string_buffer *bp, unsigned char c) 412{ 413 if (bp->curr_buflen == bp->curr_allocated) 414 { 415 bp->curr_allocated = 2 * bp->curr_allocated + 10; 416 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); 417 } 418 bp->curr_buffer[bp->curr_buflen++] = c; 419} 420 421/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 422static inline void 423string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count) 424{ 425 if (bp->utf8_buflen + count > bp->utf8_allocated) 426 { 427 size_t new_allocated = 2 * bp->utf8_allocated + 10; 428 if (new_allocated < bp->utf8_buflen + count) 429 new_allocated = bp->utf8_buflen + count; 430 bp->utf8_allocated = new_allocated; 431 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 432 } 433} 434 435/* Auxiliary function: Append a Unicode character to bp->utf8. 436 uc must be < 0x110000. */ 437static inline void 438string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc) 439{ 440 unsigned char utf8buf[6]; 441 int count = u8_uctomb (utf8buf, uc, 6); 442 443 if (count < 0) 444 /* The caller should have ensured that uc is not out-of-range. */ 445 abort (); 446 447 string_buffer_append_unicode_grow (bp, count); 448 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 449 bp->utf8_buflen += count; 450} 451 452/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ 453static inline void 454string_buffer_flush_utf16_surr (struct string_buffer *bp) 455{ 456 if (bp->utf16_surr != 0) 457 { 458 /* A half surrogate is invalid, therefore use U+FFFD instead. */ 459 string_buffer_append_unicode (bp, 0xfffd); 460 bp->utf16_surr = 0; 461 } 462} 463 464/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ 465static inline void 466string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno) 467{ 468 if (bp->curr_buflen > 0) 469 { 470 char *curr; 471 size_t count; 472 473 string_buffer_append_byte (bp, '\0'); 474 475 /* Convert from the source encoding to UTF-8. */ 476 curr = from_current_source_encoding (bp->curr_buffer, 477 logical_file_name, lineno); 478 479 /* Append it to bp->utf8_buffer. */ 480 count = strlen (curr); 481 string_buffer_append_unicode_grow (bp, count); 482 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); 483 bp->utf8_buflen += count; 484 485 if (curr != bp->curr_buffer) 486 free (curr); 487 bp->curr_buflen = 0; 488 } 489} 490 491/* Append a character or Unicode character to a 'struct string_buffer'. */ 492static void 493string_buffer_append (struct string_buffer *bp, int c) 494{ 495 if (IS_UNICODE (c)) 496 { 497 /* Append a Unicode character. */ 498 499 /* Switch from multibyte character mode to Unicode character mode. */ 500 string_buffer_flush_curr_buffer (bp, line_number); 501 502 /* Test whether this character and the previous one form a Unicode 503 surrogate character pair. */ 504 if (bp->utf16_surr != 0 505 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) 506 { 507 unsigned short utf16buf[2]; 508 unsigned int uc; 509 510 utf16buf[0] = bp->utf16_surr; 511 utf16buf[1] = UTF16_VALUE (c); 512 if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2) 513 abort (); 514 515 string_buffer_append_unicode (bp, uc); 516 bp->utf16_surr = 0; 517 } 518 else 519 { 520 string_buffer_flush_utf16_surr (bp); 521 522 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) 523 bp->utf16_surr = UTF16_VALUE (c); 524 else 525 string_buffer_append_unicode (bp, UTF16_VALUE (c)); 526 } 527 } 528 else 529 { 530 /* Append a single byte. */ 531 532 /* Switch from Unicode character mode to multibyte character mode. */ 533 string_buffer_flush_utf16_surr (bp); 534 535 /* When a newline is seen, convert the accumulated multibyte sequence. 536 This ensures a correct line number in the error message in case of 537 a conversion error. The "- 1" is to account for the newline. */ 538 if (c == '\n') 539 string_buffer_flush_curr_buffer (bp, line_number - 1); 540 541 string_buffer_append_byte (bp, (unsigned char) c); 542 } 543} 544 545/* Return the string buffer's contents. */ 546static char * 547string_buffer_result (struct string_buffer *bp) 548{ 549 /* Flush all into bp->utf8_buffer. */ 550 string_buffer_flush_utf16_surr (bp); 551 string_buffer_flush_curr_buffer (bp, line_number); 552 /* NUL-terminate it. */ 553 string_buffer_append_unicode_grow (bp, 1); 554 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 555 /* Return it. */ 556 return bp->utf8_buffer; 557} 558 559/* Free the memory pointed to by a 'struct string_buffer'. */ 560static inline void 561free_string_buffer (struct string_buffer *bp) 562{ 563 free (bp->utf8_buffer); 564 free (bp->curr_buffer); 565} 566 567 568/* ======================== Accumulating comments. ======================== */ 569 570 571/* Accumulating a single comment line. */ 572 573static struct string_buffer comment_buffer; 574 575static inline void 576comment_start () 577{ 578 comment_buffer.utf8_buflen = 0; 579 comment_buffer.utf16_surr = 0; 580 comment_buffer.curr_buflen = 0; 581} 582 583static inline bool 584comment_at_start () 585{ 586 return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0 587 && comment_buffer.curr_buflen == 0); 588} 589 590static inline void 591comment_add (int c) 592{ 593 string_buffer_append (&comment_buffer, c); 594} 595 596static inline void 597comment_line_end (size_t chars_to_remove) 598{ 599 char *buffer = string_buffer_result (&comment_buffer); 600 size_t buflen = strlen (buffer); 601 602 buflen -= chars_to_remove; 603 while (buflen >= 1 604 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 605 --buflen; 606 buffer[buflen] = '\0'; 607 savable_comment_add (buffer); 608} 609 610 611/* These are for tracking whether comments count as immediately before 612 keyword. */ 613static int last_comment_line; 614static int last_non_comment_line; 615 616 617/* Replace each comment that is not inside a character constant or string 618 literal with a space or newline character. */ 619 620static int 621phase4_getc () 622{ 623 int c0; 624 int c; 625 bool last_was_star; 626 627 c0 = phase3_getc (); 628 if (RED (c0) != '/') 629 return c0; 630 c = phase3_getc (); 631 switch (RED (c)) 632 { 633 default: 634 phase3_ungetc (c); 635 return c0; 636 637 case '*': 638 /* C style comment. */ 639 comment_start (); 640 last_was_star = false; 641 for (;;) 642 { 643 c = phase3_getc (); 644 if (c == P2_EOF) 645 break; 646 /* We skip all leading white space, but not EOLs. */ 647 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t'))) 648 comment_add (c); 649 switch (RED (c)) 650 { 651 case '\n': 652 comment_line_end (1); 653 comment_start (); 654 last_was_star = false; 655 continue; 656 657 case '*': 658 last_was_star = true; 659 continue; 660 661 case '/': 662 if (last_was_star) 663 { 664 comment_line_end (2); 665 break; 666 } 667 /* FALLTHROUGH */ 668 669 default: 670 last_was_star = false; 671 continue; 672 } 673 break; 674 } 675 last_comment_line = line_number; 676 return ' '; 677 678 case '/': 679 /* C++ style comment. */ 680 last_comment_line = line_number; 681 comment_start (); 682 for (;;) 683 { 684 c = phase3_getc (); 685 if (RED (c) == '\n' || c == P2_EOF) 686 break; 687 /* We skip all leading white space, but not EOLs. */ 688 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t'))) 689 comment_add (c); 690 } 691 phase3_ungetc (c); /* push back the newline, to decrement line_number */ 692 comment_line_end (0); 693 phase3_getc (); /* read the newline again */ 694 return '\n'; 695 } 696} 697 698/* Supports only one pushback character. */ 699static void 700phase4_ungetc (int c) 701{ 702 phase3_ungetc (c); 703} 704 705 706/* ========================== Reading of tokens. ========================== */ 707 708enum token_type_ty 709{ 710 token_type_eof, 711 token_type_lparen, /* ( */ 712 token_type_rparen, /* ) */ 713 token_type_lbrace, /* { */ 714 token_type_rbrace, /* } */ 715 token_type_comma, /* , */ 716 token_type_dot, /* . */ 717 token_type_string_literal, /* "abc" */ 718 token_type_number, /* 1.23 */ 719 token_type_symbol, /* identifier, keyword, null */ 720 token_type_plus, /* + */ 721 token_type_other /* character literal, misc. operator */ 722}; 723typedef enum token_type_ty token_type_ty; 724 725typedef struct token_ty token_ty; 726struct token_ty 727{ 728 token_type_ty type; 729 char *string; /* for token_type_string_literal, token_type_symbol */ 730 refcounted_string_list_ty *comment; /* for token_type_string_literal */ 731 int line_number; 732}; 733 734 735/* Free the memory pointed to by a 'struct token_ty'. */ 736static inline void 737free_token (token_ty *tp) 738{ 739 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 740 free (tp->string); 741 if (tp->type == token_type_string_literal) 742 drop_reference (tp->comment); 743} 744 745 746/* Read an escape sequence inside a string literal or character literal. */ 747static inline int 748do_getc_escaped () 749{ 750 int c; 751 752 /* Use phase 3, because phase 4 elides comments. */ 753 c = phase3_getc (); 754 if (c == P2_EOF) 755 return UNICODE ('\\'); 756 switch (RED (c)) 757 { 758 case 'b': 759 return UNICODE (0x08); 760 case 't': 761 return UNICODE (0x09); 762 case 'n': 763 return UNICODE (0x0a); 764 case 'f': 765 return UNICODE (0x0c); 766 case 'r': 767 return UNICODE (0x0d); 768 case '"': 769 return UNICODE ('"'); 770 case '\'': 771 return UNICODE ('\''); 772 case '\\': 773 return UNICODE ('\\'); 774 case '0': case '1': case '2': case '3': 775 case '4': case '5': case '6': case '7': 776 { 777 int n = RED (c) - '0'; 778 bool maybe3digits = (n < 4); 779 780 c = phase3_getc (); 781 if (RED (c) >= '0' && RED (c) <= '7') 782 { 783 n = (n << 3) + (RED (c) - '0'); 784 if (maybe3digits) 785 { 786 c = phase3_getc (); 787 if (RED (c) >= '0' && RED (c) <= '7') 788 n = (n << 3) + (RED (c) - '0'); 789 else 790 phase3_ungetc (c); 791 } 792 } 793 else 794 phase3_ungetc (c); 795 796 return UNICODE (n); 797 } 798 default: 799 /* Invalid escape sequence. */ 800 phase3_ungetc (c); 801 return UNICODE ('\\'); 802 } 803} 804 805/* Read a string literal or character literal. */ 806static void 807accumulate_escaped (struct string_buffer *literal, int delimiter) 808{ 809 int c; 810 811 for (;;) 812 { 813 /* Use phase 3, because phase 4 elides comments. */ 814 c = phase3_getc (); 815 if (c == P2_EOF || RED (c) == delimiter) 816 break; 817 if (RED (c) == '\n') 818 { 819 phase3_ungetc (c); 820 error_with_progname = false; 821 if (delimiter == '\'') 822 error (0, 0, _("%s:%d: warning: unterminated character constant"), 823 logical_file_name, line_number); 824 else 825 error (0, 0, _("%s:%d: warning: unterminated string constant"), 826 logical_file_name, line_number); 827 error_with_progname = true; 828 break; 829 } 830 if (RED (c) == '\\') 831 c = do_getc_escaped (); 832 string_buffer_append (literal, c); 833 } 834} 835 836 837/* Combine characters into tokens. Discard whitespace. */ 838 839static token_ty phase5_pushback[3]; 840static int phase5_pushback_length; 841 842static void 843phase5_get (token_ty *tp) 844{ 845 int c; 846 847 if (phase5_pushback_length) 848 { 849 *tp = phase5_pushback[--phase5_pushback_length]; 850 return; 851 } 852 tp->string = NULL; 853 854 for (;;) 855 { 856 tp->line_number = line_number; 857 c = phase4_getc (); 858 859 if (c == P2_EOF) 860 { 861 tp->type = token_type_eof; 862 return; 863 } 864 865 switch (RED (c)) 866 { 867 case '\n': 868 if (last_non_comment_line > last_comment_line) 869 savable_comment_reset (); 870 /* FALLTHROUGH */ 871 case ' ': 872 case '\t': 873 case '\f': 874 /* Ignore whitespace and comments. */ 875 continue; 876 } 877 878 last_non_comment_line = tp->line_number; 879 880 switch (RED (c)) 881 { 882 case '(': 883 tp->type = token_type_lparen; 884 return; 885 886 case ')': 887 tp->type = token_type_rparen; 888 return; 889 890 case '{': 891 tp->type = token_type_lbrace; 892 return; 893 894 case '}': 895 tp->type = token_type_rbrace; 896 return; 897 898 case ',': 899 tp->type = token_type_comma; 900 return; 901 902 case '.': 903 c = phase4_getc (); 904 if (!(RED (c) >= '0' && RED (c) <= '9')) 905 { 906 phase4_ungetc (c); 907 tp->type = token_type_dot; 908 return; 909 } 910 /* FALLTHROUGH */ 911 912 case '0': case '1': case '2': case '3': case '4': 913 case '5': case '6': case '7': case '8': case '9': 914 { 915 /* Don't need to verify the complicated syntax of integers and 916 floating-point numbers. We assume a valid Java input. 917 The simplified syntax that we recognize as number is: any 918 sequence of alphanumeric characters, additionally '+' and '-' 919 immediately after 'e' or 'E' except in hexadecimal numbers. */ 920 bool hexadecimal = false; 921 922 for (;;) 923 { 924 c = phase4_getc (); 925 if (RED (c) >= '0' && RED (c) <= '9') 926 continue; 927 if ((RED (c) >= 'A' && RED (c) <= 'Z') 928 || (RED (c) >= 'a' && RED (c) <= 'z')) 929 { 930 if (RED (c) == 'X' || RED (c) == 'x') 931 hexadecimal = true; 932 if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal) 933 { 934 c = phase4_getc (); 935 if (!(RED (c) == '+' || RED (c) == '-')) 936 phase4_ungetc (c); 937 } 938 continue; 939 } 940 if (RED (c) == '.') 941 continue; 942 break; 943 } 944 phase4_ungetc (c); 945 tp->type = token_type_number; 946 return; 947 } 948 949 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 950 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 951 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 952 case 'V': case 'W': case 'X': case 'Y': case 'Z': 953 case '_': 954 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 955 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 956 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 957 case 'v': case 'w': case 'x': case 'y': case 'z': 958 /* Although Java allows identifiers containing many Unicode 959 characters, we recognize only identifiers consisting of ASCII 960 characters. This avoids conversion hassles w.r.t. the --keyword 961 arguments, and shouldn't be a big problem in practice. */ 962 { 963 static char *buffer; 964 static int bufmax; 965 int bufpos = 0; 966 for (;;) 967 { 968 if (bufpos >= bufmax) 969 { 970 bufmax = 2 * bufmax + 10; 971 buffer = xrealloc (buffer, bufmax); 972 } 973 buffer[bufpos++] = RED (c); 974 c = phase4_getc (); 975 if (!((RED (c) >= 'A' && RED (c) <= 'Z') 976 || (RED (c) >= 'a' && RED (c) <= 'z') 977 || (RED (c) >= '0' && RED (c) <= '9') 978 || RED (c) == '_')) 979 break; 980 } 981 phase4_ungetc (c); 982 if (bufpos >= bufmax) 983 { 984 bufmax = 2 * bufmax + 10; 985 buffer = xrealloc (buffer, bufmax); 986 } 987 buffer[bufpos] = '\0'; 988 tp->string = xstrdup (buffer); 989 tp->type = token_type_symbol; 990 return; 991 } 992 993 case '"': 994 /* String literal. */ 995 { 996 struct string_buffer literal; 997 998 init_string_buffer (&literal); 999 accumulate_escaped (&literal, '"'); 1000 tp->string = xstrdup (string_buffer_result (&literal)); 1001 free_string_buffer (&literal); 1002 tp->comment = add_reference (savable_comment); 1003 tp->type = token_type_string_literal; 1004 return; 1005 } 1006 1007 case '\'': 1008 /* Character literal. */ 1009 { 1010 struct string_buffer literal; 1011 1012 init_string_buffer (&literal); 1013 accumulate_escaped (&literal, '\''); 1014 free_string_buffer (&literal); 1015 tp->type = token_type_other; 1016 return; 1017 } 1018 1019 case '+': 1020 c = phase4_getc (); 1021 if (RED (c) == '+') 1022 /* Operator ++ */ 1023 tp->type = token_type_other; 1024 else if (RED (c) == '=') 1025 /* Operator += */ 1026 tp->type = token_type_other; 1027 else 1028 { 1029 /* Operator + */ 1030 phase4_ungetc (c); 1031 tp->type = token_type_plus; 1032 } 1033 return; 1034 1035 default: 1036 /* Misc. operator. */ 1037 tp->type = token_type_other; 1038 return; 1039 } 1040 } 1041} 1042 1043/* Supports 3 tokens of pushback. */ 1044static void 1045phase5_unget (token_ty *tp) 1046{ 1047 if (tp->type != token_type_eof) 1048 { 1049 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1050 abort (); 1051 phase5_pushback[phase5_pushback_length++] = *tp; 1052 } 1053} 1054 1055 1056/* Compile-time optimization of string literal concatenation. 1057 Combine "string1" + ... + "stringN" to the concatenated string if 1058 - the token before this expression is not ')' (because then the first 1059 string could be part of a cast expression), 1060 - the token after this expression is not '.' (because then the last 1061 string could be part of a method call expression). */ 1062 1063static token_ty phase6_pushback[2]; 1064static int phase6_pushback_length; 1065 1066static token_type_ty phase6_last; 1067 1068static void 1069phase6_get (token_ty *tp) 1070{ 1071 if (phase6_pushback_length) 1072 { 1073 *tp = phase6_pushback[--phase6_pushback_length]; 1074 return; 1075 } 1076 1077 phase5_get (tp); 1078 if (tp->type == token_type_string_literal && phase6_last != token_type_rparen) 1079 { 1080 char *sum = tp->string; 1081 size_t sum_len = strlen (sum); 1082 1083 for (;;) 1084 { 1085 token_ty token2; 1086 1087 phase5_get (&token2); 1088 if (token2.type == token_type_plus) 1089 { 1090 token_ty token3; 1091 1092 phase5_get (&token3); 1093 if (token3.type == token_type_string_literal) 1094 { 1095 token_ty token_after; 1096 1097 phase5_get (&token_after); 1098 if (token_after.type != token_type_dot) 1099 { 1100 char *addend = token3.string; 1101 size_t addend_len = strlen (addend); 1102 1103 sum = (char *) xrealloc (sum, sum_len + addend_len + 1); 1104 memcpy (sum + sum_len, addend, addend_len + 1); 1105 sum_len += addend_len; 1106 1107 phase5_unget (&token_after); 1108 free_token (&token3); 1109 free_token (&token2); 1110 continue; 1111 } 1112 phase5_unget (&token_after); 1113 } 1114 phase5_unget (&token3); 1115 } 1116 phase5_unget (&token2); 1117 break; 1118 } 1119 tp->string = sum; 1120 } 1121 phase6_last = tp->type; 1122} 1123 1124/* Supports 2 tokens of pushback. */ 1125static void 1126phase6_unget (token_ty *tp) 1127{ 1128 if (tp->type != token_type_eof) 1129 { 1130 if (phase6_pushback_length == SIZEOF (phase6_pushback)) 1131 abort (); 1132 phase6_pushback[phase6_pushback_length++] = *tp; 1133 } 1134} 1135 1136 1137static void 1138x_java_lex (token_ty *tp) 1139{ 1140 phase6_get (tp); 1141} 1142 1143/* Supports 2 tokens of pushback. */ 1144static void 1145x_java_unlex (token_ty *tp) 1146{ 1147 phase6_unget (tp); 1148} 1149 1150 1151/* ========================= Extracting strings. ========================== */ 1152 1153 1154/* Context lookup table. */ 1155static flag_context_list_table_ty *flag_context_list_table; 1156 1157 1158/* The file is broken into tokens. Scan the token stream, looking for 1159 a keyword, followed by a left paren, followed by a string. When we 1160 see this sequence, we have something to remember. We assume we are 1161 looking at a valid C or C++ program, and leave the complaints about 1162 the grammar to the compiler. 1163 1164 Normal handling: Look for 1165 keyword ( ... msgid ... ) 1166 Plural handling: Look for 1167 keyword ( ... msgid ... msgid_plural ... ) 1168 1169 We use recursion because the arguments before msgid or between msgid 1170 and msgid_plural can contain subexpressions of the same form. */ 1171 1172 1173/* Extract messages until the next balanced closing parenthesis or brace, 1174 depending on TERMINATOR. 1175 Extracted messages are added to MLP. 1176 Return true upon eof, false upon closing parenthesis or brace. */ 1177static bool 1178extract_parenthesized (message_list_ty *mlp, token_type_ty terminator, 1179 flag_context_ty outer_context, 1180 flag_context_list_iterator_ty context_iter, 1181 struct arglist_parser *argparser) 1182{ 1183 /* Current argument number. */ 1184 int arg = 1; 1185 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1186 int state; 1187 /* Parameters of the keyword just seen. Defined only in state 1. */ 1188 const struct callshapes *next_shapes = NULL; 1189 /* Context iterator that will be used if the next token is a '('. */ 1190 flag_context_list_iterator_ty next_context_iter = 1191 passthrough_context_list_iterator; 1192 /* Current context. */ 1193 flag_context_ty inner_context = 1194 inherited_context (outer_context, 1195 flag_context_list_iterator_advance (&context_iter)); 1196 1197 /* Start state is 0. */ 1198 state = 0; 1199 1200 for (;;) 1201 { 1202 token_ty token; 1203 1204 x_java_lex (&token); 1205 switch (token.type) 1206 { 1207 case token_type_symbol: 1208 { 1209 /* Combine symbol1 . ... . symbolN to a single strings, so that 1210 we can recognize static function calls like 1211 GettextResource.gettext. The information present for 1212 symbolI.....symbolN has precedence over the information for 1213 symbolJ.....symbolN with J > I. */ 1214 char *sum = token.string; 1215 size_t sum_len = strlen (sum); 1216 const char *dottedname; 1217 flag_context_list_ty *context_list; 1218 1219 for (;;) 1220 { 1221 token_ty token2; 1222 1223 x_java_lex (&token2); 1224 if (token2.type == token_type_dot) 1225 { 1226 token_ty token3; 1227 1228 x_java_lex (&token3); 1229 if (token3.type == token_type_symbol) 1230 { 1231 char *addend = token3.string; 1232 size_t addend_len = strlen (addend); 1233 1234 sum = 1235 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1); 1236 sum[sum_len] = '.'; 1237 memcpy (sum + sum_len + 1, addend, addend_len + 1); 1238 sum_len += 1 + addend_len; 1239 1240 free_token (&token3); 1241 free_token (&token2); 1242 continue; 1243 } 1244 x_java_unlex (&token3); 1245 } 1246 x_java_unlex (&token2); 1247 break; 1248 } 1249 1250 for (dottedname = sum;;) 1251 { 1252 void *keyword_value; 1253 1254 if (hash_find_entry (&keywords, dottedname, strlen (dottedname), 1255 &keyword_value) 1256 == 0) 1257 { 1258 next_shapes = (const struct callshapes *) keyword_value; 1259 state = 1; 1260 break; 1261 } 1262 1263 dottedname = strchr (dottedname, '.'); 1264 if (dottedname == NULL) 1265 { 1266 state = 0; 1267 break; 1268 } 1269 dottedname++; 1270 } 1271 1272 for (dottedname = sum;;) 1273 { 1274 context_list = 1275 flag_context_list_table_lookup ( 1276 flag_context_list_table, 1277 dottedname, strlen (dottedname)); 1278 if (context_list != NULL) 1279 break; 1280 1281 dottedname = strchr (dottedname, '.'); 1282 if (dottedname == NULL) 1283 break; 1284 dottedname++; 1285 } 1286 next_context_iter = flag_context_list_iterator (context_list); 1287 1288 free (sum); 1289 continue; 1290 } 1291 1292 case token_type_lparen: 1293 if (extract_parenthesized (mlp, token_type_rparen, 1294 inner_context, next_context_iter, 1295 arglist_parser_alloc (mlp, 1296 state ? next_shapes : NULL))) 1297 { 1298 xgettext_current_source_encoding = po_charset_utf8; 1299 arglist_parser_done (argparser, arg); 1300 xgettext_current_source_encoding = xgettext_global_source_encoding; 1301 return true; 1302 } 1303 next_context_iter = null_context_list_iterator; 1304 state = 0; 1305 continue; 1306 1307 case token_type_rparen: 1308 if (terminator == token_type_rparen) 1309 { 1310 xgettext_current_source_encoding = po_charset_utf8; 1311 arglist_parser_done (argparser, arg); 1312 xgettext_current_source_encoding = xgettext_global_source_encoding; 1313 return false; 1314 } 1315 if (terminator == token_type_rbrace) 1316 { 1317 error_with_progname = false; 1318 error (0, 0, 1319 _("%s:%d: warning: ')' found where '}' was expected"), 1320 logical_file_name, token.line_number); 1321 error_with_progname = true; 1322 } 1323 next_context_iter = null_context_list_iterator; 1324 state = 0; 1325 continue; 1326 1327 case token_type_lbrace: 1328 if (extract_parenthesized (mlp, token_type_rbrace, 1329 null_context, null_context_list_iterator, 1330 arglist_parser_alloc (mlp, NULL))) 1331 { 1332 xgettext_current_source_encoding = po_charset_utf8; 1333 arglist_parser_done (argparser, arg); 1334 xgettext_current_source_encoding = xgettext_global_source_encoding; 1335 return true; 1336 } 1337 next_context_iter = null_context_list_iterator; 1338 state = 0; 1339 continue; 1340 1341 case token_type_rbrace: 1342 if (terminator == token_type_rbrace) 1343 { 1344 xgettext_current_source_encoding = po_charset_utf8; 1345 arglist_parser_done (argparser, arg); 1346 xgettext_current_source_encoding = xgettext_global_source_encoding; 1347 return false; 1348 } 1349 if (terminator == token_type_rparen) 1350 { 1351 error_with_progname = false; 1352 error (0, 0, 1353 _("%s:%d: warning: '}' found where ')' was expected"), 1354 logical_file_name, token.line_number); 1355 error_with_progname = true; 1356 } 1357 next_context_iter = null_context_list_iterator; 1358 state = 0; 1359 continue; 1360 1361 case token_type_comma: 1362 arg++; 1363 inner_context = 1364 inherited_context (outer_context, 1365 flag_context_list_iterator_advance ( 1366 &context_iter)); 1367 next_context_iter = passthrough_context_list_iterator; 1368 state = 0; 1369 continue; 1370 1371 case token_type_string_literal: 1372 { 1373 lex_pos_ty pos; 1374 pos.file_name = logical_file_name; 1375 pos.line_number = token.line_number; 1376 1377 xgettext_current_source_encoding = po_charset_utf8; 1378 if (extract_all) 1379 remember_a_message (mlp, NULL, token.string, inner_context, 1380 &pos, token.comment); 1381 else 1382 arglist_parser_remember (argparser, arg, token.string, 1383 inner_context, 1384 pos.file_name, pos.line_number, 1385 token.comment); 1386 xgettext_current_source_encoding = xgettext_global_source_encoding; 1387 } 1388 drop_reference (token.comment); 1389 next_context_iter = null_context_list_iterator; 1390 state = 0; 1391 continue; 1392 1393 case token_type_eof: 1394 xgettext_current_source_encoding = po_charset_utf8; 1395 arglist_parser_done (argparser, arg); 1396 xgettext_current_source_encoding = xgettext_global_source_encoding; 1397 return true; 1398 1399 case token_type_dot: 1400 case token_type_number: 1401 case token_type_plus: 1402 case token_type_other: 1403 next_context_iter = null_context_list_iterator; 1404 state = 0; 1405 continue; 1406 1407 default: 1408 abort (); 1409 } 1410 } 1411} 1412 1413 1414void 1415extract_java (FILE *f, 1416 const char *real_filename, const char *logical_filename, 1417 flag_context_list_table_ty *flag_table, 1418 msgdomain_list_ty *mdlp) 1419{ 1420 message_list_ty *mlp = mdlp->item[0]->messages; 1421 1422 fp = f; 1423 real_file_name = real_filename; 1424 logical_file_name = xstrdup (logical_filename); 1425 line_number = 1; 1426 1427 last_comment_line = -1; 1428 last_non_comment_line = -1; 1429 1430 phase6_last = token_type_eof; 1431 1432 flag_context_list_table = flag_table; 1433 1434 init_keywords (); 1435 1436 /* Eat tokens until eof is seen. When extract_parenthesized returns 1437 due to an unbalanced closing parenthesis, just restart it. */ 1438 while (!extract_parenthesized (mlp, token_type_eof, 1439 null_context, null_context_list_iterator, 1440 arglist_parser_alloc (mlp, NULL))) 1441 ; 1442 1443 fp = NULL; 1444 real_file_name = NULL; 1445 logical_file_name = NULL; 1446 line_number = 0; 1447} 1448