1/* xgettext Java backend.
2   Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#ifdef HAVE_CONFIG_H
19# include "config.h"
20#endif
21
22/* Specification.  */
23#include "x-java.h"
24
25#include <errno.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "message.h"
32#include "xgettext.h"
33#include "x-java.h"
34#include "error.h"
35#include "xalloc.h"
36#include "hash.h"
37#include "po-charset.h"
38#include "unistr.h"
39#include "gettext.h"
40
/* Shorthand for gettext ().  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized in the expansion so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
44
45
46/* The Java syntax is defined in the
47     Java Language Specification, Second Edition,
48     (available from http://java.sun.com/),
49     chapter 3 "Lexical Structure".  */
50
51
52/* ====================== Keyword set customization.  ====================== */
53
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of the registered keywords, filled by x_java_keyword ().
   It is initialized lazily, when the first keyword is inserted.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be installed
   by init_keywords ().  Cleared by x_java_keyword (NULL) and by
   init_keywords () itself once the defaults are in place.  */
static bool default_keywords = true;
59
60
/* Turn on the extract_all flag for this backend.  */
void
x_java_extract_all ()
{
  extract_all = true;
}
66
67
68void
69x_java_keyword (const char *name)
70{
71  if (name == NULL)
72    default_keywords = false;
73  else
74    {
75      const char *end;
76      struct callshape shape;
77      const char *colon;
78
79      if (keywords.table == NULL)
80	hash_init (&keywords, 100);
81
82      split_keywordspec (name, &end, &shape);
83
84      /* The characters between name and end should form a valid Java
85	 identifier sequence with dots.
86	 A colon means an invalid parse in split_keywordspec().  */
87      colon = strchr (name, ':');
88      if (colon == NULL || colon >= end)
89	insert_keyword_callshape (&keywords, name, end - name, &shape);
90    }
91}
92
93/* Finish initializing the keywords hash table.
94   Called after argument processing, before each file is processed.  */
95static void
96init_keywords ()
97{
98  if (default_keywords)
99    {
100      /* When adding new keywords here, also update the documentation in
101	 xgettext.texi!  */
102      x_java_keyword ("GettextResource.gettext:2");        /* static method */
103      x_java_keyword ("GettextResource.ngettext:2,3");     /* static method */
104      x_java_keyword ("GettextResource.pgettext:2c,3");    /* static method */
105      x_java_keyword ("GettextResource.npgettext:2c,3,4"); /* static method */
106      x_java_keyword ("gettext");
107      x_java_keyword ("ngettext:1,2");
108      x_java_keyword ("pgettext:1c,2");
109      x_java_keyword ("npgettext:1c,2,3");
110      x_java_keyword ("getString");	/* ResourceBundle.getString */
111      default_keywords = false;
112    }
113}
114
/* Record the format string flags of the built-in Java keywords, one
   xgettext_record_flag () call per specification, in table order.  */
void
init_flag_table_java ()
{
  static const char *const flag_specs[] =
    {
      "GettextResource.gettext:2:pass-java-format",
      "GettextResource.ngettext:2:pass-java-format",
      "GettextResource.ngettext:3:pass-java-format",
      "GettextResource.pgettext:3:pass-java-format",
      "GettextResource.npgettext:3:pass-java-format",
      "GettextResource.npgettext:4:pass-java-format",
      "gettext:1:pass-java-format",
      "ngettext:1:pass-java-format",
      "ngettext:2:pass-java-format",
      "pgettext:2:pass-java-format",
      "npgettext:2:pass-java-format",
      "npgettext:3:pass-java-format",
      "getString:1:pass-java-format",
      "MessageFormat:1:java-format",
      "MessageFormat.format:1:java-format"
    };
  size_t idx;

  for (idx = 0; idx < sizeof flag_specs / sizeof flag_specs[0]; idx++)
    xgettext_record_flag (flag_specs[idx]);
}
134
135
136/* ======================== Reading of characters.  ======================== */
137
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages
   and the warnings.  line_number is kept up to date by phase3_getc () and
   phase3_ungetc ().  */
static char *logical_file_name;
static int line_number;

/* The input file stream, read by phase1_getc ().  */
static FILE *fp;
147
148
149/* Fetch the next single-byte character from the input file.
150   Pushback can consist of an unlimited number of 'u' followed by up to 4
151   other characters.  */
152
/* Special coding of multiple 'u's in the pushback buffer.
   A single slot holding MULTIPLE_U (count) stands for COUNT consecutive
   'u' characters.  */
#define MULTIPLE_U(count) (0x1000 + (count))

/* Pushback stack of phase 1 and the number of its slots in use.  */
static int phase1_pushback[5];
static unsigned int phase1_pushback_length;
158
159static int
160phase1_getc ()
161{
162  int c;
163
164  if (phase1_pushback_length)
165    {
166      c = phase1_pushback[--phase1_pushback_length];
167      if (c >= MULTIPLE_U (0))
168	{
169	  if (c > MULTIPLE_U (1))
170	    phase1_pushback[phase1_pushback_length++] = c - 1;
171	  return 'u';
172	}
173      else
174	return c;
175    }
176
177  c = getc (fp);
178
179  if (c == EOF)
180    {
181      if (ferror (fp))
182	error (EXIT_FAILURE, errno, _("\
183error while reading \"%s\""), real_file_name);
184    }
185
186  return c;
187}
188
189/* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
190static void
191phase1_ungetc (int c)
192{
193  if (c != EOF)
194    {
195      if (c == 'u')
196	{
197	  if (phase1_pushback_length > 0
198	      && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
199	    phase1_pushback[phase1_pushback_length - 1]++;
200	  else
201	    {
202	      if (phase1_pushback_length == SIZEOF (phase1_pushback))
203		abort ();
204	      phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
205	    }
206	}
207      else
208	{
209	  if (phase1_pushback_length == SIZEOF (phase1_pushback))
210	    abort ();
211	  phase1_pushback[phase1_pushback_length++] = c;
212	}
213    }
214}
215
216
217/* Fetch the next single-byte character or Unicode character from the file.
218   (Here, as in the Java Language Specification, when we say "Unicode
219   character", we actually mean "UTF-16 encoding unit".)  */
220
/* Return value of phase 2, 3, 4 when EOF is reached.  */
#define P2_EOF 0xffff

/* Convert a UTF-16 code unit to a return value that can be distinguished
   from a single-byte return value.  */
#define UNICODE(code) (0x10000 + (code))

/* Test whether a return value of phase 2, 3, 4 designates a UTF-16 code
   unit.  */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
   so that it can be more easily compared against an ASCII character.
   (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)

/* Pushback stack of phase 2 (a single slot) and its fill count.  */
static int phase2_pushback[1];
static int phase2_pushback_length;
242
/* Return the next byte or Unicode escape.
   Returns P2_EOF at end of input, UNICODE (n) for a backslash followed by
   one or more 'u' and four hexadecimal digits, and the plain byte in all
   other cases.  An incomplete escape is pushed back entirely, so that only
   the backslash itself is consumed and returned.
   NOTE(review): a backslash that directly follows another backslash is
   still examined for a \u escape here; check against the eligibility rule
   for Unicode escapes in the Java Language Specification.  */
static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  if (c == EOF)
    return P2_EOF;
  if (c == '\\')
    {
      c = phase1_getc ();
      if (c == 'u')
	{
	  unsigned int u_count = 1;
	  unsigned char buf[4];
	  unsigned int n;
	  int i;

	  /* Skip and count any further 'u's.  */
	  for (;;)
	    {
	      c = phase1_getc ();
	      if (c != 'u')
		break;
	      u_count++;
	    }
	  phase1_ungetc (c);

	  /* Accumulate the four hexadecimal digits into n, remembering them
	     in buf in case the escape turns out to be incomplete.  */
	  n = 0;
	  for (i = 0; i < 4; i++)
	    {
	      c = phase1_getc ();

	      if (c >= '0' && c <= '9')
		n = (n << 4) + (c - '0');
	      else if (c >= 'A' && c <= 'F')
		n = (n << 4) + (c - 'A' + 10);
	      else if (c >= 'a' && c <= 'f')
		n = (n << 4) + (c - 'a' + 10);
	      else
		{
		  /* Not an escape after all.  Undo everything read after the
		     backslash, in reverse order: the offending character,
		     the digits seen so far, then the 'u's.  */
		  phase1_ungetc (c);
		  while (--i >= 0)
		    phase1_ungetc (buf[i]);
		  for (; u_count > 0; u_count--)
		    phase1_ungetc ('u');
		  return '\\';
		}

	      buf[i] = c;
	    }
	  return UNICODE (n);
	}
      /* A backslash not followed by 'u' stands for itself.  */
      phase1_ungetc (c);
      return '\\';
    }
  return c;
}
303
304/* Supports only one pushback character.  */
305static void
306phase2_ungetc (int c)
307{
308  if (c != P2_EOF)
309    {
310      if (phase2_pushback_length == SIZEOF (phase2_pushback))
311	abort ();
312      phase2_pushback[phase2_pushback_length++] = c;
313    }
314}
315
316
317/* Fetch the next single-byte character or Unicode character from the file.
318   With line number handling.
319   Convert line terminators to '\n' or UNICODE ('\n').  */
320
321static int phase3_pushback[2];
322static int phase3_pushback_length;
323
324static int
325phase3_getc ()
326{
327  int c;
328
329  if (phase3_pushback_length)
330    {
331      c = phase3_pushback[--phase3_pushback_length];
332      if (c == '\n')
333	++line_number;
334      return c;
335    }
336
337  c = phase2_getc ();
338
339  /* Handle line terminators.  */
340  if (RED (c) == '\r')
341    {
342      int c1 = phase2_getc ();
343
344      if (RED (c1) != '\n')
345	phase2_ungetc (c1);
346
347      /* Seen line terminator CR or CR/LF.  */
348      if (c == '\r' || c1 == '\n')
349	{
350	  ++line_number;
351	  return '\n';
352	}
353      else
354	return UNICODE ('\n');
355    }
356  else if (RED (c) == '\n')
357    {
358      /* Seen line terminator LF.  */
359      if (c == '\n')
360	{
361	  ++line_number;
362	  return '\n';
363	}
364      else
365	return UNICODE ('\n');
366    }
367
368  return c;
369}
370
371/* Supports 2 characters of pushback.  */
372static void
373phase3_ungetc (int c)
374{
375  if (c != P2_EOF)
376    {
377      if (c == '\n')
378	--line_number;
379      if (phase3_pushback_length == SIZEOF (phase3_pushback))
380	abort ();
381      phase3_pushback[phase3_pushback_length++] = c;
382    }
383}
384
385
386/* ========================= Accumulating strings.  ======================== */
387
388/* A string buffer type that allows appending bytes (in the
389   xgettext_current_source_encoding) or Unicode characters.
390   Returns the entire string in UTF-8 encoding.  */
391
struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8:
     storage, number of bytes in use, number of bytes allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character, or 0 if none is
     pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding:
     storage, number of bytes in use, number of bytes allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
405
406/* Initialize a 'struct string_buffer' to empty.  */
407static inline void
408init_string_buffer (struct string_buffer *bp)
409{
410  bp->utf8_buffer = NULL;
411  bp->utf8_buflen = 0;
412  bp->utf8_allocated = 0;
413  bp->utf16_surr = 0;
414  bp->curr_buffer = NULL;
415  bp->curr_buflen = 0;
416  bp->curr_allocated = 0;
417}
418
419/* Auxiliary function: Append a byte to bp->curr.  */
420static inline void
421string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
422{
423  if (bp->curr_buflen == bp->curr_allocated)
424    {
425      bp->curr_allocated = 2 * bp->curr_allocated + 10;
426      bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
427    }
428  bp->curr_buffer[bp->curr_buflen++] = c;
429}
430
431/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
432static inline void
433string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
434{
435  if (bp->utf8_buflen + count > bp->utf8_allocated)
436    {
437      size_t new_allocated = 2 * bp->utf8_allocated + 10;
438      if (new_allocated < bp->utf8_buflen + count)
439	new_allocated = bp->utf8_buflen + count;
440      bp->utf8_allocated = new_allocated;
441      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
442    }
443}
444
445/* Auxiliary function: Append a Unicode character to bp->utf8.
446   uc must be < 0x110000.  */
447static inline void
448string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
449{
450  unsigned char utf8buf[6];
451  int count = u8_uctomb (utf8buf, uc, 6);
452
453  if (count < 0)
454    /* The caller should have ensured that uc is not out-of-range.  */
455    abort ();
456
457  string_buffer_append_unicode_grow (bp, count);
458  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
459  bp->utf8_buflen += count;
460}
461
/* Auxiliary function: Handle the attempt to append a lone surrogate to
   bp->utf8.
   Emits a warning that names the logical file, the current line number and
   the surrogate UC, then appends U+FFFD in its place.  */
static void
string_buffer_append_lone_surrogate (struct string_buffer *bp, unsigned int uc)
{
  /* A half surrogate is invalid, therefore use U+FFFD instead.
     It appears to be valid Java: The Java Language Specification,
     3rd ed., says "The Java programming language represents text
     in sequences of 16-bit code units, using the UTF-16 encoding."
     but does not impose constraints on the use of \uxxxx escape
     sequences for surrogates.  And the JDK's javac happily groks
     half surrogates.
     But a half surrogate is invalid in UTF-8:
       - RFC 3629 says
	   "The definition of UTF-8 prohibits encoding character
	    numbers between U+D800 and U+DFFF".
       - Unicode 4.0 chapter 3
	 <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
	 section 3.9, p.77, says
	   "Because surrogate code points are not Unicode scalar
	    values, any UTF-8 byte sequence that would otherwise
	    map to code points D800..DFFF is ill-formed."
	 and in table 3-6, p. 78, does not mention D800..DFFF.
       - The unicode.org FAQ question "How do I convert an unpaired
	 UTF-16 surrogate to UTF-8?" has the answer
	   "By representing such an unpaired surrogate on its own
	    as a 3-byte sequence, the resulting UTF-8 data stream
	    would become ill-formed."
     So use U+FFFD instead.  */
  /* The message already starts with the file name and line number, so the
     program name prefix is switched off while it is printed.  */
  error_with_progname = false;
  error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
	 logical_file_name, line_number, uc);
  error_with_progname = true;
  string_buffer_append_unicode (bp, 0xfffd);
}
497
498/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
499static inline void
500string_buffer_flush_utf16_surr (struct string_buffer *bp)
501{
502  if (bp->utf16_surr != 0)
503    {
504      string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
505      bp->utf16_surr = 0;
506    }
507}
508
509/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
510static inline void
511string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
512{
513  if (bp->curr_buflen > 0)
514    {
515      char *curr;
516      size_t count;
517
518      string_buffer_append_byte (bp, '\0');
519
520      /* Convert from the source encoding to UTF-8.  */
521      curr = from_current_source_encoding (bp->curr_buffer,
522					   logical_file_name, lineno);
523
524      /* Append it to bp->utf8_buffer.  */
525      count = strlen (curr);
526      string_buffer_append_unicode_grow (bp, count);
527      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
528      bp->utf8_buflen += count;
529
530      if (curr != bp->curr_buffer)
531	free (curr);
532      bp->curr_buflen = 0;
533    }
534}
535
536/* Append a character or Unicode character to a 'struct string_buffer'.  */
537static void
538string_buffer_append (struct string_buffer *bp, int c)
539{
540  if (IS_UNICODE (c))
541    {
542      /* Append a Unicode character.  */
543
544      /* Switch from multibyte character mode to Unicode character mode.  */
545      string_buffer_flush_curr_buffer (bp, line_number);
546
547      /* Test whether this character and the previous one form a Unicode
548	 surrogate character pair.  */
549      if (bp->utf16_surr != 0
550	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
551	{
552	  unsigned short utf16buf[2];
553	  unsigned int uc;
554
555	  utf16buf[0] = bp->utf16_surr;
556	  utf16buf[1] = UTF16_VALUE (c);
557	  if (u16_mbtouc (&uc, utf16buf, 2) != 2)
558	    abort ();
559
560	  string_buffer_append_unicode (bp, uc);
561	  bp->utf16_surr = 0;
562	}
563      else
564	{
565	  string_buffer_flush_utf16_surr (bp);
566
567	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
568	    bp->utf16_surr = UTF16_VALUE (c);
569	  else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
570	    string_buffer_append_lone_surrogate (bp, UTF16_VALUE (c));
571	  else
572	    string_buffer_append_unicode (bp, UTF16_VALUE (c));
573	}
574    }
575  else
576    {
577      /* Append a single byte.  */
578
579      /* Switch from Unicode character mode to multibyte character mode.  */
580      string_buffer_flush_utf16_surr (bp);
581
582      /* When a newline is seen, convert the accumulated multibyte sequence.
583	 This ensures a correct line number in the error message in case of
584	 a conversion error.  The "- 1" is to account for the newline.  */
585      if (c == '\n')
586	string_buffer_flush_curr_buffer (bp, line_number - 1);
587
588      string_buffer_append_byte (bp, (unsigned char) c);
589    }
590}
591
/* Return the string buffer's contents.
   Flushes a pending surrogate and the pending source-encoding bytes, then
   NUL-terminates bp->utf8_buffer and returns it.  The terminator is not
   counted in bp->utf8_buflen.  The returned pointer is the buffer's own
   storage, not a copy.  */
static char *
string_buffer_result (struct string_buffer *bp)
{
  /* Flush all into bp->utf8_buffer.  */
  string_buffer_flush_utf16_surr (bp);
  string_buffer_flush_curr_buffer (bp, line_number);
  /* NUL-terminate it.  */
  string_buffer_append_unicode_grow (bp, 1);
  bp->utf8_buffer[bp->utf8_buflen] = '\0';
  /* Return it.  */
  return bp->utf8_buffer;
}
605
/* Free the memory pointed to by a 'struct string_buffer'.
   Both buffers are released; the structure itself is not freed and its
   fields are not reset.  */
static inline void
free_string_buffer (struct string_buffer *bp)
{
  free (bp->utf8_buffer);
  free (bp->curr_buffer);
}
613
614
615/* ======================== Accumulating comments.  ======================== */
616
617
618/* Accumulating a single comment line.  */
619
/* Buffer holding the comment line that is currently being read.  */
static struct string_buffer comment_buffer;

/* Begin a new comment line: mark comment_buffer as empty.
   Only the fill counts and the pending surrogate are reset; the allocated
   storage is kept.  */
static inline void
comment_start ()
{
  comment_buffer.utf8_buflen = 0;
  comment_buffer.utf16_surr = 0;
  comment_buffer.curr_buflen = 0;
}
629
630static inline bool
631comment_at_start ()
632{
633  return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
634	  && comment_buffer.curr_buflen == 0);
635}
636
/* Append the phase 3 character C to the current comment line.  */
static inline void
comment_add (int c)
{
  string_buffer_append (&comment_buffer, c);
}
642
643static inline void
644comment_line_end (size_t chars_to_remove)
645{
646  char *buffer = string_buffer_result (&comment_buffer);
647  size_t buflen = strlen (buffer);
648
649  buflen -= chars_to_remove;
650  while (buflen >= 1
651	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
652    --buflen;
653  buffer[buflen] = '\0';
654  savable_comment_add (buffer);
655}
656
657
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line number recorded by phase4_getc () for the
   most recent comment; last_non_comment_line is the line number of the
   most recent token that phase5_get () read.  */
static int last_comment_line;
static int last_non_comment_line;
662
663
/* Replace each comment that is not inside a character constant or string
   literal with a space or newline character.
   A C style comment yields ' ', a C++ style comment yields '\n'.  Each
   completed comment line is passed on through comment_line_end ().  Any
   other input is returned unchanged.  */

static int
phase4_getc ()
{
  int c0;
  int c;
  bool last_was_star;

  c0 = phase3_getc ();
  if (RED (c0) != '/')
    return c0;
  c = phase3_getc ();
  switch (RED (c))
    {
    default:
      /* Just a slash, not the start of a comment.  */
      phase3_ungetc (c);
      return c0;

    case '*':
      /* C style comment.  */
      comment_start ();
      last_was_star = false;
      for (;;)
	{
	  c = phase3_getc ();
	  /* An unterminated comment ends at EOF; its last, incomplete line
	     is not saved.  */
	  if (c == P2_EOF)
	    break;
	  /* We skip all leading white space, but not EOLs.  */
	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
	    comment_add (c);
	  switch (RED (c))
	    {
	    case '\n':
	      /* Save the line without its newline.  */
	      comment_line_end (1);
	      comment_start ();
	      last_was_star = false;
	      continue;

	    case '*':
	      last_was_star = true;
	      continue;

	    case '/':
	      if (last_was_star)
		{
		  /* End of comment: save the line without the closing
		     star and slash.  This break leaves the inner switch
		     only; the break after it leaves the loop.  */
		  comment_line_end (2);
		  break;
		}
	      /* FALLTHROUGH */

	    default:
	      last_was_star = false;
	      continue;
	    }
	  break;
	}
      last_comment_line = line_number;
      return ' ';

    case '/':
      /* C++ style comment.  */
      last_comment_line = line_number;
      comment_start ();
      for (;;)
	{
	  c = phase3_getc ();
	  if (RED (c) == '\n' || c == P2_EOF)
	    break;
	  /* We skip all leading white space, but not EOLs.  */
	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
	    comment_add (c);
	}
      phase3_ungetc (c); /* push back the newline, to decrement line_number */
      comment_line_end (0);
      phase3_getc (); /* read the newline again */
      return '\n';
    }
}
744
/* Supports only one pushback character.
   The character is handed straight to phase3_ungetc (), so it is not
   examined for comments again when it is re-read.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
751
752
753/* ========================== Reading of tokens.  ========================== */
754
/* The kinds of tokens distinguished by the lexer.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_lbrace,		/* { */
  token_type_rbrace,		/* } */
  token_type_comma,		/* , */
  token_type_dot,		/* . */
  token_type_string_literal,	/* "abc" */
  token_type_number,		/* 1.23 */
  token_type_symbol,		/* identifier, keyword, null */
  token_type_plus,		/* + */
  token_type_other		/* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A token.  The string and comment members are owned by the token, for
   the token types noted below; see free_token ().  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_string_literal, token_type_symbol */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
  int line_number;	/* line_number value when the token was started */
};
780
781
782/* Free the memory pointed to by a 'struct token_ty'.  */
783static inline void
784free_token (token_ty *tp)
785{
786  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
787    free (tp->string);
788  if (tp->type == token_type_string_literal)
789    drop_reference (tp->comment);
790}
791
792
793/* Read an escape sequence inside a string literal or character literal.  */
794static inline int
795do_getc_escaped ()
796{
797  int c;
798
799  /* Use phase 3, because phase 4 elides comments.  */
800  c = phase3_getc ();
801  if (c == P2_EOF)
802    return UNICODE ('\\');
803  switch (RED (c))
804    {
805    case 'b':
806      return UNICODE (0x08);
807    case 't':
808      return UNICODE (0x09);
809    case 'n':
810      return UNICODE (0x0a);
811    case 'f':
812      return UNICODE (0x0c);
813    case 'r':
814      return UNICODE (0x0d);
815    case '"':
816      return UNICODE ('"');
817    case '\'':
818      return UNICODE ('\'');
819    case '\\':
820      return UNICODE ('\\');
821    case '0': case '1': case '2': case '3':
822    case '4': case '5': case '6': case '7':
823      {
824	int n = RED (c) - '0';
825	bool maybe3digits = (n < 4);
826
827	c = phase3_getc ();
828	if (RED (c) >= '0' && RED (c) <= '7')
829	  {
830	    n = (n << 3) + (RED (c) - '0');
831	    if (maybe3digits)
832	      {
833		c = phase3_getc ();
834		if (RED (c) >= '0' && RED (c) <= '7')
835		  n = (n << 3) + (RED (c) - '0');
836		else
837		  phase3_ungetc (c);
838	      }
839	  }
840	else
841	  phase3_ungetc (c);
842
843	return UNICODE (n);
844      }
845    default:
846      /* Invalid escape sequence.  */
847      phase3_ungetc (c);
848      return UNICODE ('\\');
849    }
850}
851
852/* Read a string literal or character literal.  */
853static void
854accumulate_escaped (struct string_buffer *literal, int delimiter)
855{
856  int c;
857
858  for (;;)
859    {
860      /* Use phase 3, because phase 4 elides comments.  */
861      c = phase3_getc ();
862      if (c == P2_EOF || RED (c) == delimiter)
863	break;
864      if (RED (c) == '\n')
865	{
866	  phase3_ungetc (c);
867	  error_with_progname = false;
868	  if (delimiter == '\'')
869	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
870		   logical_file_name, line_number);
871	  else
872	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
873		   logical_file_name, line_number);
874	  error_with_progname = true;
875	  break;
876	}
877      if (RED (c) == '\\')
878	c = do_getc_escaped ();
879      string_buffer_append (literal, c);
880    }
881}
882
883
/* Combine characters into tokens.  Discard whitespace.  */

/* Pushback stack of phase 5 and its fill count.  */
static token_ty phase5_pushback[3];
static int phase5_pushback_length;

/* Read the next token into *TP, from the pushback stack if it is
   non-empty.  tp->string is a freshly allocated copy for string literals
   and symbols and NULL for all other freshly read tokens; tp->comment is
   set for string literals only.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase4_getc ();

      if (c == P2_EOF)
	{
	  tp->type = token_type_eof;
	  return;
	}

      switch (RED (c))
	{
	case '\n':
	  /* Drop the saved comments once a token has been seen on a line
	     after the last comment.  */
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* FALLTHROUGH */
	case ' ':
	case '\t':
	case '\f':
	  /* Ignore whitespace and comments.  */
	  continue;
	}

      last_non_comment_line = tp->line_number;

      switch (RED (c))
	{
	case '(':
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  tp->type = token_type_rparen;
	  return;

	case '{':
	  tp->type = token_type_lbrace;
	  return;

	case '}':
	  tp->type = token_type_rbrace;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '.':
	  /* A dot followed by a digit starts a number; any other dot is a
	     token of its own.  */
	  c = phase4_getc ();
	  if (!(RED (c) >= '0' && RED (c) <= '9'))
	    {
	      phase4_ungetc (c);
	      tp->type = token_type_dot;
	      return;
	    }
	  /* FALLTHROUGH */

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	  {
	    /* Don't need to verify the complicated syntax of integers and
	       floating-point numbers.  We assume a valid Java input.
	       The simplified syntax that we recognize as number is: any
	       sequence of alphanumeric characters, additionally '+' and '-'
	       immediately after 'e' or 'E' except in hexadecimal numbers.  */
	    bool hexadecimal = false;

	    for (;;)
	      {
		c = phase4_getc ();
		if (RED (c) >= '0' && RED (c) <= '9')
		  continue;
		if ((RED (c) >= 'A' && RED (c) <= 'Z')
		    || (RED (c) >= 'a' && RED (c) <= 'z'))
		  {
		    if (RED (c) == 'X' || RED (c) == 'x')
		      hexadecimal = true;
		    if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
		      {
			/* Swallow the sign of an exponent.  */
			c = phase4_getc ();
			if (!(RED (c) == '+' || RED (c) == '-'))
			  phase4_ungetc (c);
		      }
		    continue;
		  }
		if (RED (c) == '.')
		  continue;
		break;
	      }
	    phase4_ungetc (c);
	    tp->type = token_type_number;
	    return;
	  }

	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
	case 'V': case 'W': case 'X': case 'Y': case 'Z':
	case '_':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
	case 'v': case 'w': case 'x': case 'y': case 'z':
	  /* Although Java allows identifiers containing many Unicode
	     characters, we recognize only identifiers consisting of ASCII
	     characters.  This avoids conversion hassles w.r.t. the --keyword
	     arguments, and shouldn't be a big problem in practice.  */
	  {
	    /* Scratch buffer, kept and reused across calls.  */
	    static char *buffer;
	    static int bufmax;
	    int bufpos = 0;
	    for (;;)
	      {
		if (bufpos >= bufmax)
		  {
		    bufmax = 2 * bufmax + 10;
		    buffer = xrealloc (buffer, bufmax);
		  }
		buffer[bufpos++] = RED (c);
		c = phase4_getc ();
		if (!((RED (c) >= 'A' && RED (c) <= 'Z')
		      || (RED (c) >= 'a' && RED (c) <= 'z')
		      || (RED (c) >= '0' && RED (c) <= '9')
		      || RED (c) == '_'))
		  break;
	      }
	    phase4_ungetc (c);
	    /* Make room for the terminating NUL.  */
	    if (bufpos >= bufmax)
	      {
		bufmax = 2 * bufmax + 10;
		buffer = xrealloc (buffer, bufmax);
	      }
	    buffer[bufpos] = '\0';
	    tp->string = xstrdup (buffer);
	    tp->type = token_type_symbol;
	    return;
	  }

	case '"':
	  /* String literal.  */
	  {
	    struct string_buffer literal;

	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '"');
	    tp->string = xstrdup (string_buffer_result (&literal));
	    free_string_buffer (&literal);
	    /* The token takes a reference to the comments saved so far.  */
	    tp->comment = add_reference (savable_comment);
	    tp->type = token_type_string_literal;
	    return;
	  }

	case '\'':
	  /* Character literal.  */
	  {
	    struct string_buffer literal;

	    /* The contents are parsed, then discarded.  */
	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '\'');
	    free_string_buffer (&literal);
	    tp->type = token_type_other;
	    return;
	  }

	case '+':
	  c = phase4_getc ();
	  if (RED (c) == '+')
	    /* Operator ++ */
	    tp->type = token_type_other;
	  else if (RED (c) == '=')
	    /* Operator += */
	    tp->type = token_type_other;
	  else
	    {
	      /* Operator + */
	      phase4_ungetc (c);
	      tp->type = token_type_plus;
	    }
	  return;

	default:
	  /* Misc. operator.  */
	  tp->type = token_type_other;
	  return;
	}
    }
}
1089
1090/* Supports 3 tokens of pushback.  */
1091static void
1092phase5_unget (token_ty *tp)
1093{
1094  if (tp->type != token_type_eof)
1095    {
1096      if (phase5_pushback_length == SIZEOF (phase5_pushback))
1097	abort ();
1098      phase5_pushback[phase5_pushback_length++] = *tp;
1099    }
1100}
1101
1102
/* Compile-time optimization of string literal concatenation.
   Combine "string1" + ... + "stringN" to the concatenated string if
     - the token before this expression is not ')' (because then the first
       string could be part of a cast expression),
     - the token after this expression is not '.' (because then the last
       string could be part of a method call expression).  */

/* Pushback stack of phase 6 and its fill count.  */
static token_ty phase6_pushback[2];
static int phase6_pushback_length;

/* Type of the token most recently read from phase 5 by phase6_get ().
   It is not updated when a token is taken from the phase 6 pushback.  */
static token_type_ty phase6_last;

/* Read the next token into *TP, merging concatenated string literals.
   The merged token keeps the line number and comment of the first
   literal.  */
static void
phase6_get (token_ty *tp)
{
  if (phase6_pushback_length)
    {
      *tp = phase6_pushback[--phase6_pushback_length];
      return;
    }

  phase5_get (tp);
  if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
    {
      char *sum = tp->string;
      size_t sum_len = strlen (sum);

      /* Look ahead for '+' "string" not followed by '.', as many times as
	 possible.  */
      for (;;)
	{
	  token_ty token2;

	  phase5_get (&token2);
	  if (token2.type == token_type_plus)
	    {
	      token_ty token3;

	      phase5_get (&token3);
	      if (token3.type == token_type_string_literal)
		{
		  token_ty token_after;

		  phase5_get (&token_after);
		  if (token_after.type != token_type_dot)
		    {
		      char *addend = token3.string;
		      size_t addend_len = strlen (addend);

		      /* Append the addend, including its NUL.  */
		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
		      memcpy (sum + sum_len, addend, addend_len + 1);
		      sum_len += addend_len;

		      phase5_unget (&token_after);
		      free_token (&token3);
		      free_token (&token2);
		      continue;
		    }
		  phase5_unget (&token_after);
		}
	      phase5_unget (&token3);
	    }
	  /* The lookahead did not match: the tokens are pushed back in
	     reverse order, so that they are read again in their original
	     order.  */
	  phase5_unget (&token2);
	  break;
	}
      tp->string = sum;
    }
  phase6_last = tp->type;
}
1170
1171/* Supports 2 tokens of pushback.  */
1172static void
1173phase6_unget (token_ty *tp)
1174{
1175  if (tp->type != token_type_eof)
1176    {
1177      if (phase6_pushback_length == SIZEOF (phase6_pushback))
1178	abort ();
1179      phase6_pushback[phase6_pushback_length++] = *tp;
1180    }
1181}
1182
1183
1184static void
1185x_java_lex (token_ty *tp)
1186{
1187  phase6_get (tp);
1188}
1189
1190/* Supports 2 tokens of pushback.  */
1191static void
1192x_java_unlex (token_ty *tp)
1193{
1194  phase6_unget (tp);
1195}
1196
1197
1198/* ========================= Extracting strings.  ========================== */
1199
1200
1201/* Context lookup table.  */
1202static flag_context_list_table_ty *flag_context_list_table;
1203
1204
1205/* The file is broken into tokens.  Scan the token stream, looking for
1206   a keyword, followed by a left paren, followed by a string.  When we
1207   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
1209   the grammar to the compiler.
1210
1211     Normal handling: Look for
1212       keyword ( ... msgid ... )
1213     Plural handling: Look for
1214       keyword ( ... msgid ... msgid_plural ... )
1215
1216   We use recursion because the arguments before msgid or between msgid
1217   and msgid_plural can contain subexpressions of the same form.  */
1218
1219
1220/* Extract messages until the next balanced closing parenthesis or brace,
1221   depending on TERMINATOR.
1222   Extracted messages are added to MLP.
1223   Return true upon eof, false upon closing parenthesis or brace.  */
1224static bool
1225extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1226		       flag_context_ty outer_context,
1227		       flag_context_list_iterator_ty context_iter,
1228		       struct arglist_parser *argparser)
1229{
1230  /* Current argument number.  */
1231  int arg = 1;
1232  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1233  int state;
1234  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1235  const struct callshapes *next_shapes = NULL;
1236  /* Context iterator that will be used if the next token is a '('.  */
1237  flag_context_list_iterator_ty next_context_iter =
1238    passthrough_context_list_iterator;
1239  /* Current context.  */
1240  flag_context_ty inner_context =
1241    inherited_context (outer_context,
1242		       flag_context_list_iterator_advance (&context_iter));
1243
1244  /* Start state is 0.  */
1245  state = 0;
1246
1247  for (;;)
1248    {
1249      token_ty token;
1250
1251      x_java_lex (&token);
1252      switch (token.type)
1253	{
1254	case token_type_symbol:
1255	  {
1256	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1257	       we can recognize static function calls like
1258	       GettextResource.gettext.  The information present for
1259	       symbolI.....symbolN has precedence over the information for
1260	       symbolJ.....symbolN with J > I.  */
1261	    char *sum = token.string;
1262	    size_t sum_len = strlen (sum);
1263	    const char *dottedname;
1264	    flag_context_list_ty *context_list;
1265
1266	    for (;;)
1267	      {
1268		token_ty token2;
1269
1270		x_java_lex (&token2);
1271		if (token2.type == token_type_dot)
1272		  {
1273		    token_ty token3;
1274
1275		    x_java_lex (&token3);
1276		    if (token3.type == token_type_symbol)
1277		      {
1278			char *addend = token3.string;
1279			size_t addend_len = strlen (addend);
1280
1281			sum =
1282			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1283			sum[sum_len] = '.';
1284			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1285			sum_len += 1 + addend_len;
1286
1287			free_token (&token3);
1288			free_token (&token2);
1289			continue;
1290		      }
1291		    x_java_unlex (&token3);
1292		  }
1293		x_java_unlex (&token2);
1294		break;
1295	      }
1296
1297	    for (dottedname = sum;;)
1298	      {
1299		void *keyword_value;
1300
1301		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1302				     &keyword_value)
1303		    == 0)
1304		  {
1305		    next_shapes = (const struct callshapes *) keyword_value;
1306		    state = 1;
1307		    break;
1308		  }
1309
1310		dottedname = strchr (dottedname, '.');
1311		if (dottedname == NULL)
1312		  {
1313		    state = 0;
1314		    break;
1315		  }
1316		dottedname++;
1317	      }
1318
1319	    for (dottedname = sum;;)
1320	      {
1321		context_list =
1322		  flag_context_list_table_lookup (
1323		    flag_context_list_table,
1324		    dottedname, strlen (dottedname));
1325		if (context_list != NULL)
1326		  break;
1327
1328		dottedname = strchr (dottedname, '.');
1329		if (dottedname == NULL)
1330		  break;
1331		dottedname++;
1332	      }
1333	    next_context_iter = flag_context_list_iterator (context_list);
1334
1335	    free (sum);
1336	    continue;
1337	  }
1338
1339	case token_type_lparen:
1340	  if (extract_parenthesized (mlp, token_type_rparen,
1341				     inner_context, next_context_iter,
1342				     arglist_parser_alloc (mlp,
1343							   state ? next_shapes : NULL)))
1344	    {
1345	      xgettext_current_source_encoding = po_charset_utf8;
1346	      arglist_parser_done (argparser, arg);
1347	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1348	      return true;
1349	    }
1350	  next_context_iter = null_context_list_iterator;
1351	  state = 0;
1352	  continue;
1353
1354	case token_type_rparen:
1355	  if (terminator == token_type_rparen)
1356	    {
1357	      xgettext_current_source_encoding = po_charset_utf8;
1358	      arglist_parser_done (argparser, arg);
1359	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1360	      return false;
1361	    }
1362	  if (terminator == token_type_rbrace)
1363	    {
1364	      error_with_progname = false;
1365	      error (0, 0,
1366		     _("%s:%d: warning: ')' found where '}' was expected"),
1367		     logical_file_name, token.line_number);
1368	      error_with_progname = true;
1369	    }
1370	  next_context_iter = null_context_list_iterator;
1371	  state = 0;
1372	  continue;
1373
1374	case token_type_lbrace:
1375	  if (extract_parenthesized (mlp, token_type_rbrace,
1376				     null_context, null_context_list_iterator,
1377				     arglist_parser_alloc (mlp, NULL)))
1378	    {
1379	      xgettext_current_source_encoding = po_charset_utf8;
1380	      arglist_parser_done (argparser, arg);
1381	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1382	      return true;
1383	    }
1384	  next_context_iter = null_context_list_iterator;
1385	  state = 0;
1386	  continue;
1387
1388	case token_type_rbrace:
1389	  if (terminator == token_type_rbrace)
1390	    {
1391	      xgettext_current_source_encoding = po_charset_utf8;
1392	      arglist_parser_done (argparser, arg);
1393	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1394	      return false;
1395	    }
1396	  if (terminator == token_type_rparen)
1397	    {
1398	      error_with_progname = false;
1399	      error (0, 0,
1400		     _("%s:%d: warning: '}' found where ')' was expected"),
1401		     logical_file_name, token.line_number);
1402	      error_with_progname = true;
1403	    }
1404	  next_context_iter = null_context_list_iterator;
1405	  state = 0;
1406	  continue;
1407
1408	case token_type_comma:
1409	  arg++;
1410	  inner_context =
1411	    inherited_context (outer_context,
1412			       flag_context_list_iterator_advance (
1413				 &context_iter));
1414	  next_context_iter = passthrough_context_list_iterator;
1415	  state = 0;
1416	  continue;
1417
1418	case token_type_string_literal:
1419	  {
1420	    lex_pos_ty pos;
1421	    pos.file_name = logical_file_name;
1422	    pos.line_number = token.line_number;
1423
1424	    xgettext_current_source_encoding = po_charset_utf8;
1425	    if (extract_all)
1426	      remember_a_message (mlp, NULL, token.string, inner_context,
1427				  &pos, token.comment);
1428	    else
1429	      arglist_parser_remember (argparser, arg, token.string,
1430				       inner_context,
1431				       pos.file_name, pos.line_number,
1432				       token.comment);
1433	    xgettext_current_source_encoding = xgettext_global_source_encoding;
1434	  }
1435	  drop_reference (token.comment);
1436	  next_context_iter = null_context_list_iterator;
1437	  state = 0;
1438	  continue;
1439
1440	case token_type_eof:
1441	  xgettext_current_source_encoding = po_charset_utf8;
1442	  arglist_parser_done (argparser, arg);
1443	  xgettext_current_source_encoding = xgettext_global_source_encoding;
1444	  return true;
1445
1446	case token_type_dot:
1447	case token_type_number:
1448	case token_type_plus:
1449	case token_type_other:
1450	  next_context_iter = null_context_list_iterator;
1451	  state = 0;
1452	  continue;
1453
1454	default:
1455	  abort ();
1456	}
1457    }
1458}
1459
1460
1461void
1462extract_java (FILE *f,
1463	      const char *real_filename, const char *logical_filename,
1464	      flag_context_list_table_ty *flag_table,
1465	      msgdomain_list_ty *mdlp)
1466{
1467  message_list_ty *mlp = mdlp->item[0]->messages;
1468
1469  fp = f;
1470  real_file_name = real_filename;
1471  logical_file_name = xstrdup (logical_filename);
1472  line_number = 1;
1473
1474  last_comment_line = -1;
1475  last_non_comment_line = -1;
1476
1477  phase6_last = token_type_eof;
1478
1479  flag_context_list_table = flag_table;
1480
1481  init_keywords ();
1482
1483  /* Eat tokens until eof is seen.  When extract_parenthesized returns
1484     due to an unbalanced closing parenthesis, just restart it.  */
1485  while (!extract_parenthesized (mlp, token_type_eof,
1486				 null_context, null_context_list_iterator,
1487				 arglist_parser_alloc (mlp, NULL)))
1488    ;
1489
1490  fp = NULL;
1491  real_file_name = NULL;
1492  logical_file_name = NULL;
1493  line_number = 0;
1494}
1495