1/* xgettext Java backend.
2   Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program; if not, write to the Free Software Foundation,
17   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28
29#include "message.h"
30#include "xgettext.h"
31#include "x-java.h"
32#include "error.h"
33#include "xalloc.h"
34#include "exit.h"
35#include "hash.h"
36#include "po-charset.h"
37#include "utf16-ucs4.h"
38#include "ucs4-utf8.h"
39#include "gettext.h"
40
/* Shorthand for translating a message through gettext().  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized before it is subscripted, so
   that an expression argument expands the way the caller wrote it.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
44
45
46/* The Java syntax is defined in the
47     Java Language Specification, Second Edition,
48     (available from http://java.sun.com/),
49     chapter 3 "Lexical Structure".  */
50
51
52/* ====================== Keyword set customization.  ====================== */
53
/* If true extract all strings.  Set by x_java_extract_all().  */
static bool extract_all = false;

/* Table of keyword call shapes.  It is initialized on first use and filled
   by x_java_keyword().  */
static hash_table keywords;
/* True until the built-in keywords have been registered by init_keywords()
   or have been disabled through x_java_keyword (NULL).  */
static bool default_keywords = true;
59
60
/* Turn on the extract_all flag ("extract all strings").  */
void
x_java_extract_all ()
{
  extract_all = true;
}
66
67
68void
69x_java_keyword (const char *name)
70{
71  if (name == NULL)
72    default_keywords = false;
73  else
74    {
75      const char *end;
76      struct callshape shape;
77      const char *colon;
78
79      if (keywords.table == NULL)
80	hash_init (&keywords, 100);
81
82      split_keywordspec (name, &end, &shape);
83
84      /* The characters between name and end should form a valid Java
85	 identifier sequence with dots.
86	 A colon means an invalid parse in split_keywordspec().  */
87      colon = strchr (name, ':');
88      if (colon == NULL || colon >= end)
89	insert_keyword_callshape (&keywords, name, end - name, &shape);
90    }
91}
92
93/* Finish initializing the keywords hash table.
94   Called after argument processing, before each file is processed.  */
95static void
96init_keywords ()
97{
98  if (default_keywords)
99    {
100      /* When adding new keywords here, also update the documentation in
101	 xgettext.texi!  */
102      x_java_keyword ("GettextResource.gettext:2");	/* static method */
103      x_java_keyword ("GettextResource.ngettext:2,3");	/* static method */
104      x_java_keyword ("gettext");
105      x_java_keyword ("ngettext:1,2");
106      x_java_keyword ("getString");	/* ResourceBundle.getString */
107      default_keywords = false;
108    }
109}
110
/* Record the built-in format-string flags for the Java keywords, one
   xgettext_record_flag() call per "name:argnum:flag" specification, in the
   order listed.  */
void
init_flag_table_java ()
{
  static const char *const flag_specs[] =
    {
      "GettextResource.gettext:2:pass-java-format",
      "GettextResource.ngettext:2:pass-java-format",
      "GettextResource.ngettext:3:pass-java-format",
      "gettext:1:pass-java-format",
      "ngettext:1:pass-java-format",
      "ngettext:2:pass-java-format",
      "getString:1:pass-java-format",
      "MessageFormat:1:java-format",
      "MessageFormat.format:1:java-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
124
125
126/* ======================== Reading of characters.  ======================== */
127
/* Real filename, used in error messages about the input file
   (see the read-error message in phase1_getc()).  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is incremented by phase3_getc() for each literal newline it
   returns and decremented by phase3_ungetc() when one is pushed back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream; phase1_getc() reads from it.  */
static FILE *fp;
137
138
139/* Fetch the next single-byte character from the input file.
140   Pushback can consist of an unlimited number of 'u' followed by up to 4
141   other characters.  */
142
/* Pushback stack of phase 1.  A run of consecutive 'u' characters occupies
   a single slot, encoded through MULTIPLE_U.  */
static unsigned int phase1_pushback_length;
static int phase1_pushback[5];

/* Slot value that stands for COUNT consecutive 'u' characters.  */
#define MULTIPLE_U(count) ((count) + 0x1000)
148
149static int
150phase1_getc ()
151{
152  int c;
153
154  if (phase1_pushback_length)
155    {
156      c = phase1_pushback[--phase1_pushback_length];
157      if (c >= MULTIPLE_U (0))
158	{
159	  if (c > MULTIPLE_U (1))
160	    phase1_pushback[phase1_pushback_length++] = c - 1;
161	  return 'u';
162	}
163      else
164	return c;
165    }
166
167  c = getc (fp);
168
169  if (c == EOF)
170    {
171      if (ferror (fp))
172	error (EXIT_FAILURE, errno, _("\
173error while reading \"%s\""), real_file_name);
174    }
175
176  return c;
177}
178
179/* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
180static void
181phase1_ungetc (int c)
182{
183  if (c != EOF)
184    {
185      if (c == 'u')
186	{
187	  if (phase1_pushback_length > 0
188	      && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
189	    phase1_pushback[phase1_pushback_length - 1]++;
190	  else
191	    {
192	      if (phase1_pushback_length == SIZEOF (phase1_pushback))
193		abort ();
194	      phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
195	    }
196	}
197      else
198	{
199	  if (phase1_pushback_length == SIZEOF (phase1_pushback))
200	    abort ();
201	  phase1_pushback[phase1_pushback_length++] = c;
202	}
203    }
204}
205
206
207/* Fetch the next single-byte character or Unicode character from the file.
208   (Here, as in the Java Language Specification, when we say "Unicode
209   character", we actually mean "UTF-16 encoding unit".)  */
210
/* Values returned by phase 2, 3, 4: a single-byte character, an UTF-16
   code unit tagged through UNICODE, or P2_EOF.  */

/* Return value of phase 2, 3, 4 when EOF is reached.  */
#define P2_EOF 0xffff

/* Tag an UTF-16 code unit so that it can be told apart from a single-byte
   return value.  */
#define UNICODE(code) ((code) + 0x10000)

/* True if a return value of phase 2, 3, 4 is a tagged UTF-16 code unit.  */
#define IS_UNICODE(p2_result) ((p2_result) > 0xffff)

/* The UTF-16 code unit of a return value that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - UNICODE (0))

/* Strip the UNICODE tag from a return value of phase 2, 3, 4, so that it
   can be compared against an ASCII character:
   (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)

/* Pushback storage of phase 2: a single value.  */
static int phase2_pushback_length;
static int phase2_pushback[1];
232
233static int
234phase2_getc ()
235{
236  int c;
237
238  if (phase2_pushback_length)
239    return phase2_pushback[--phase2_pushback_length];
240
241  c = phase1_getc ();
242  if (c == EOF)
243    return P2_EOF;
244  if (c == '\\')
245    {
246      c = phase1_getc ();
247      if (c == 'u')
248	{
249	  unsigned int u_count = 1;
250	  unsigned char buf[4];
251	  unsigned int n;
252	  int i;
253
254	  for (;;)
255	    {
256	      c = phase1_getc ();
257	      if (c != 'u')
258		break;
259	      u_count++;
260	    }
261	  phase1_ungetc (c);
262
263	  n = 0;
264	  for (i = 0; i < 4; i++)
265	    {
266	      c = phase1_getc ();
267
268	      if (c >= '0' && c <= '9')
269		n = (n << 4) + (c - '0');
270	      else if (c >= 'A' && c <= 'F')
271		n = (n << 4) + (c - 'A' + 10);
272	      else if (c >= 'a' && c <= 'f')
273		n = (n << 4) + (c - 'a' + 10);
274	      else
275		{
276		  phase1_ungetc (c);
277		  while (--i >= 0)
278		    phase1_ungetc (buf[i]);
279		  for (; u_count > 0; u_count--)
280		    phase1_ungetc ('u');
281		  return '\\';
282		}
283
284	      buf[i] = c;
285	    }
286	  return UNICODE (n);
287	}
288      phase1_ungetc (c);
289      return '\\';
290    }
291  return c;
292}
293
294/* Supports only one pushback character.  */
295static void
296phase2_ungetc (int c)
297{
298  if (c != P2_EOF)
299    {
300      if (phase2_pushback_length == SIZEOF (phase2_pushback))
301	abort ();
302      phase2_pushback[phase2_pushback_length++] = c;
303    }
304}
305
306
307/* Fetch the next single-byte character or Unicode character from the file.
308   With line number handling.
309   Convert line terminators to '\n' or UNICODE ('\n').  */
310
/* Pushback stack of phase 3 (two slots) and the number of slots in use.
   Filled by phase3_ungetc(), drained by phase3_getc().  */
static int phase3_pushback[2];
static int phase3_pushback_length;
313
314static int
315phase3_getc ()
316{
317  int c;
318
319  if (phase3_pushback_length)
320    {
321      c = phase3_pushback[--phase3_pushback_length];
322      if (c == '\n')
323	++line_number;
324      return c;
325    }
326
327  c = phase2_getc ();
328
329  /* Handle line terminators.  */
330  if (RED (c) == '\r')
331    {
332      int c1 = phase2_getc ();
333
334      if (RED (c1) != '\n')
335	phase2_ungetc (c1);
336
337      /* Seen line terminator CR or CR/LF.  */
338      if (c == '\r' || c1 == '\n')
339	{
340	  ++line_number;
341	  return '\n';
342	}
343      else
344	return UNICODE ('\n');
345    }
346  else if (RED (c) == '\n')
347    {
348      /* Seen line terminator LF.  */
349      if (c == '\n')
350	{
351	  ++line_number;
352	  return '\n';
353	}
354      else
355	return UNICODE ('\n');
356    }
357
358  return c;
359}
360
361/* Supports 2 characters of pushback.  */
362static void
363phase3_ungetc (int c)
364{
365  if (c != P2_EOF)
366    {
367      if (c == '\n')
368	--line_number;
369      if (phase3_pushback_length == SIZEOF (phase3_pushback))
370	abort ();
371      phase3_pushback[phase3_pushback_length++] = c;
372    }
373}
374
375
376/* ========================= Accumulating strings.  ======================== */
377
/* A string buffer type that allows appending bytes (in the
   xgettext_current_source_encoding) or Unicode characters.
   Returns the entire string in UTF-8 encoding.

   Bytes are collected in curr_buffer and converted in one go when they
   are flushed into utf8_buffer; Unicode characters are appended to
   utf8_buffer directly.  */

struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.
     utf8_buflen bytes are in use, utf8_allocated bytes are allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character, or 0 if none is
     pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.
     curr_buflen bytes are in use, curr_allocated bytes are allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
395
396/* Initialize a 'struct string_buffer' to empty.  */
397static inline void
398init_string_buffer (struct string_buffer *bp)
399{
400  bp->utf8_buffer = NULL;
401  bp->utf8_buflen = 0;
402  bp->utf8_allocated = 0;
403  bp->utf16_surr = 0;
404  bp->curr_buffer = NULL;
405  bp->curr_buflen = 0;
406  bp->curr_allocated = 0;
407}
408
409/* Auxiliary function: Append a byte to bp->curr.  */
410static inline void
411string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
412{
413  if (bp->curr_buflen == bp->curr_allocated)
414    {
415      bp->curr_allocated = 2 * bp->curr_allocated + 10;
416      bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
417    }
418  bp->curr_buffer[bp->curr_buflen++] = c;
419}
420
421/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
422static inline void
423string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
424{
425  if (bp->utf8_buflen + count > bp->utf8_allocated)
426    {
427      size_t new_allocated = 2 * bp->utf8_allocated + 10;
428      if (new_allocated < bp->utf8_buflen + count)
429	new_allocated = bp->utf8_buflen + count;
430      bp->utf8_allocated = new_allocated;
431      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
432    }
433}
434
435/* Auxiliary function: Append a Unicode character to bp->utf8.
436   uc must be < 0x110000.  */
437static inline void
438string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
439{
440  unsigned char utf8buf[6];
441  int count = u8_uctomb (utf8buf, uc, 6);
442
443  if (count < 0)
444    /* The caller should have ensured that uc is not out-of-range.  */
445    abort ();
446
447  string_buffer_append_unicode_grow (bp, count);
448  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
449  bp->utf8_buflen += count;
450}
451
452/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
453static inline void
454string_buffer_flush_utf16_surr (struct string_buffer *bp)
455{
456  if (bp->utf16_surr != 0)
457    {
458      /* A half surrogate is invalid, therefore use U+FFFD instead.  */
459      string_buffer_append_unicode (bp, 0xfffd);
460      bp->utf16_surr = 0;
461    }
462}
463
464/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
465static inline void
466string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
467{
468  if (bp->curr_buflen > 0)
469    {
470      char *curr;
471      size_t count;
472
473      string_buffer_append_byte (bp, '\0');
474
475      /* Convert from the source encoding to UTF-8.  */
476      curr = from_current_source_encoding (bp->curr_buffer,
477					   logical_file_name, lineno);
478
479      /* Append it to bp->utf8_buffer.  */
480      count = strlen (curr);
481      string_buffer_append_unicode_grow (bp, count);
482      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
483      bp->utf8_buflen += count;
484
485      if (curr != bp->curr_buffer)
486	free (curr);
487      bp->curr_buflen = 0;
488    }
489}
490
491/* Append a character or Unicode character to a 'struct string_buffer'.  */
492static void
493string_buffer_append (struct string_buffer *bp, int c)
494{
495  if (IS_UNICODE (c))
496    {
497      /* Append a Unicode character.  */
498
499      /* Switch from multibyte character mode to Unicode character mode.  */
500      string_buffer_flush_curr_buffer (bp, line_number);
501
502      /* Test whether this character and the previous one form a Unicode
503	 surrogate character pair.  */
504      if (bp->utf16_surr != 0
505	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
506	{
507	  unsigned short utf16buf[2];
508	  unsigned int uc;
509
510	  utf16buf[0] = bp->utf16_surr;
511	  utf16buf[1] = UTF16_VALUE (c);
512	  if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
513	    abort ();
514
515	  string_buffer_append_unicode (bp, uc);
516	  bp->utf16_surr = 0;
517	}
518      else
519	{
520	  string_buffer_flush_utf16_surr (bp);
521
522	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
523	    bp->utf16_surr = UTF16_VALUE (c);
524	  else
525	    string_buffer_append_unicode (bp, UTF16_VALUE (c));
526	}
527    }
528  else
529    {
530      /* Append a single byte.  */
531
532      /* Switch from Unicode character mode to multibyte character mode.  */
533      string_buffer_flush_utf16_surr (bp);
534
535      /* When a newline is seen, convert the accumulated multibyte sequence.
536	 This ensures a correct line number in the error message in case of
537	 a conversion error.  The "- 1" is to account for the newline.  */
538      if (c == '\n')
539	string_buffer_flush_curr_buffer (bp, line_number - 1);
540
541      string_buffer_append_byte (bp, (unsigned char) c);
542    }
543}
544
545/* Return the string buffer's contents.  */
546static char *
547string_buffer_result (struct string_buffer *bp)
548{
549  /* Flush all into bp->utf8_buffer.  */
550  string_buffer_flush_utf16_surr (bp);
551  string_buffer_flush_curr_buffer (bp, line_number);
552  /* NUL-terminate it.  */
553  string_buffer_append_unicode_grow (bp, 1);
554  bp->utf8_buffer[bp->utf8_buflen] = '\0';
555  /* Return it.  */
556  return bp->utf8_buffer;
557}
558
/* Free the memory pointed to by a 'struct string_buffer': both the UTF-8
   part and the not yet converted part.  The struct itself is not freed
   and its fields are not reset.  */
static inline void
free_string_buffer (struct string_buffer *bp)
{
  free (bp->utf8_buffer);
  free (bp->curr_buffer);
}
566
567
568/* ======================== Accumulating comments.  ======================== */
569
570
571/* Accumulating a single comment line.  */
572
573static struct string_buffer comment_buffer;
574
575static inline void
576comment_start ()
577{
578  comment_buffer.utf8_buflen = 0;
579  comment_buffer.utf16_surr = 0;
580  comment_buffer.curr_buflen = 0;
581}
582
583static inline bool
584comment_at_start ()
585{
586  return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
587	  && comment_buffer.curr_buflen == 0);
588}
589
/* Append the character C (a single byte or a UNICODE value, as accepted
   by string_buffer_append ()) to the current comment line.  */
static inline void
comment_add (int c)
{
  string_buffer_append (&comment_buffer, c);
}
595
596static inline void
597comment_line_end (size_t chars_to_remove)
598{
599  char *buffer = string_buffer_result (&comment_buffer);
600  size_t buflen = strlen (buffer);
601
602  buflen -= chars_to_remove;
603  while (buflen >= 1
604	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
605    --buflen;
606  buffer[buflen] = '\0';
607  savable_comment_add (buffer);
608}
609
610
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is set by phase4_getc () to the line of the most
   recent comment, last_non_comment_line by phase5_get () to the line of
   the most recent token.  */
static int last_comment_line;
static int last_non_comment_line;
615
616
617/* Replace each comment that is not inside a character constant or string
618   literal with a space or newline character.  */
619
620static int
621phase4_getc ()
622{
623  int c0;
624  int c;
625  bool last_was_star;
626
627  c0 = phase3_getc ();
628  if (RED (c0) != '/')
629    return c0;
630  c = phase3_getc ();
631  switch (RED (c))
632    {
633    default:
634      phase3_ungetc (c);
635      return c0;
636
637    case '*':
638      /* C style comment.  */
639      comment_start ();
640      last_was_star = false;
641      for (;;)
642	{
643	  c = phase3_getc ();
644	  if (c == P2_EOF)
645	    break;
646	  /* We skip all leading white space, but not EOLs.  */
647	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
648	    comment_add (c);
649	  switch (RED (c))
650	    {
651	    case '\n':
652	      comment_line_end (1);
653	      comment_start ();
654	      last_was_star = false;
655	      continue;
656
657	    case '*':
658	      last_was_star = true;
659	      continue;
660
661	    case '/':
662	      if (last_was_star)
663		{
664		  comment_line_end (2);
665		  break;
666		}
667	      /* FALLTHROUGH */
668
669	    default:
670	      last_was_star = false;
671	      continue;
672	    }
673	  break;
674	}
675      last_comment_line = line_number;
676      return ' ';
677
678    case '/':
679      /* C++ style comment.  */
680      last_comment_line = line_number;
681      comment_start ();
682      for (;;)
683	{
684	  c = phase3_getc ();
685	  if (RED (c) == '\n' || c == P2_EOF)
686	    break;
687	  /* We skip all leading white space, but not EOLs.  */
688	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
689	    comment_add (c);
690	}
691      phase3_ungetc (c); /* push back the newline, to decrement line_number */
692      comment_line_end (0);
693      phase3_getc (); /* read the newline again */
694      return '\n';
695    }
696}
697
/* Push C back onto phase 4, by handing it to phase3_ungetc ().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
704
705
706/* ========================== Reading of tokens.  ========================== */
707
/* The kinds of token produced by phase 5.  */
enum token_type_ty
{
  token_type_eof,		/* end of input */
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_lbrace,		/* { */
  token_type_rbrace,		/* } */
  token_type_comma,		/* , */
  token_type_dot,		/* . */
  token_type_string_literal,	/* "abc" */
  token_type_number,		/* 1.23 */
  token_type_symbol,		/* identifier, keyword, null */
  token_type_plus,		/* + */
  token_type_other		/* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;
724
/* A token, as produced by phase 5 and phase 6.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  /* Allocated text; released by free_token ().  */
  char *string;		/* for token_type_string_literal, token_type_symbol */
  /* Reference to the comments seen before the string literal; released
     by free_token ().  */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
  /* Value of line_number when phase 5 started reading the token.  */
  int line_number;
};
733
734
735/* Free the memory pointed to by a 'struct token_ty'.  */
736static inline void
737free_token (token_ty *tp)
738{
739  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
740    free (tp->string);
741  if (tp->type == token_type_string_literal)
742    drop_reference (tp->comment);
743}
744
745
746/* Read an escape sequence inside a string literal or character literal.  */
747static inline int
748do_getc_escaped ()
749{
750  int c;
751
752  /* Use phase 3, because phase 4 elides comments.  */
753  c = phase3_getc ();
754  if (c == P2_EOF)
755    return UNICODE ('\\');
756  switch (RED (c))
757    {
758    case 'b':
759      return UNICODE (0x08);
760    case 't':
761      return UNICODE (0x09);
762    case 'n':
763      return UNICODE (0x0a);
764    case 'f':
765      return UNICODE (0x0c);
766    case 'r':
767      return UNICODE (0x0d);
768    case '"':
769      return UNICODE ('"');
770    case '\'':
771      return UNICODE ('\'');
772    case '\\':
773      return UNICODE ('\\');
774    case '0': case '1': case '2': case '3':
775    case '4': case '5': case '6': case '7':
776      {
777	int n = RED (c) - '0';
778	bool maybe3digits = (n < 4);
779
780	c = phase3_getc ();
781	if (RED (c) >= '0' && RED (c) <= '7')
782	  {
783	    n = (n << 3) + (RED (c) - '0');
784	    if (maybe3digits)
785	      {
786		c = phase3_getc ();
787		if (RED (c) >= '0' && RED (c) <= '7')
788		  n = (n << 3) + (RED (c) - '0');
789		else
790		  phase3_ungetc (c);
791	      }
792	  }
793	else
794	  phase3_ungetc (c);
795
796	return UNICODE (n);
797      }
798    default:
799      /* Invalid escape sequence.  */
800      phase3_ungetc (c);
801      return UNICODE ('\\');
802    }
803}
804
805/* Read a string literal or character literal.  */
806static void
807accumulate_escaped (struct string_buffer *literal, int delimiter)
808{
809  int c;
810
811  for (;;)
812    {
813      /* Use phase 3, because phase 4 elides comments.  */
814      c = phase3_getc ();
815      if (c == P2_EOF || RED (c) == delimiter)
816	break;
817      if (RED (c) == '\n')
818	{
819	  phase3_ungetc (c);
820	  error_with_progname = false;
821	  if (delimiter == '\'')
822	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
823		   logical_file_name, line_number);
824	  else
825	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
826		   logical_file_name, line_number);
827	  error_with_progname = true;
828	  break;
829	}
830      if (RED (c) == '\\')
831	c = do_getc_escaped ();
832      string_buffer_append (literal, c);
833    }
834}
835
836
837/* Combine characters into tokens.  Discard whitespace.  */
838
/* Pushback stack of phase 5 (three tokens) and the number of slots in use.
   Filled by phase5_unget (), drained by phase5_get ().  */
static token_ty phase5_pushback[3];
static int phase5_pushback_length;
841
/* Store the next token in *TP.
   A pushed-back token is returned first.  Otherwise characters are read
   through phase 4, whitespace and (already replaced) comments are skipped,
   and one token is assembled.  tp->string is allocated for symbols and
   string literals; tp->comment is set for string literals only.
   tp->line_number is the value of line_number before the token's first
   character was read.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase4_getc ();

      if (c == P2_EOF)
        {
          tp->type = token_type_eof;
          return;
        }

      switch (RED (c))
        {
        case '\n':
          /* At a line end, forget the saved comments if a token has been
             seen on a later line than the last comment.  */
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* FALLTHROUGH */
        case ' ':
        case '\t':
        case '\f':
          /* Ignore whitespace and comments.  */
          continue;
        }

      /* C starts a real token; remember the line it is on.  */
      last_non_comment_line = tp->line_number;

      switch (RED (c))
        {
        case '(':
          tp->type = token_type_lparen;
          return;

        case ')':
          tp->type = token_type_rparen;
          return;

        case '{':
          tp->type = token_type_lbrace;
          return;

        case '}':
          tp->type = token_type_rbrace;
          return;

        case ',':
          tp->type = token_type_comma;
          return;

        case '.':
          /* A dot followed by a digit starts a number, otherwise it is
             the dot token.  */
          c = phase4_getc ();
          if (!(RED (c) >= '0' && RED (c) <= '9'))
            {
              phase4_ungetc (c);
              tp->type = token_type_dot;
              return;
            }
          /* FALLTHROUGH */

        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          {
            /* Don't need to verify the complicated syntax of integers and
               floating-point numbers.  We assume a valid Java input.
               The simplified syntax that we recognize as number is: any
               sequence of alphanumeric characters, additionally '+' and '-'
               immediately after 'e' or 'E' except in hexadecimal numbers.  */
            bool hexadecimal = false;

            for (;;)
              {
                c = phase4_getc ();
                if (RED (c) >= '0' && RED (c) <= '9')
                  continue;
                if ((RED (c) >= 'A' && RED (c) <= 'Z')
                    || (RED (c) >= 'a' && RED (c) <= 'z'))
                  {
                    if (RED (c) == 'X' || RED (c) == 'x')
                      hexadecimal = true;
                    if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
                      {
                        /* Consume an exponent sign, if there is one.  */
                        c = phase4_getc ();
                        if (!(RED (c) == '+' || RED (c) == '-'))
                          phase4_ungetc (c);
                      }
                    continue;
                  }
                if (RED (c) == '.')
                  continue;
                break;
              }
            /* The character that ended the number belongs to the next
               token.  */
            phase4_ungetc (c);
            tp->type = token_type_number;
            return;
          }

        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
        case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
        case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
        case 'V': case 'W': case 'X': case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
        case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
        case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
        case 'v': case 'w': case 'x': case 'y': case 'z':
          /* Although Java allows identifiers containing many Unicode
             characters, we recognize only identifiers consisting of ASCII
             characters.  This avoids conversion hassles w.r.t. the --keyword
             arguments, and shouldn't be a big problem in practice.  */
          {
            /* Scratch buffer, kept across calls and grown on demand.  */
            static char *buffer;
            static int bufmax;
            int bufpos = 0;
            for (;;)
              {
                if (bufpos >= bufmax)
                  {
                    bufmax = 2 * bufmax + 10;
                    buffer = xrealloc (buffer, bufmax);
                  }
                buffer[bufpos++] = RED (c);
                c = phase4_getc ();
                if (!((RED (c) >= 'A' && RED (c) <= 'Z')
                      || (RED (c) >= 'a' && RED (c) <= 'z')
                      || (RED (c) >= '0' && RED (c) <= '9')
                      || RED (c) == '_'))
                  break;
              }
            phase4_ungetc (c);
            /* Make room for the terminating NUL.  */
            if (bufpos >= bufmax)
              {
                bufmax = 2 * bufmax + 10;
                buffer = xrealloc (buffer, bufmax);
              }
            buffer[bufpos] = '\0';
            /* The token gets its own copy of the identifier.  */
            tp->string = xstrdup (buffer);
            tp->type = token_type_symbol;
            return;
          }

        case '"':
          /* String literal.  */
          {
            struct string_buffer literal;

            init_string_buffer (&literal);
            accumulate_escaped (&literal, '"');
            tp->string = xstrdup (string_buffer_result (&literal));
            free_string_buffer (&literal);
            /* Attach the comments saved so far to the string.  */
            tp->comment = add_reference (savable_comment);
            tp->type = token_type_string_literal;
            return;
          }

        case '\'':
          /* Character literal.  Its content is read and discarded.  */
          {
            struct string_buffer literal;

            init_string_buffer (&literal);
            accumulate_escaped (&literal, '\'');
            free_string_buffer (&literal);
            tp->type = token_type_other;
            return;
          }

        case '+':
          c = phase4_getc ();
          if (RED (c) == '+')
            /* Operator ++ */
            tp->type = token_type_other;
          else if (RED (c) == '=')
            /* Operator += */
            tp->type = token_type_other;
          else
            {
              /* Operator + */
              phase4_ungetc (c);
              tp->type = token_type_plus;
            }
          return;

        default:
          /* Misc. operator.  */
          tp->type = token_type_other;
          return;
        }
    }
}
1042
1043/* Supports 3 tokens of pushback.  */
1044static void
1045phase5_unget (token_ty *tp)
1046{
1047  if (tp->type != token_type_eof)
1048    {
1049      if (phase5_pushback_length == SIZEOF (phase5_pushback))
1050	abort ();
1051      phase5_pushback[phase5_pushback_length++] = *tp;
1052    }
1053}
1054
1055
1056/* Compile-time optimization of string literal concatenation.
1057   Combine "string1" + ... + "stringN" to the concatenated string if
1058     - the token before this expression is not ')' (because then the first
1059       string could be part of a cast expression),
1060     - the token after this expression is not '.' (because then the last
1061       string could be part of a method call expression).  */
1062
/* Pushback stack of phase 6 (two tokens) and the number of slots in use.
   Filled by phase6_unget (), drained by phase6_get ().  */
static token_ty phase6_pushback[2];
static int phase6_pushback_length;

/* Type of the token most recently produced by phase6_get () from phase 5.
   It is not updated when a pushed-back token is returned.  */
static token_type_ty phase6_last;
1067
1068static void
1069phase6_get (token_ty *tp)
1070{
1071  if (phase6_pushback_length)
1072    {
1073      *tp = phase6_pushback[--phase6_pushback_length];
1074      return;
1075    }
1076
1077  phase5_get (tp);
1078  if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1079    {
1080      char *sum = tp->string;
1081      size_t sum_len = strlen (sum);
1082
1083      for (;;)
1084	{
1085	  token_ty token2;
1086
1087	  phase5_get (&token2);
1088	  if (token2.type == token_type_plus)
1089	    {
1090	      token_ty token3;
1091
1092	      phase5_get (&token3);
1093	      if (token3.type == token_type_string_literal)
1094		{
1095		  token_ty token_after;
1096
1097		  phase5_get (&token_after);
1098		  if (token_after.type != token_type_dot)
1099		    {
1100		      char *addend = token3.string;
1101		      size_t addend_len = strlen (addend);
1102
1103		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1104		      memcpy (sum + sum_len, addend, addend_len + 1);
1105		      sum_len += addend_len;
1106
1107		      phase5_unget (&token_after);
1108		      free_token (&token3);
1109		      free_token (&token2);
1110		      continue;
1111		    }
1112		  phase5_unget (&token_after);
1113		}
1114	      phase5_unget (&token3);
1115	    }
1116	  phase5_unget (&token2);
1117	  break;
1118	}
1119      tp->string = sum;
1120    }
1121  phase6_last = tp->type;
1122}
1123
1124/* Supports 2 tokens of pushback.  */
1125static void
1126phase6_unget (token_ty *tp)
1127{
1128  if (tp->type != token_type_eof)
1129    {
1130      if (phase6_pushback_length == SIZEOF (phase6_pushback))
1131	abort ();
1132      phase6_pushback[phase6_pushback_length++] = *tp;
1133    }
1134}
1135
1136
/* Store the next token of the Java lexer in *TP.  This is the output of
   the last phase, phase6_get ().  */
static void
x_java_lex (token_ty *tp)
{
  phase6_get (tp);
}
1142
/* Push the token *TP back, so that the next x_java_lex () returns it.
   Delegates to phase6_unget ().  Supports 2 tokens of pushback.  */
static void
x_java_unlex (token_ty *tp)
{
  phase6_unget (tp);
}
1149
1150
1151/* ========================= Extracting strings.  ========================== */
1152
1153
/* Context lookup table.
   NOTE(review): presumably consulted for each symbol during extraction and
   set up before a file is processed; the code that does so is not in this
   part of the file -- confirm.  */
static flag_context_list_table_ty *flag_context_list_table;
1156
1157
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.
1163
1164     Normal handling: Look for
1165       keyword ( ... msgid ... )
1166     Plural handling: Look for
1167       keyword ( ... msgid ... msgid_plural ... )
1168
1169   We use recursion because the arguments before msgid or between msgid
1170   and msgid_plural can contain subexpressions of the same form.  */
1171
1172
1173/* Extract messages until the next balanced closing parenthesis or brace,
1174   depending on TERMINATOR.
1175   Extracted messages are added to MLP.
1176   Return true upon eof, false upon closing parenthesis or brace.  */
1177static bool
1178extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1179		       flag_context_ty outer_context,
1180		       flag_context_list_iterator_ty context_iter,
1181		       struct arglist_parser *argparser)
1182{
1183  /* Current argument number.  */
1184  int arg = 1;
1185  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1186  int state;
1187  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1188  const struct callshapes *next_shapes = NULL;
1189  /* Context iterator that will be used if the next token is a '('.  */
1190  flag_context_list_iterator_ty next_context_iter =
1191    passthrough_context_list_iterator;
1192  /* Current context.  */
1193  flag_context_ty inner_context =
1194    inherited_context (outer_context,
1195		       flag_context_list_iterator_advance (&context_iter));
1196
1197  /* Start state is 0.  */
1198  state = 0;
1199
1200  for (;;)
1201    {
1202      token_ty token;
1203
1204      x_java_lex (&token);
1205      switch (token.type)
1206	{
1207	case token_type_symbol:
1208	  {
1209	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1210	       we can recognize static function calls like
1211	       GettextResource.gettext.  The information present for
1212	       symbolI.....symbolN has precedence over the information for
1213	       symbolJ.....symbolN with J > I.  */
1214	    char *sum = token.string;
1215	    size_t sum_len = strlen (sum);
1216	    const char *dottedname;
1217	    flag_context_list_ty *context_list;
1218
1219	    for (;;)
1220	      {
1221		token_ty token2;
1222
1223		x_java_lex (&token2);
1224		if (token2.type == token_type_dot)
1225		  {
1226		    token_ty token3;
1227
1228		    x_java_lex (&token3);
1229		    if (token3.type == token_type_symbol)
1230		      {
1231			char *addend = token3.string;
1232			size_t addend_len = strlen (addend);
1233
1234			sum =
1235			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1236			sum[sum_len] = '.';
1237			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1238			sum_len += 1 + addend_len;
1239
1240			free_token (&token3);
1241			free_token (&token2);
1242			continue;
1243		      }
1244		    x_java_unlex (&token3);
1245		  }
1246		x_java_unlex (&token2);
1247		break;
1248	      }
1249
1250	    for (dottedname = sum;;)
1251	      {
1252		void *keyword_value;
1253
1254		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1255				     &keyword_value)
1256		    == 0)
1257		  {
1258		    next_shapes = (const struct callshapes *) keyword_value;
1259		    state = 1;
1260		    break;
1261		  }
1262
1263		dottedname = strchr (dottedname, '.');
1264		if (dottedname == NULL)
1265		  {
1266		    state = 0;
1267		    break;
1268		  }
1269		dottedname++;
1270	      }
1271
1272	    for (dottedname = sum;;)
1273	      {
1274		context_list =
1275		  flag_context_list_table_lookup (
1276		    flag_context_list_table,
1277		    dottedname, strlen (dottedname));
1278		if (context_list != NULL)
1279		  break;
1280
1281		dottedname = strchr (dottedname, '.');
1282		if (dottedname == NULL)
1283		  break;
1284		dottedname++;
1285	      }
1286	    next_context_iter = flag_context_list_iterator (context_list);
1287
1288	    free (sum);
1289	    continue;
1290	  }
1291
1292	case token_type_lparen:
1293	  if (extract_parenthesized (mlp, token_type_rparen,
1294				     inner_context, next_context_iter,
1295				     arglist_parser_alloc (mlp,
1296							   state ? next_shapes : NULL)))
1297	    {
1298	      xgettext_current_source_encoding = po_charset_utf8;
1299	      arglist_parser_done (argparser, arg);
1300	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1301	      return true;
1302	    }
1303	  next_context_iter = null_context_list_iterator;
1304	  state = 0;
1305	  continue;
1306
1307	case token_type_rparen:
1308	  if (terminator == token_type_rparen)
1309	    {
1310	      xgettext_current_source_encoding = po_charset_utf8;
1311	      arglist_parser_done (argparser, arg);
1312	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1313	      return false;
1314	    }
1315	  if (terminator == token_type_rbrace)
1316	    {
1317	      error_with_progname = false;
1318	      error (0, 0,
1319		     _("%s:%d: warning: ')' found where '}' was expected"),
1320		     logical_file_name, token.line_number);
1321	      error_with_progname = true;
1322	    }
1323	  next_context_iter = null_context_list_iterator;
1324	  state = 0;
1325	  continue;
1326
1327	case token_type_lbrace:
1328	  if (extract_parenthesized (mlp, token_type_rbrace,
1329				     null_context, null_context_list_iterator,
1330				     arglist_parser_alloc (mlp, NULL)))
1331	    {
1332	      xgettext_current_source_encoding = po_charset_utf8;
1333	      arglist_parser_done (argparser, arg);
1334	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1335	      return true;
1336	    }
1337	  next_context_iter = null_context_list_iterator;
1338	  state = 0;
1339	  continue;
1340
1341	case token_type_rbrace:
1342	  if (terminator == token_type_rbrace)
1343	    {
1344	      xgettext_current_source_encoding = po_charset_utf8;
1345	      arglist_parser_done (argparser, arg);
1346	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1347	      return false;
1348	    }
1349	  if (terminator == token_type_rparen)
1350	    {
1351	      error_with_progname = false;
1352	      error (0, 0,
1353		     _("%s:%d: warning: '}' found where ')' was expected"),
1354		     logical_file_name, token.line_number);
1355	      error_with_progname = true;
1356	    }
1357	  next_context_iter = null_context_list_iterator;
1358	  state = 0;
1359	  continue;
1360
1361	case token_type_comma:
1362	  arg++;
1363	  inner_context =
1364	    inherited_context (outer_context,
1365			       flag_context_list_iterator_advance (
1366				 &context_iter));
1367	  next_context_iter = passthrough_context_list_iterator;
1368	  state = 0;
1369	  continue;
1370
1371	case token_type_string_literal:
1372	  {
1373	    lex_pos_ty pos;
1374	    pos.file_name = logical_file_name;
1375	    pos.line_number = token.line_number;
1376
1377	    xgettext_current_source_encoding = po_charset_utf8;
1378	    if (extract_all)
1379	      remember_a_message (mlp, NULL, token.string, inner_context,
1380				  &pos, token.comment);
1381	    else
1382	      arglist_parser_remember (argparser, arg, token.string,
1383				       inner_context,
1384				       pos.file_name, pos.line_number,
1385				       token.comment);
1386	    xgettext_current_source_encoding = xgettext_global_source_encoding;
1387	  }
1388	  drop_reference (token.comment);
1389	  next_context_iter = null_context_list_iterator;
1390	  state = 0;
1391	  continue;
1392
1393	case token_type_eof:
1394	  xgettext_current_source_encoding = po_charset_utf8;
1395	  arglist_parser_done (argparser, arg);
1396	  xgettext_current_source_encoding = xgettext_global_source_encoding;
1397	  return true;
1398
1399	case token_type_dot:
1400	case token_type_number:
1401	case token_type_plus:
1402	case token_type_other:
1403	  next_context_iter = null_context_list_iterator;
1404	  state = 0;
1405	  continue;
1406
1407	default:
1408	  abort ();
1409	}
1410    }
1411}
1412
1413
1414void
1415extract_java (FILE *f,
1416	      const char *real_filename, const char *logical_filename,
1417	      flag_context_list_table_ty *flag_table,
1418	      msgdomain_list_ty *mdlp)
1419{
1420  message_list_ty *mlp = mdlp->item[0]->messages;
1421
1422  fp = f;
1423  real_file_name = real_filename;
1424  logical_file_name = xstrdup (logical_filename);
1425  line_number = 1;
1426
1427  last_comment_line = -1;
1428  last_non_comment_line = -1;
1429
1430  phase6_last = token_type_eof;
1431
1432  flag_context_list_table = flag_table;
1433
1434  init_keywords ();
1435
1436  /* Eat tokens until eof is seen.  When extract_parenthesized returns
1437     due to an unbalanced closing parenthesis, just restart it.  */
1438  while (!extract_parenthesized (mlp, token_type_eof,
1439				 null_context, null_context_list_iterator,
1440				 arglist_parser_alloc (mlp, NULL)))
1441    ;
1442
1443  fp = NULL;
1444  real_file_name = NULL;
1445  logical_file_name = NULL;
1446  line_number = 0;
1447}
1448