• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/src/
1/* xgettext Perl backend.
2   Copyright (C) 2002-2007 Free Software Foundation, Inc.
3
4   This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23/* Specification.  */
24#include "x-perl.h"
25
26#include <errno.h>
27#include <stdbool.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "message.h"
33#include "xgettext.h"
34#include "x-perl.h"
35#include "error.h"
36#include "error-progname.h"
37#include "xalloc.h"
38#include "po-charset.h"
39#include "unistr.h"
40#include "uniname.h"
41#include "gettext.h"
42
43#define _(s) gettext(s)
44
45/* The Perl syntax is defined in perlsyn.pod.  Try the command
46   "man perlsyn" or "perldoc perlsyn".
47   Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
48   Try the command "man perlsub" or "perldoc perlsub".  */
49
50#define DEBUG_PERL 0
51
52
53/* ====================== Keyword set customization.  ====================== */
54
55/* If true extract all strings.  */
56static bool extract_all = false;
57
58static hash_table keywords;
59static bool default_keywords = true;
60
61
62void
63x_perl_extract_all ()
64{
65  extract_all = true;
66}
67
68
69void
70x_perl_keyword (const char *name)
71{
72  if (name == NULL)
73    default_keywords = false;
74  else
75    {
76      const char *end;
77      struct callshape shape;
78      const char *colon;
79
80      if (keywords.table == NULL)
81	hash_init (&keywords, 100);
82
83      split_keywordspec (name, &end, &shape);
84
85      /* The characters between name and end should form a valid C identifier.
86	 A colon means an invalid parse in split_keywordspec().  */
87      colon = strchr (name, ':');
88      if (colon == NULL || colon >= end)
89	insert_keyword_callshape (&keywords, name, end - name, &shape);
90    }
91}
92
93/* Finish initializing the keywords hash table.
94   Called after argument processing, before each file is processed.  */
95static void
96init_keywords ()
97{
98  if (default_keywords)
99    {
100      /* When adding new keywords here, also update the documentation in
101	 xgettext.texi!  */
102      x_perl_keyword ("gettext");
103      x_perl_keyword ("%gettext");
104      x_perl_keyword ("$gettext");
105      x_perl_keyword ("dgettext:2");
106      x_perl_keyword ("dcgettext:2");
107      x_perl_keyword ("ngettext:1,2");
108      x_perl_keyword ("dngettext:2,3");
109      x_perl_keyword ("dcngettext:2,3");
110      x_perl_keyword ("gettext_noop");
111#if 0
112      x_perl_keyword ("__");
113      x_perl_keyword ("$__");
114      x_perl_keyword ("%__");
115      x_perl_keyword ("__x");
116      x_perl_keyword ("__n:1,2");
117      x_perl_keyword ("__nx:1,2");
118      x_perl_keyword ("__xn:1,2");
119      x_perl_keyword ("N__");
120#endif
121      default_keywords = false;
122    }
123}
124
125void
126init_flag_table_perl ()
127{
128  xgettext_record_flag ("gettext:1:pass-perl-format");
129  xgettext_record_flag ("gettext:1:pass-perl-brace-format");
130  xgettext_record_flag ("%gettext:1:pass-perl-format");
131  xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
132  xgettext_record_flag ("$gettext:1:pass-perl-format");
133  xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
134  xgettext_record_flag ("dgettext:2:pass-perl-format");
135  xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
136  xgettext_record_flag ("dcgettext:2:pass-perl-format");
137  xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
138  xgettext_record_flag ("ngettext:1:pass-perl-format");
139  xgettext_record_flag ("ngettext:2:pass-perl-format");
140  xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
141  xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
142  xgettext_record_flag ("dngettext:2:pass-perl-format");
143  xgettext_record_flag ("dngettext:3:pass-perl-format");
144  xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
145  xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
146  xgettext_record_flag ("dcngettext:2:pass-perl-format");
147  xgettext_record_flag ("dcngettext:3:pass-perl-format");
148  xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
149  xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
150  xgettext_record_flag ("gettext_noop:1:pass-perl-format");
151  xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
152  xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
153  xgettext_record_flag ("sprintf:1:perl-format");
154#if 0
155  xgettext_record_flag ("__:1:pass-perl-format");
156  xgettext_record_flag ("__:1:pass-perl-brace-format");
157  xgettext_record_flag ("%__:1:pass-perl-format");
158  xgettext_record_flag ("%__:1:pass-perl-brace-format");
159  xgettext_record_flag ("$__:1:pass-perl-format");
160  xgettext_record_flag ("$__:1:pass-perl-brace-format");
161  xgettext_record_flag ("__x:1:perl-brace-format");
162  xgettext_record_flag ("__n:1:pass-perl-format");
163  xgettext_record_flag ("__n:2:pass-perl-format");
164  xgettext_record_flag ("__n:1:pass-perl-brace-format");
165  xgettext_record_flag ("__n:2:pass-perl-brace-format");
166  xgettext_record_flag ("__nx:1:perl-brace-format");
167  xgettext_record_flag ("__nx:2:perl-brace-format");
168  xgettext_record_flag ("__xn:1:perl-brace-format");
169  xgettext_record_flag ("__xn:2:perl-brace-format");
170  xgettext_record_flag ("N__:1:pass-perl-format");
171  xgettext_record_flag ("N__:1:pass-perl-brace-format");
172#endif
173}
174
175
176/* ======================== Reading of characters.  ======================== */
177
178/* Real filename, used in error messages about the input file.  */
179static const char *real_file_name;
180
181/* Logical filename and line number, used to label the extracted messages.  */
182static char *logical_file_name;
183static int line_number;
184
185/* The input file stream.  */
186static FILE *fp;
187
188/* The current line buffer.  */
189static char *linebuf;
190
191/* The size of the current line.  */
192static int linesize;
193
194/* The position in the current line.  */
195static int linepos;
196
197/* The size of the input buffer.  */
198static size_t linebuf_size;
199
200/* Number of lines eaten for here documents.  */
201static int here_eaten;
202
203/* Paranoia: EOF marker for __END__ or __DATA__.  */
204static bool end_of_file;
205
206
207/* 1. line_number handling.  */
208
209/* Returns the next character from the input stream or EOF.  */
210static int
211phase1_getc ()
212{
213  line_number += here_eaten;
214  here_eaten = 0;
215
216  if (end_of_file)
217    return EOF;
218
219  if (linepos >= linesize)
220    {
221      linesize = getline (&linebuf, &linebuf_size, fp);
222
223      if (linesize < 0)
224	{
225	  if (ferror (fp))
226	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
227		   real_file_name);
228	  end_of_file = true;
229	  return EOF;
230	}
231
232      linepos = 0;
233      ++line_number;
234
235      /* Undosify.  This is important for catching the end of <<EOF and
236	 <<'EOF'.  We could rely on stdio doing this for us but you
237	 it is not uncommon to to come across Perl scripts with CRLF
238	 newline conventions on systems that do not follow this
239	 convention.  */
240      if (linesize >= 2 && linebuf[linesize - 1] == '\n'
241	  && linebuf[linesize - 2] == '\r')
242	{
243	  linebuf[linesize - 2] = '\n';
244	  linebuf[linesize - 1] = '\0';
245	  --linesize;
246	}
247    }
248
249  return linebuf[linepos++];
250}
251
252/* Supports only one pushback character.  */
253static void
254phase1_ungetc (int c)
255{
256  if (c != EOF)
257    {
258      if (linepos == 0)
259	/* Attempt to ungetc across line boundary.  Shouldn't happen.
260	   No two phase1_ungetc calls are permitted in a row.  */
261	abort ();
262
263      --linepos;
264    }
265}
266
267/* Read a here document and return its contents.
268   The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
269   encoded as well.  */
270
271static char *
272get_here_document (const char *delimiter)
273{
274  /* Accumulator for the entire here document, including a NUL byte
275     at the end.  */
276  static char *buffer;
277  static size_t bufmax = 0;
278  size_t bufpos = 0;
279  /* Current line being appended.  */
280  static char *my_linebuf = NULL;
281  static size_t my_linebuf_size = 0;
282
283  /* Allocate the initial buffer.  Later on, bufmax > 0.  */
284  if (bufmax == 0)
285    {
286      buffer = XNMALLOC (1, char);
287      buffer[0] = '\0';
288      bufmax = 1;
289    }
290
291  for (;;)
292    {
293      int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
294      char *my_line_utf8;
295      bool chomp;
296
297      if (read_bytes < 0)
298	{
299	  if (ferror (fp))
300	    {
301	      error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
302		     real_file_name);
303	    }
304	  else
305	    {
306	      error_with_progname = false;
307	      error (EXIT_SUCCESS, 0, _("\
308%s:%d: can't find string terminator \"%s\" anywhere before EOF"),
309		     real_file_name, line_number, delimiter);
310	      error_with_progname = true;
311
312	      break;
313	    }
314	}
315
316      ++here_eaten;
317
318      /* Convert to UTF-8.  */
319      my_line_utf8 =
320	from_current_source_encoding (my_linebuf, logical_file_name,
321				      line_number + here_eaten);
322      if (my_line_utf8 != my_linebuf)
323	{
324	  if (strlen (my_line_utf8) >= my_linebuf_size)
325	    {
326	      my_linebuf_size = strlen (my_line_utf8) + 1;
327	      my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
328	    }
329	  strcpy (my_linebuf, my_line_utf8);
330	  free (my_line_utf8);
331	}
332
333      /* Undosify.  This is important for catching the end of <<EOF and
334	 <<'EOF'.  We could rely on stdio doing this for us but you
335	 it is not uncommon to to come across Perl scripts with CRLF
336	 newline conventions on systems that do not follow this
337	 convention.  */
338      if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
339	  && my_linebuf[read_bytes - 2] == '\r')
340	{
341	  my_linebuf[read_bytes - 2] = '\n';
342	  my_linebuf[read_bytes - 1] = '\0';
343	  --read_bytes;
344	}
345
346      /* Temporarily remove the trailing newline from my_linebuf.  */
347      chomp = false;
348      if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
349	{
350	  chomp = true;
351	  my_linebuf[read_bytes - 1] = '\0';
352	}
353
354      /* See whether this line terminates the here document.  */
355      if (strcmp (my_linebuf, delimiter) == 0)
356	break;
357
358      /* Add back the trailing newline to my_linebuf.  */
359      if (chomp)
360	my_linebuf[read_bytes - 1] = '\n';
361
362      /* Ensure room for read_bytes + 1 bytes.  */
363      if (bufpos + read_bytes >= bufmax)
364	{
365	  do
366	    bufmax = 2 * bufmax + 10;
367	  while (bufpos + read_bytes >= bufmax);
368	  buffer = xrealloc (buffer, bufmax);
369	}
370      /* Append this line to the accumulator.  */
371      strcpy (buffer + bufpos, my_linebuf);
372      bufpos += read_bytes;
373    }
374
375  /* Done accumulating the here document.  */
376  return xstrdup (buffer);
377}
378
379/* Skips pod sections.  */
380static void
381skip_pod ()
382{
383  line_number += here_eaten;
384  here_eaten = 0;
385  linepos = 0;
386
387  for (;;)
388    {
389      linesize = getline (&linebuf, &linebuf_size, fp);
390
391      if (linesize < 0)
392	{
393	  if (ferror (fp))
394	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
395		   real_file_name);
396	  return;
397	}
398
399      ++line_number;
400
401      if (strncmp ("=cut", linebuf, 4) == 0)
402	{
403	  /* Force reading of a new line on next call to phase1_getc().  */
404	  linepos = linesize;
405	  return;
406	}
407    }
408}
409
410
411/* These are for tracking whether comments count as immediately before
412   keyword.  */
413static int last_comment_line;
414static int last_non_comment_line;
415
416
417/* 2. Replace each comment that is not inside a string literal or regular
418   expression with a newline character.  We need to remember the comment
419   for later, because it may be attached to a keyword string.  */
420
421static int
422phase2_getc ()
423{
424  static char *buffer;
425  static size_t bufmax;
426  size_t buflen;
427  int lineno;
428  int c;
429  char *utf8_string;
430
431  c = phase1_getc ();
432  if (c == '#')
433    {
434      buflen = 0;
435      lineno = line_number;
436      /* Skip leading whitespace.  */
437      for (;;)
438	{
439	  c = phase1_getc ();
440	  if (c == EOF)
441	    break;
442	  if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
443	    {
444	      phase1_ungetc (c);
445	      break;
446	    }
447	}
448      /* Accumulate the comment.  */
449      for (;;)
450	{
451	  c = phase1_getc ();
452	  if (c == '\n' || c == EOF)
453	    break;
454	  if (buflen >= bufmax)
455	    {
456	      bufmax = 2 * bufmax + 10;
457	      buffer = xrealloc (buffer, bufmax);
458	    }
459	  buffer[buflen++] = c;
460	}
461      if (buflen >= bufmax)
462	{
463	  bufmax = 2 * bufmax + 10;
464	  buffer = xrealloc (buffer, bufmax);
465	}
466      buffer[buflen] = '\0';
467      /* Convert it to UTF-8.  */
468      utf8_string =
469	from_current_source_encoding (buffer, logical_file_name, lineno);
470      /* Save it until we encounter the corresponding string.  */
471      savable_comment_add (utf8_string);
472      last_comment_line = lineno;
473    }
474  return c;
475}
476
477/* Supports only one pushback character.  */
478static void
479phase2_ungetc (int c)
480{
481  if (c != EOF)
482    phase1_ungetc (c);
483}
484
485/* Whitespace recognition.  */
486
487#define case_whitespace \
488  case ' ': case '\t': case '\r': case '\n': case '\f'
489
490static inline bool
491is_whitespace (int c)
492{
493  return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
494}
495
496
497/* ========================== Reading of tokens.  ========================== */
498
499
500enum token_type_ty
501{
502  token_type_eof,
503  token_type_lparen,		/* ( */
504  token_type_rparen,		/* ) */
505  token_type_comma,		/* , */
506  token_type_fat_comma,		/* => */
507  token_type_dereference,	/* , */
508  token_type_semicolon,         /* ; */
509  token_type_lbrace,            /* { */
510  token_type_rbrace,            /* } */
511  token_type_lbracket,          /* [ */
512  token_type_rbracket,          /* ] */
513  token_type_string,		/* quote-like */
514  token_type_named_op,          /* if, unless, while, ... */
515  token_type_variable,          /* $... */
516  token_type_symbol,		/* symbol, number */
517  token_type_regex_op,		/* s, tr, y, m.  */
518  token_type_dot,               /* . */
519  token_type_other,		/* regexp, misc. operator */
520  /* The following are not really token types, but variants used by
521     the parser.  */
522  token_type_keyword_symbol	/* keyword symbol */
523};
524typedef enum token_type_ty token_type_ty;
525
526/* Subtypes for strings, important for interpolation.  */
527enum string_type_ty
528{
529  string_type_verbatim,     /* "<<'EOF'", "m'...'", "s'...''...'",
530			       "tr/.../.../", "y/.../.../".  */
531  string_type_q,            /* "'..'", "q/.../".  */
532  string_type_qq,           /* '"..."', "`...`", "qq/.../", "qx/.../",
533			       "<file*glob>".  */
534  string_type_qr            /* Not supported.  */
535};
536
537/* Subtypes for symbols, important for dollar interpretation.  */
538enum symbol_type_ty
539{
540  symbol_type_none,         /* Nothing special.  */
541  symbol_type_sub,          /* 'sub'.  */
542  symbol_type_function      /* Function name after 'sub'.  */
543};
544
545typedef struct token_ty token_ty;
546struct token_ty
547{
548  token_type_ty type;
549  int sub_type;			/* for token_type_string, token_type_symbol */
550  char *string;			/* for:			in encoding:
551				   token_type_named_op	ASCII
552				   token_type_string	UTF-8
553				   token_type_symbol	ASCII
554				   token_type_variable	global_source_encoding
555				 */
556  refcounted_string_list_ty *comment; /* for token_type_string */
557  int line_number;
558};
559
560#if DEBUG_PERL
561static const char *
562token2string (const token_ty *token)
563{
564  switch (token->type)
565    {
566    case token_type_eof:
567      return "token_type_eof";
568    case token_type_lparen:
569      return "token_type_lparen";
570    case token_type_rparen:
571      return "token_type_rparen";
572    case token_type_comma:
573      return "token_type_comma";
574    case token_type_fat_comma:
575      return "token_type_fat_comma";
576    case token_type_dereference:
577      return "token_type_dereference";
578    case token_type_semicolon:
579      return "token_type_semicolon";
580    case token_type_lbrace:
581      return "token_type_lbrace";
582    case token_type_rbrace:
583      return "token_type_rbrace";
584    case token_type_lbracket:
585      return "token_type_lbracket";
586    case token_type_rbracket:
587      return "token_type_rbracket";
588    case token_type_string:
589      return "token_type_string";
590    case token_type_named_op:
591      return "token_type_named_op";
592    case token_type_variable:
593      return "token_type_variable";
594    case token_type_symbol:
595      return "token_type_symbol";
596    case token_type_regex_op:
597      return "token_type_regex_op";
598    case token_type_dot:
599      return "token_type_dot";
600    case token_type_other:
601      return "token_type_other";
602    default:
603      return "unknown";
604    }
605}
606#endif
607
608/* Free the memory pointed to by a 'struct token_ty'.  */
609static inline void
610free_token (token_ty *tp)
611{
612  switch (tp->type)
613    {
614    case token_type_named_op:
615    case token_type_string:
616    case token_type_symbol:
617    case token_type_variable:
618      free (tp->string);
619      break;
620    default:
621      break;
622    }
623  if (tp->type == token_type_string)
624    drop_reference (tp->comment);
625  free (tp);
626}
627
628/* Pass 1 of extracting quotes: Find the end of the string, regardless
629   of the semantics of the construct.  Return the complete string,
630   including the starting and the trailing delimiter, with backslashes
631   removed where appropriate.  */
632static char *
633extract_quotelike_pass1 (int delim)
634{
635  /* This function is called recursively.  No way to allocate stuff
636     statically.  Also alloca() is inappropriate due to limited stack
637     size on some platforms.  So we use malloc().  */
638  int bufmax = 10;
639  char *buffer = XNMALLOC (bufmax, char);
640  int bufpos = 0;
641  bool nested = true;
642  int counter_delim;
643
644  buffer[bufpos++] = delim;
645
646  /* Find the closing delimiter.  */
647  switch (delim)
648    {
649    case '(':
650      counter_delim = ')';
651      break;
652    case '{':
653      counter_delim = '}';
654      break;
655    case '[':
656      counter_delim = ']';
657      break;
658    case '<':
659      counter_delim = '>';
660      break;
661    default: /* "..." or '...' or |...| etc. */
662      nested = false;
663      counter_delim = delim;
664      break;
665    }
666
667  for (;;)
668    {
669      int c = phase1_getc ();
670
671      /* This round can produce 1 or 2 bytes.  Ensure room for 2 bytes.  */
672      if (bufpos + 2 > bufmax)
673	{
674	  bufmax = 2 * bufmax + 10;
675	  buffer = xrealloc (buffer, bufmax);
676	}
677
678      if (c == counter_delim || c == EOF)
679	{
680	  buffer[bufpos++] = counter_delim; /* will be stripped off later */
681	  buffer[bufpos++] = '\0';
682#if DEBUG_PERL
683	  fprintf (stderr, "PASS1: %s\n", buffer);
684#endif
685	  return buffer;
686	}
687
688      if (nested && c == delim)
689	{
690	  char *inner = extract_quotelike_pass1 (delim);
691	  size_t len = strlen (inner);
692
693	  /* Ensure room for len + 1 bytes.  */
694	  if (bufpos + len >= bufmax)
695	    {
696	      do
697		bufmax = 2 * bufmax + 10;
698	      while (bufpos + len >= bufmax);
699	      buffer = xrealloc (buffer, bufmax);
700	    }
701	  strcpy (buffer + bufpos, inner);
702	  free (inner);
703	  bufpos += len;
704	}
705      else if (c == '\\')
706	{
707	  c = phase1_getc ();
708	  if (c == '\\')
709	    {
710	      buffer[bufpos++] = '\\';
711	      buffer[bufpos++] = '\\';
712	    }
713	  else if (c == delim || c == counter_delim)
714	    {
715	      /* This is pass2 in Perl.  */
716	      buffer[bufpos++] = c;
717	    }
718	  else
719	    {
720	      buffer[bufpos++] = '\\';
721	      phase1_ungetc (c);
722	    }
723	}
724      else
725	{
726	  buffer[bufpos++] = c;
727	}
728    }
729}
730
731/* Like extract_quotelike_pass1, but return the complete string in UTF-8
732   encoding.  */
733static char *
734extract_quotelike_pass1_utf8 (int delim)
735{
736  char *string = extract_quotelike_pass1 (delim);
737  char *utf8_string =
738    from_current_source_encoding (string, logical_file_name, line_number);
739  if (utf8_string != string)
740    free (string);
741  return utf8_string;
742}
743
744
745/* ========= Reading of tokens and commands.  Extracting strings.  ========= */
746
747
748/* There is an ambiguity about '/': It can start a division operator ('/' or
749   '/=') or it can start a regular expression.  The distinction is important
750   because inside regular expressions, '#' loses its special meaning.
751   The distinction is possible depending on the parsing state: After a
752   variable or simple expression, it's a division operator; at the beginning
753   of an expression, it's a regexp.  */
754static bool prefer_division_over_regexp;
755
756/* Context lookup table.  */
757static flag_context_list_table_ty *flag_context_list_table;
758
759
760/* Forward declaration of local functions.  */
761static void interpolate_keywords (message_list_ty *mlp, const char *string,
762				  int lineno);
763static token_ty *x_perl_lex (message_list_ty *mlp);
764static void x_perl_unlex (token_ty *tp);
765static bool extract_balanced (message_list_ty *mlp,
766			      token_type_ty delim, bool eat_delim,
767			      bool comma_delim,
768			      flag_context_ty outer_context,
769			      flag_context_list_iterator_ty context_iter,
770			      int arg, struct arglist_parser *argparser);
771
772
773/* Extract an unsigned hexadecimal number from STRING, considering at
774   most LEN bytes and place the result in *RESULT.  Returns a pointer
775   to the first character past the hexadecimal number.  */
776static const char *
777extract_hex (const char *string, size_t len, unsigned int *result)
778{
779  size_t i;
780
781  *result = 0;
782
783  for (i = 0; i < len; i++)
784    {
785      char c = string[i];
786      int number;
787
788      if (c >= 'A' && c <= 'F')
789	number = c - 'A' + 10;
790      else if (c >= 'a' && c <= 'f')
791	number = c - 'a' + 10;
792      else if (c >= '0' && c <= '9')
793	number = c - '0';
794      else
795	break;
796
797      *result <<= 4;
798      *result |= number;
799    }
800
801  return string + i;
802}
803
804/* Extract an unsigned octal number from STRING, considering at
805   most LEN bytes and place the result in *RESULT.  Returns a pointer
806   to the first character past the octal number.  */
807static const char *
808extract_oct (const char *string, size_t len, unsigned int *result)
809{
810  size_t i;
811
812  *result = 0;
813
814  for (i = 0; i < len; i++)
815    {
816      char c = string[i];
817      int number;
818
819      if (c >= '0' && c <= '7')
820	number = c - '0';
821      else
822	break;
823
824      *result <<= 3;
825      *result |= number;
826    }
827
828  return string + i;
829}
830
831/* Extract the various quotelike constructs except for <<EOF.  See the
832   section "Gory details of parsing quoted constructs" in perlop.pod.
833   Return the resulting token in *tp; tp->type == token_type_string.  */
834static void
835extract_quotelike (token_ty *tp, int delim)
836{
837  char *string = extract_quotelike_pass1_utf8 (delim);
838  size_t len = strlen (string);
839
840  tp->type = token_type_string;
841  /* Take the string without the delimiters at the start and at the end.  */
842  if (!(len >= 2))
843    abort ();
844  string[len - 1] = '\0';
845  tp->string = xstrdup (string + 1);
846  free (string);
847  tp->comment = add_reference (savable_comment);
848}
849
850/* Extract the quotelike constructs with double delimiters, like
851   s/[SEARCH]/[REPLACE]/.  This function does not eat up trailing
852   modifiers (left to the caller).
853   Return the resulting token in *tp; tp->type == token_type_regex_op.  */
854static void
855extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
856			  bool interpolate)
857{
858  char *string;
859
860  tp->type = token_type_regex_op;
861
862  string = extract_quotelike_pass1_utf8 (delim);
863  if (interpolate)
864    interpolate_keywords (mlp, string, line_number);
865  free (string);
866
867  if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
868    {
869      /* The delimiter for the second string can be different, e.g.
870	 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/.  See "man perlrequick".  */
871      delim = phase1_getc ();
872      while (is_whitespace (delim))
873	{
874	  /* The hash-sign is not a valid delimiter after whitespace, ergo
875	     use phase2_getc() and not phase1_getc() now.  */
876	  delim = phase2_getc ();
877	}
878    }
879  string = extract_quotelike_pass1_utf8 (delim);
880  if (interpolate)
881    interpolate_keywords (mlp, string, line_number);
882  free (string);
883}
884
885/* Perform pass 3 of quotelike extraction (interpolation).
886   *tp is a token of type token_type_string.
887   This function replaces tp->string.
888   This function does not access tp->comment.  */
889/* FIXME: Currently may writes null-bytes into the string.  */
890static void
891extract_quotelike_pass3 (token_ty *tp, int error_level)
892{
893  static char *buffer;
894  static int bufmax = 0;
895  int bufpos = 0;
896  const char *crs;
897  bool uppercase;
898  bool lowercase;
899  bool quotemeta;
900
901#if DEBUG_PERL
902  switch (tp->sub_type)
903    {
904    case string_type_verbatim:
905      fprintf (stderr, "Interpolating string_type_verbatim:\n");
906      break;
907    case string_type_q:
908      fprintf (stderr, "Interpolating string_type_q:\n");
909      break;
910    case string_type_qq:
911      fprintf (stderr, "Interpolating string_type_qq:\n");
912      break;
913    case string_type_qr:
914      fprintf (stderr, "Interpolating string_type_qr:\n");
915      break;
916    }
917  fprintf (stderr, "%s\n", tp->string);
918  if (tp->sub_type == string_type_verbatim)
919    fprintf (stderr, "---> %s\n", tp->string);
920#endif
921
922  if (tp->sub_type == string_type_verbatim)
923    return;
924
925  /* Loop over tp->string, accumulating the expansion in buffer.  */
926  crs = tp->string;
927  uppercase = false;
928  lowercase = false;
929  quotemeta = false;
930  while (*crs)
931    {
932      bool backslashed;
933
934      /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
935	 if \Q modifier is present.  */
936      if (bufpos + 7 > bufmax)
937	{
938	  bufmax = 2 * bufmax + 10;
939	  buffer = xrealloc (buffer, bufmax);
940	}
941
942      if (tp->sub_type == string_type_q)
943	{
944	  switch (*crs)
945	    {
946	    case '\\':
947	      if (crs[1] == '\\')
948		{
949		  crs += 2;
950		  buffer[bufpos++] = '\\';
951		  break;
952		}
953	      /* FALLTHROUGH */
954	    default:
955	      buffer[bufpos++] = *crs++;
956	      break;
957	    }
958	  continue;
959	}
960
961      /* We only get here for double-quoted strings or regular expressions.
962	 Unescape escape sequences.  */
963      if (*crs == '\\')
964	{
965	  switch (crs[1])
966	    {
967	    case 't':
968	      crs += 2;
969	      buffer[bufpos++] = '\t';
970	      continue;
971	    case 'n':
972	      crs += 2;
973	      buffer[bufpos++] = '\n';
974	      continue;
975	    case 'r':
976	      crs += 2;
977	      buffer[bufpos++] = '\r';
978	      continue;
979	    case 'f':
980	      crs += 2;
981	      buffer[bufpos++] = '\f';
982	      continue;
983	    case 'b':
984	      crs += 2;
985	      buffer[bufpos++] = '\b';
986	      continue;
987	    case 'a':
988	      crs += 2;
989	      buffer[bufpos++] = '\a';
990	      continue;
991	    case 'e':
992	      crs += 2;
993	      buffer[bufpos++] = 0x1b;
994	      continue;
995	    case '0': case '1': case '2': case '3':
996	    case '4': case '5': case '6': case '7':
997	      {
998		unsigned int oct_number;
999		int length;
1000
1001		crs = extract_oct (crs + 1, 3, &oct_number);
1002
1003		/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1004		   true, the character should be converted to its uppercase
1005		   resp. lowercase equivalent.  I don't know if the necessary
1006		   facilities are already included in gettext.  For US-Ascii
1007		   the conversion can be already be done, however.  */
1008		if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1009		  {
1010		    oct_number = oct_number - 'a' + 'A';
1011		  }
1012		else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1013		  {
1014		    oct_number = oct_number - 'A' + 'a';
1015		  }
1016
1017
1018		/* Yes, octal escape sequences in the range 0x100..0x1ff are
1019		   valid.  */
1020		length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1021				    oct_number, 2);
1022		if (length > 0)
1023		  bufpos += length;
1024	      }
1025	      continue;
1026	    case 'x':
1027	      {
1028		unsigned int hex_number = 0;
1029		int length;
1030
1031		crs += 2;
1032		if (*crs == '{')
1033		  {
1034		    const char *end = strchr (crs, '}');
1035		    if (end == NULL)
1036		      {
1037			error_with_progname = false;
1038			error (error_level, 0, _("\
1039%s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1040			error_with_progname = true;
1041			++crs;
1042			continue;
1043		      }
1044		    else
1045		      {
1046			++crs;
1047			(void) extract_hex (crs, end - crs, &hex_number);
1048			crs = end + 1;
1049		      }
1050		  }
1051		else
1052		  {
1053		    crs = extract_hex (crs, 2, &hex_number);
1054		  }
1055
1056		/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1057		   true, the character should be converted to its uppercase
1058		   resp. lowercase equivalent.  I don't know if the necessary
1059		   facilities are already included in gettext.  For US-Ascii
1060		   the conversion can be already be done, however.  */
1061		if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1062		  {
1063		    hex_number = hex_number - 'a' + 'A';
1064		  }
1065		else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1066		  {
1067		    hex_number = hex_number - 'A' + 'a';
1068		  }
1069
1070		length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1071				    hex_number, 6);
1072
1073		if (length > 0)
1074		  bufpos += length;
1075	      }
1076	      continue;
1077	    case 'c':
1078	      /* Perl's notion of control characters.  */
1079	      crs += 2;
1080	      if (*crs)
1081		{
1082		  int the_char = (unsigned char) *crs;
1083		  if (the_char >= 'a' || the_char <= 'z')
1084		    the_char = the_char - 'a' + 'A';
1085		  buffer[bufpos++] = the_char ^ 0x40;
1086		}
1087	      continue;
1088	    case 'N':
1089	      crs += 2;
1090	      if (*crs == '{')
1091		{
1092		  const char *end = strchr (crs + 1, '}');
1093		  if (end != NULL)
1094		    {
1095		      char *name;
1096		      unsigned int unicode;
1097
1098		      name = XNMALLOC (end - (crs + 1) + 1, char);
1099		      memcpy (name, crs + 1, end - (crs + 1));
1100		      name[end - (crs + 1)] = '\0';
1101
1102		      unicode = unicode_name_character (name);
1103		      if (unicode != UNINAME_INVALID)
1104			{
1105			  /* FIXME: Convert to upper/lowercase if the
1106			     corresponding flag is set to true.  */
1107			  int length =
1108			    u8_uctomb ((unsigned char *) (buffer + bufpos),
1109				       unicode, 6);
1110			  if (length > 0)
1111			    bufpos += length;
1112			}
1113
1114		      free (name);
1115
1116		      crs = end + 1;
1117		    }
1118		}
1119	      continue;
1120	    }
1121	}
1122
1123      /* No escape sequence, go on.  */
1124      if (*crs == '\\')
1125	{
1126	  ++crs;
1127	  switch (*crs)
1128	    {
1129	    case 'E':
1130	      uppercase = false;
1131	      lowercase = false;
1132	      quotemeta = false;
1133	      ++crs;
1134	      continue;
1135	    case 'L':
1136	      uppercase = false;
1137	      lowercase = true;
1138	      ++crs;
1139	      continue;
1140	    case 'U':
1141	      uppercase = true;
1142	      lowercase = false;
1143	      ++crs;
1144	      continue;
1145	    case 'Q':
1146	      quotemeta = true;
1147	      ++crs;
1148	      continue;
1149	    case 'l':
1150	      ++crs;
1151	      if (*crs >= 'A' && *crs <= 'Z')
1152		{
1153		  buffer[bufpos++] = *crs - 'A' + 'a';
1154		}
1155	      else if ((unsigned char) *crs >= 0x80)
1156		{
1157		  error_with_progname = false;
1158		  error (error_level, 0, _("\
1159%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1160			 real_file_name, line_number, *crs);
1161		  error_with_progname = true;
1162		}
1163	      else
1164	        {
1165		  buffer[bufpos++] = *crs;
1166		}
1167	      ++crs;
1168	      continue;
1169	    case 'u':
1170	      ++crs;
1171	      if (*crs >= 'a' && *crs <= 'z')
1172		{
1173		  buffer[bufpos++] = *crs - 'a' + 'A';
1174		}
1175	      else if ((unsigned char) *crs >= 0x80)
1176		{
1177		  error_with_progname = false;
1178		  error (error_level, 0, _("\
1179%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1180			 real_file_name, line_number, *crs);
1181		  error_with_progname = true;
1182		}
1183	      else
1184	        {
1185		  buffer[bufpos++] = *crs;
1186		}
1187	      ++crs;
1188	      continue;
1189	    case '\\':
1190	      buffer[bufpos++] = *crs;
1191	      ++crs;
1192	      continue;
1193	    default:
1194	      backslashed = true;
1195	      break;
1196	    }
1197	}
1198      else
1199	backslashed = false;
1200
1201      if (quotemeta
1202	  && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1203	       || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1204	{
1205	  buffer[bufpos++] = '\\';
1206	  backslashed = true;
1207	}
1208
1209      if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1210	{
1211	  error_with_progname = false;
1212	  error (error_level, 0, _("\
1213%s:%d: invalid variable interpolation at \"%c\""),
1214		 real_file_name, line_number, *crs);
1215	  error_with_progname = true;
1216	  ++crs;
1217	}
1218      else if (lowercase)
1219	{
1220	  if (*crs >= 'A' && *crs <= 'Z')
1221	    buffer[bufpos++] = *crs - 'A' + 'a';
1222	  else if ((unsigned char) *crs >= 0x80)
1223	    {
1224	      error_with_progname = false;
1225	      error (error_level, 0, _("\
1226%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1227		     real_file_name, line_number, *crs);
1228	      error_with_progname = true;
1229	      buffer[bufpos++] = *crs;
1230	    }
1231	  else
1232	    buffer[bufpos++] = *crs;
1233	  ++crs;
1234	}
1235      else if (uppercase)
1236	{
1237	  if (*crs >= 'a' && *crs <= 'z')
1238	    buffer[bufpos++] = *crs - 'a' + 'A';
1239	  else if ((unsigned char) *crs >= 0x80)
1240	    {
1241	      error_with_progname = false;
1242	      error (error_level, 0, _("\
1243%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1244		     real_file_name, line_number, *crs);
1245	      error_with_progname = true;
1246	      buffer[bufpos++] = *crs;
1247	    }
1248	  else
1249	    buffer[bufpos++] = *crs;
1250	  ++crs;
1251	}
1252      else
1253	{
1254	  buffer[bufpos++] = *crs++;
1255	}
1256    }
1257
1258  /* Ensure room for 1 more byte.  */
1259  if (bufpos >= bufmax)
1260    {
1261      bufmax = 2 * bufmax + 10;
1262      buffer = xrealloc (buffer, bufmax);
1263    }
1264
1265  buffer[bufpos++] = '\0';
1266
1267#if DEBUG_PERL
1268  fprintf (stderr, "---> %s\n", buffer);
1269#endif
1270
1271  /* Replace tp->string.  */
1272  free (tp->string);
1273  tp->string = xstrdup (buffer);
1274}
1275
1276/* Parse a variable.  This is done in several steps:
1277     1) Consume all leading occurencies of '$', '@', '%', and '*'.
1278     2) Determine the name of the variable from the following input.
1279     3) Parse possible following hash keys or array indexes.
1280 */
1281static void
1282extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1283{
1284  static char *buffer;
1285  static int bufmax = 0;
1286  int bufpos = 0;
1287  int c = first;
1288  size_t varbody_length = 0;
1289  bool maybe_hash_deref = false;
1290  bool maybe_hash_value = false;
1291
1292  tp->type = token_type_variable;
1293
1294#if DEBUG_PERL
1295  fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1296	   real_file_name, line_number, first);
1297#endif
1298
1299  /*
1300   * 1) Consume dollars and so on (not euros ...).  Unconditionally
1301   *    accepting the hash sign (#) will maybe lead to inaccurate
1302   *    results.  FIXME!
1303   */
1304  while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1305    {
1306      if (bufpos >= bufmax)
1307	{
1308	  bufmax = 2 * bufmax + 10;
1309	  buffer = xrealloc (buffer, bufmax);
1310	}
1311      buffer[bufpos++] = c;
1312      c = phase1_getc ();
1313    }
1314
1315  if (c == EOF)
1316    {
1317      tp->type = token_type_eof;
1318      return;
1319    }
1320
1321  /* Hash references are treated in a special way, when looking for
1322     our keywords.  */
1323  if (buffer[0] == '$')
1324    {
1325      if (bufpos == 1)
1326	maybe_hash_value = true;
1327      else if (bufpos == 2 && buffer[1] == '$')
1328	{
1329	  if (!(c == '{'
1330		|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1331		|| (c >= '0' && c <= '9')
1332		|| c == '_' || c == ':' || c == '\'' || c >= 0x80))
1333	    {
1334	      /* Special variable $$ for pid.  */
1335	      if (bufpos >= bufmax)
1336		{
1337		  bufmax = 2 * bufmax + 10;
1338		  buffer = xrealloc (buffer, bufmax);
1339		}
1340	      buffer[bufpos++] = '\0';
1341	      tp->string = xstrdup (buffer);
1342#if DEBUG_PERL
1343	      fprintf (stderr, "%s:%d: is PID ($$)\n",
1344		       real_file_name, line_number);
1345#endif
1346
1347	      phase1_ungetc (c);
1348	      return;
1349	    }
1350
1351	  maybe_hash_deref = true;
1352	  bufpos = 1;
1353	}
1354    }
1355
1356  /*
1357   * 2) Get the name of the variable.  The first character is practically
1358   *    arbitrary.  Punctuation and numbers automagically put a variable
1359   *    in the global namespace but that subtle difference is not interesting
1360   *    for us.
1361   */
1362  if (bufpos >= bufmax)
1363    {
1364      bufmax = 2 * bufmax + 10;
1365      buffer = xrealloc (buffer, bufmax);
1366    }
1367  if (c == '{')
1368    {
1369      /* Yuck, we cannot accept ${gettext} as a keyword...  Except for
1370       * debugging purposes it is also harmless, that we suppress the
1371       * real name of the variable.
1372       */
1373#if DEBUG_PERL
1374      fprintf (stderr, "%s:%d: braced {variable_name}\n",
1375	       real_file_name, line_number);
1376#endif
1377
1378      if (extract_balanced (mlp, token_type_rbrace, true, false,
1379			    null_context, null_context_list_iterator,
1380			    1, arglist_parser_alloc (mlp, NULL)))
1381	return;
1382      buffer[bufpos++] = c;
1383    }
1384  else
1385    {
1386      while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1387	     || (c >= '0' && c <= '9')
1388	     || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1389	{
1390	  ++varbody_length;
1391	  if (bufpos >= bufmax)
1392	    {
1393	      bufmax = 2 * bufmax + 10;
1394	      buffer = xrealloc (buffer, bufmax);
1395	    }
1396	  buffer[bufpos++] = c;
1397	  c = phase1_getc ();
1398	}
1399      phase1_ungetc (c);
1400    }
1401
1402  /* Probably some strange Perl variable like $`.  */
1403  if (varbody_length == 0)
1404    {
1405      c = phase1_getc ();
1406      if (c == EOF || is_whitespace (c))
1407	phase1_ungetc (c);  /* Loser.  */
1408      else
1409	{
1410	  if (bufpos >= bufmax)
1411	    {
1412	      bufmax = 2 * bufmax + 10;
1413	      buffer = xrealloc (buffer, bufmax);
1414	    }
1415	  buffer[bufpos++] = c;
1416	}
1417    }
1418
1419  if (bufpos >= bufmax)
1420    {
1421      bufmax = 2 * bufmax + 10;
1422      buffer = xrealloc (buffer, bufmax);
1423    }
1424  buffer[bufpos++] = '\0';
1425
1426  tp->string = xstrdup (buffer);
1427
1428#if DEBUG_PERL
1429  fprintf (stderr, "%s:%d: complete variable name: %s\n",
1430	   real_file_name, line_number, tp->string);
1431#endif
1432
1433  prefer_division_over_regexp = true;
1434
1435  /*
1436   * 3) If the following looks strange to you, this is valid Perl syntax:
1437   *
1438   *      $var = $$hashref    # We can place a
1439   *                          # comment here and then ...
1440   *             {key_into_hashref};
1441   *
1442   *    POD sections are not allowed but we leave complaints about
1443   *    that to the compiler/interpreter.
1444   */
1445  /* We only extract strings from the first hash key (if present).  */
1446
1447  if (maybe_hash_deref || maybe_hash_value)
1448    {
1449      bool is_dereference = false;
1450      int c;
1451
1452      do
1453	c = phase2_getc ();
1454      while (is_whitespace (c));
1455
1456      if (c == '-')
1457	{
1458	  int c2 = phase1_getc ();
1459
1460	  if (c2 == '>')
1461	    {
1462	      is_dereference = true;
1463
1464	      do
1465		c = phase2_getc ();
1466	      while (is_whitespace (c));
1467	    }
1468	  else if (c2 != '\n')
1469	    {
1470	      /* Discarding the newline is harmless here.  The only
1471		 special character recognized after a minus is greater-than
1472		 for dereference.  However, the sequence "-\n>" that we
1473	         treat incorrectly here, is a syntax error.  */
1474	      phase1_ungetc (c2);
1475	    }
1476	}
1477
1478      if (maybe_hash_value && is_dereference)
1479	{
1480#if DEBUG_PERL
1481	  fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1482		   real_file_name, line_number);
1483#endif
1484	}
1485      else if (maybe_hash_value)
1486	{
1487	  /* Fake it into a hash.  */
1488	  tp->string[0] = '%';
1489	}
1490
1491      /* Do NOT change that into else if (see above).  */
1492      if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1493	{
1494	  void *keyword_value;
1495
1496#if DEBUG_PERL
1497	  fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1498		   real_file_name, line_number);
1499#endif
1500
1501	  if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1502			       &keyword_value) == 0)
1503	    {
1504	      /* TODO: Shouldn't we use the shapes of the keyword, instead
1505		 of hardwiring argnum1 = 1 ?
1506	      const struct callshapes *shapes =
1507		(const struct callshapes *) keyword_value;
1508	      */
1509	      struct callshapes shapes;
1510	      shapes.keyword = tp->string; /* XXX storage duration? */
1511	      shapes.keyword_len = strlen (tp->string);
1512	      shapes.nshapes = 1;
1513	      shapes.shapes[0].argnum1 = 1;
1514	      shapes.shapes[0].argnum2 = 0;
1515	      shapes.shapes[0].argnumc = 0;
1516	      shapes.shapes[0].argnum1_glib_context = false;
1517	      shapes.shapes[0].argnum2_glib_context = false;
1518	      shapes.shapes[0].argtotal = 0;
1519	      string_list_init (&shapes.shapes[0].xcomments);
1520
1521	      {
1522		/* Extract a possible string from the key.  Before proceeding
1523		   we check whether the open curly is followed by a symbol and
1524		   then by a right curly.  */
1525		flag_context_list_iterator_ty context_iter =
1526		  flag_context_list_iterator (
1527		    flag_context_list_table_lookup (
1528		      flag_context_list_table,
1529		      tp->string, strlen (tp->string)));
1530		token_ty *t1 = x_perl_lex (mlp);
1531
1532#if DEBUG_PERL
1533		fprintf (stderr, "%s:%d: extracting string key\n",
1534			 real_file_name, line_number);
1535#endif
1536
1537		if (t1->type == token_type_symbol
1538		    || t1->type == token_type_named_op)
1539		  {
1540		    token_ty *t2 = x_perl_lex (mlp);
1541		    if (t2->type == token_type_rbrace)
1542		      {
1543			flag_context_ty context;
1544			lex_pos_ty pos;
1545
1546			context =
1547			  inherited_context (null_context,
1548					     flag_context_list_iterator_advance (
1549					       &context_iter));
1550
1551			pos.line_number = line_number;
1552			pos.file_name = logical_file_name;
1553
1554			xgettext_current_source_encoding = po_charset_utf8;
1555			remember_a_message (mlp, NULL, xstrdup (t1->string),
1556					    context, &pos, savable_comment);
1557			xgettext_current_source_encoding = xgettext_global_source_encoding;
1558			free_token (t2);
1559			free_token (t1);
1560		      }
1561		    else
1562		      {
1563			x_perl_unlex (t2);
1564		      }
1565		  }
1566		else
1567		  {
1568		    x_perl_unlex (t1);
1569		    if (extract_balanced (mlp, token_type_rbrace, true, false,
1570					  null_context, context_iter,
1571					  1, arglist_parser_alloc (mlp, &shapes)))
1572		      return;
1573		  }
1574	      }
1575	    }
1576	  else
1577	    {
1578	      phase2_ungetc (c);
1579	    }
1580	}
1581      else
1582	{
1583	  phase2_ungetc (c);
1584	}
1585    }
1586
1587  /* Now consume "->", "[...]", and "{...}".  */
1588  for (;;)
1589    {
1590      int c = phase2_getc ();
1591      int c2;
1592
1593      switch (c)
1594	{
1595	case '{':
1596#if DEBUG_PERL
1597	  fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1598		   real_file_name, line_number);
1599#endif
1600	  extract_balanced (mlp, token_type_rbrace, true, false,
1601			    null_context, null_context_list_iterator,
1602			    1, arglist_parser_alloc (mlp, NULL));
1603	  break;
1604
1605	case '[':
1606#if DEBUG_PERL
1607	  fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1608		   real_file_name, line_number);
1609#endif
1610	  extract_balanced (mlp, token_type_rbracket, true, false,
1611			    null_context, null_context_list_iterator,
1612			    1, arglist_parser_alloc (mlp, NULL));
1613	  break;
1614
1615	case '-':
1616	  c2 = phase1_getc ();
1617	  if (c2 == '>')
1618	    {
1619#if DEBUG_PERL
1620	      fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1621		       real_file_name, line_number);
1622#endif
1623	      break;
1624	    }
1625	  else if (c2 != '\n')
1626	    {
1627	      /* Discarding the newline is harmless here.  The only
1628		 special character recognized after a minus is greater-than
1629		 for dereference.  However, the sequence "-\n>" that we
1630	         treat incorrectly here, is a syntax error.  */
1631	      phase1_ungetc (c2);
1632	    }
1633	  /* FALLTHROUGH */
1634
1635	default:
1636#if DEBUG_PERL
1637	  fprintf (stderr, "%s:%d: variable finished\n",
1638		   real_file_name, line_number);
1639#endif
1640	  phase2_ungetc (c);
1641	  return;
1642	}
1643    }
1644}
1645
1646/* Actually a simplified version of extract_variable().  It searches for
1647   variables inside a double-quoted string that may interpolate to
1648   some keyword hash (reference).  The string is UTF-8 encoded.  */
1649static void
1650interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1651{
1652  static char *buffer;
1653  static int bufmax = 0;
1654  int bufpos = 0;
1655  flag_context_ty context;
1656  int c;
1657  bool maybe_hash_deref = false;
1658  enum parser_state
1659    {
1660      initial,
1661      one_dollar,
1662      two_dollars,
1663      identifier,
1664      minus,
1665      wait_lbrace,
1666      wait_quote,
1667      dquote,
1668      squote,
1669      barekey,
1670      wait_rbrace
1671    } state;
1672  token_ty token;
1673
1674  lex_pos_ty pos;
1675
1676  /* States are:
1677   *
1678   * initial:      initial
1679   * one_dollar:   dollar sign seen in state INITIAL
1680   * two_dollars:  another dollar-sign has been seen in state ONE_DOLLAR
1681   * identifier:   a valid identifier character has been seen in state
1682   *               ONE_DOLLAR or TWO_DOLLARS
1683   * minus:        a minus-sign has been seen in state IDENTIFIER
1684   * wait_lbrace:  a greater-than has been seen in state MINUS
1685   * wait_quote:   a left brace has been seen in state IDENTIFIER or in
1686   *               state WAIT_LBRACE
1687   * dquote:       a double-quote has been seen in state WAIT_QUOTE
1688   * squote:       a single-quote has been seen in state WAIT_QUOTE
1689   * barekey:      an bareword character has been seen in state WAIT_QUOTE
1690   * wait_rbrace:  closing quote has been seen in state DQUOTE or SQUOTE
1691   *
1692   * In the states initial...identifier the context is null_context; in the
1693   * states minus...wait_rbrace the context is the one suitable for the first
1694   * argument of the last seen identifier.
1695   */
1696  state = initial;
1697  context = null_context;
1698
1699  token.type = token_type_string;
1700  token.sub_type = string_type_qq;
1701  token.line_number = line_number;
1702  /* No need for  token.comment = add_reference (savable_comment);  here.
1703     We can let token.comment uninitialized here, and use savable_comment
1704     directly, because this function only parses the given string and does
1705     not call phase2_getc.  */
1706  pos.file_name = logical_file_name;
1707  pos.line_number = lineno;
1708
1709  while ((c = (unsigned char) *string++) != '\0')
1710    {
1711      void *keyword_value;
1712
1713      if (state == initial)
1714	bufpos = 0;
1715
1716      if (c == '\n')
1717	lineno++;
1718
1719      if (bufpos + 1 >= bufmax)
1720	{
1721	  bufmax = 2 * bufmax + 10;
1722	  buffer = xrealloc (buffer, bufmax);
1723	}
1724
1725      switch (state)
1726	{
1727	case initial:
1728	  switch (c)
1729	    {
1730	    case '\\':
1731	      c = (unsigned char) *string++;
1732	      if (c == '\0')
1733		return;
1734	      break;
1735	    case '$':
1736	      buffer[bufpos++] = '$';
1737	      maybe_hash_deref = false;
1738	      state = one_dollar;
1739	      break;
1740	    default:
1741	      break;
1742	    }
1743	  break;
1744	case one_dollar:
1745	  switch (c)
1746	    {
1747	    case '$':
1748	      /*
1749	       * This is enough to make us believe later that we dereference
1750	       * a hash reference.
1751	       */
1752	      maybe_hash_deref = true;
1753	      state = two_dollars;
1754	      break;
1755	    default:
1756	      if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1757		  || (c >= 'A' && c <= 'Z')
1758		  || (c >= 'a' && c <= 'z')
1759		  || (c >= '0' && c <= '9'))
1760		{
1761		  buffer[bufpos++] = c;
1762		  state = identifier;
1763		}
1764	      else
1765		state = initial;
1766	      break;
1767	    }
1768	  break;
1769	case two_dollars:
1770	  if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1771	      || (c >= 'A' && c <= 'Z')
1772	      || (c >= 'a' && c <= 'z')
1773	      || (c >= '0' && c <= '9'))
1774	    {
1775	      buffer[bufpos++] = c;
1776	      state = identifier;
1777	    }
1778	  else
1779	    state = initial;
1780	  break;
1781	case identifier:
1782	  switch (c)
1783	    {
1784	    case '-':
1785	      if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1786		  == 0)
1787		{
1788		  flag_context_list_iterator_ty context_iter =
1789		    flag_context_list_iterator (
1790		      flag_context_list_table_lookup (
1791			flag_context_list_table,
1792			buffer, bufpos));
1793		  context =
1794		    inherited_context (null_context,
1795				       flag_context_list_iterator_advance (
1796					 &context_iter));
1797		  state = minus;
1798		}
1799	      else
1800		state = initial;
1801	      break;
1802	    case '{':
1803	      if (!maybe_hash_deref)
1804		buffer[0] = '%';
1805	      if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1806		  == 0)
1807		{
1808		  flag_context_list_iterator_ty context_iter =
1809		    flag_context_list_iterator (
1810		      flag_context_list_table_lookup (
1811			flag_context_list_table,
1812			buffer, bufpos));
1813		  context =
1814		    inherited_context (null_context,
1815				       flag_context_list_iterator_advance (
1816					 &context_iter));
1817		  state = wait_quote;
1818		}
1819	      else
1820		state = initial;
1821	      break;
1822	    default:
1823	      if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1824		  || (c >= 'A' && c <= 'Z')
1825		  || (c >= 'a' && c <= 'z')
1826		  || (c >= '0' && c <= '9'))
1827		{
1828		  buffer[bufpos++] = c;
1829		}
1830	      else
1831		state = initial;
1832	      break;
1833	    }
1834	  break;
1835	case minus:
1836	  switch (c)
1837	    {
1838	    case '>':
1839	      state = wait_lbrace;
1840	      break;
1841	    default:
1842	      context = null_context;
1843	      state = initial;
1844	      break;
1845	    }
1846	  break;
1847	case wait_lbrace:
1848	  switch (c)
1849	    {
1850	    case '{':
1851	      state = wait_quote;
1852	      break;
1853	    default:
1854	      context = null_context;
1855	      state = initial;
1856	      break;
1857	    }
1858	  break;
1859	case wait_quote:
1860	  switch (c)
1861	    {
1862	    case_whitespace:
1863	      break;
1864	    case '\'':
1865	      pos.line_number = lineno;
1866	      bufpos = 0;
1867	      state = squote;
1868	      break;
1869	    case '"':
1870	      pos.line_number = lineno;
1871	      bufpos = 0;
1872	      state = dquote;
1873	      break;
1874	    default:
1875	      if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1876		  || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1877		{
1878		  pos.line_number = lineno;
1879		  bufpos = 0;
1880		  buffer[bufpos++] = c;
1881		  state = barekey;
1882		}
1883	      else
1884		{
1885		  context = null_context;
1886		  state = initial;
1887		}
1888	      break;
1889	    }
1890	  break;
1891	case dquote:
1892	  switch (c)
1893	    {
1894	    case '"':
1895	      /* The resulting string has to be interpolated twice.  */
1896	      buffer[bufpos] = '\0';
1897	      token.string = xstrdup (buffer);
1898	      extract_quotelike_pass3 (&token, EXIT_FAILURE);
1899	      /* The string can only shrink with interpolation (because
1900		 we ignore \Q).  */
1901	      if (!(strlen (token.string) <= bufpos))
1902		abort ();
1903	      strcpy (buffer, token.string);
1904	      free (token.string);
1905	      state = wait_rbrace;
1906	      break;
1907	    case '\\':
1908	      if (string[0] == '\"')
1909		{
1910		  buffer[bufpos++] = string++[0];
1911		}
1912	      else if (string[0])
1913		{
1914		  buffer[bufpos++] = '\\';
1915		  buffer[bufpos++] = string++[0];
1916		}
1917	      else
1918		{
1919		  context = null_context;
1920		  state = initial;
1921		}
1922	      break;
1923	    default:
1924	      buffer[bufpos++] = c;
1925	      break;
1926	    }
1927	  break;
1928	case squote:
1929	  switch (c)
1930	    {
1931	    case '\'':
1932	      state = wait_rbrace;
1933	      break;
1934	    case '\\':
1935	      if (string[0] == '\'')
1936		{
1937		  buffer[bufpos++] = string++[0];
1938		}
1939	      else if (string[0])
1940		{
1941		  buffer[bufpos++] = '\\';
1942		  buffer[bufpos++] = string++[0];
1943		}
1944	      else
1945		{
1946		  context = null_context;
1947		  state = initial;
1948		}
1949	      break;
1950	    default:
1951	      buffer[bufpos++] = c;
1952	      break;
1953	    }
1954	  break;
1955	case barekey:
1956	  if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1957	      || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1958	    {
1959	      buffer[bufpos++] = c;
1960	      break;
1961	    }
1962	  else if (is_whitespace (c))
1963	    {
1964	      state = wait_rbrace;
1965	      break;
1966	    }
1967	  else if (c != '}')
1968	    {
1969	      context = null_context;
1970	      state = initial;
1971	      break;
1972	    }
1973	  /* Must be right brace.  */
1974	  /* FALLTHROUGH */
1975	case wait_rbrace:
1976	  switch (c)
1977	    {
1978	    case_whitespace:
1979	      break;
1980	    case '}':
1981	      buffer[bufpos] = '\0';
1982	      token.string = xstrdup (buffer);
1983	      extract_quotelike_pass3 (&token, EXIT_FAILURE);
1984	      xgettext_current_source_encoding = po_charset_utf8;
1985	      remember_a_message (mlp, NULL, token.string, context, &pos,
1986				  savable_comment);
1987	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1988	      /* FALLTHROUGH */
1989	    default:
1990	      context = null_context;
1991	      state = initial;
1992	      break;
1993	    }
1994	  break;
1995	}
1996    }
1997}
1998
1999/* The last token seen in the token stream.  This is important for the
2000   interpretation of '?' and '/'.  */
2001static token_type_ty last_token;
2002
2003/* Combine characters into tokens.  Discard whitespace.  */
2004
2005static void
2006x_perl_prelex (message_list_ty *mlp, token_ty *tp)
2007{
2008  static char *buffer;
2009  static int bufmax;
2010  int bufpos;
2011  int c;
2012
2013  for (;;)
2014    {
2015      c = phase2_getc ();
2016      tp->line_number = line_number;
2017
2018      switch (c)
2019	{
2020	case EOF:
2021	  tp->type = token_type_eof;
2022	  return;
2023
2024	case '\n':
2025	  if (last_non_comment_line > last_comment_line)
2026	    savable_comment_reset ();
2027	  /* FALLTHROUGH */
2028	case '\t':
2029	case ' ':
2030	  /* Ignore whitespace.  */
2031	  continue;
2032
2033	case '%':
2034	case '@':
2035	case '*':
2036	case '$':
2037	  if (!extract_all)
2038	    {
2039	      extract_variable (mlp, tp, c);
2040	      prefer_division_over_regexp = true;
2041	      return;
2042	    }
2043	  break;
2044	}
2045
2046      last_non_comment_line = tp->line_number;
2047
2048      switch (c)
2049	{
2050	case '.':
2051	  {
2052	    int c2 = phase1_getc ();
2053	    phase1_ungetc (c2);
2054	    if (c2 == '.')
2055	      {
2056		tp->type = token_type_other;
2057		prefer_division_over_regexp = false;
2058		return;
2059	      }
2060	    else if (c2 >= '0' && c2 <= '9')
2061	      {
2062		prefer_division_over_regexp = false;
2063	      }
2064	    else
2065	      {
2066		tp->type = token_type_dot;
2067		prefer_division_over_regexp = true;
2068		return;
2069	      }
2070	  }
2071	  /* FALLTHROUGH */
2072	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2073	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2074	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2075	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2076	case 'Y': case 'Z':
2077	case '_':
2078	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2079	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2080	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2081	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2082	case 'y': case 'z':
2083	case '0': case '1': case '2': case '3': case '4':
2084	case '5': case '6': case '7': case '8': case '9':
2085	  /* Symbol, or part of a number.  */
2086	  prefer_division_over_regexp = true;
2087	  bufpos = 0;
2088	  for (;;)
2089	    {
2090	      if (bufpos >= bufmax)
2091		{
2092		  bufmax = 2 * bufmax + 10;
2093		  buffer = xrealloc (buffer, bufmax);
2094		}
2095	      buffer[bufpos++] = c;
2096	      c = phase1_getc ();
2097	      switch (c)
2098		{
2099		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2100		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2101		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2102		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2103		case 'Y': case 'Z':
2104		case '_':
2105		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2106		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2107		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2108		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2109		case 'y': case 'z':
2110		case '0': case '1': case '2': case '3': case '4':
2111		case '5': case '6': case '7': case '8': case '9':
2112		  continue;
2113
2114		default:
2115		  phase1_ungetc (c);
2116		  break;
2117		}
2118	      break;
2119	    }
2120	  if (bufpos >= bufmax)
2121	    {
2122	      bufmax = 2 * bufmax + 10;
2123	      buffer = xrealloc (buffer, bufmax);
2124	    }
2125	  buffer[bufpos] = '\0';
2126
2127	  if (strcmp (buffer, "__END__") == 0
2128	      || strcmp (buffer, "__DATA__") == 0)
2129	    {
2130	      end_of_file = true;
2131	      tp->type = token_type_eof;
2132	      return;
2133	    }
2134	  else if (strcmp (buffer, "and") == 0
2135		   || strcmp (buffer, "cmp") == 0
2136		   || strcmp (buffer, "eq") == 0
2137		   || strcmp (buffer, "if") == 0
2138		   || strcmp (buffer, "ge") == 0
2139		   || strcmp (buffer, "gt") == 0
2140		   || strcmp (buffer, "le") == 0
2141		   || strcmp (buffer, "lt") == 0
2142		   || strcmp (buffer, "ne") == 0
2143		   || strcmp (buffer, "not") == 0
2144		   || strcmp (buffer, "or") == 0
2145		   || strcmp (buffer, "unless") == 0
2146		   || strcmp (buffer, "while") == 0
2147		   || strcmp (buffer, "xor") == 0)
2148	    {
2149	      tp->type = token_type_named_op;
2150	      tp->string = xstrdup (buffer);
2151	      prefer_division_over_regexp = false;
2152	      return;
2153	    }
2154	  else if (strcmp (buffer, "s") == 0
2155		 || strcmp (buffer, "y") == 0
2156		 || strcmp (buffer, "tr") == 0)
2157	    {
2158	      int delim = phase1_getc ();
2159
2160	      while (is_whitespace (delim))
2161		delim = phase2_getc ();
2162
2163	      if (delim == EOF)
2164		{
2165		  tp->type = token_type_eof;
2166		  return;
2167		}
2168	      if ((delim >= '0' && delim <= '9')
2169		  || (delim >= 'A' && delim <= 'Z')
2170		  || (delim >= 'a' && delim <= 'z'))
2171		{
2172		  /* False positive.  */
2173		  phase2_ungetc (delim);
2174		  tp->type = token_type_symbol;
2175		  tp->sub_type = symbol_type_none;
2176		  tp->string = xstrdup (buffer);
2177		  prefer_division_over_regexp = true;
2178		  return;
2179		}
2180	      extract_triple_quotelike (mlp, tp, delim,
2181					buffer[0] == 's' && delim != '\'');
2182
2183	      /* Eat the following modifiers.  */
2184	      do
2185		c = phase1_getc ();
2186	      while (c >= 'a' && c <= 'z');
2187	      phase1_ungetc (c);
2188	      return;
2189	    }
2190	  else if (strcmp (buffer, "m") == 0)
2191	    {
2192	      int delim = phase1_getc ();
2193
2194	      while (is_whitespace (delim))
2195		delim = phase2_getc ();
2196
2197	      if (delim == EOF)
2198		{
2199		  tp->type = token_type_eof;
2200		  return;
2201		}
2202	      if ((delim >= '0' && delim <= '9')
2203		  || (delim >= 'A' && delim <= 'Z')
2204		  || (delim >= 'a' && delim <= 'z'))
2205		{
2206		  /* False positive.  */
2207		  phase2_ungetc (delim);
2208		  tp->type = token_type_symbol;
2209		  tp->sub_type = symbol_type_none;
2210		  tp->string = xstrdup (buffer);
2211		  prefer_division_over_regexp = true;
2212		  return;
2213		}
2214	      extract_quotelike (tp, delim);
2215	      if (delim != '\'')
2216		interpolate_keywords (mlp, tp->string, line_number);
2217	      free (tp->string);
2218	      drop_reference (tp->comment);
2219	      tp->type = token_type_regex_op;
2220	      prefer_division_over_regexp = true;
2221
2222	      /* Eat the following modifiers.  */
2223	      do
2224		c = phase1_getc ();
2225	      while (c >= 'a' && c <= 'z');
2226	      phase1_ungetc (c);
2227	      return;
2228	    }
2229	  else if (strcmp (buffer, "qq") == 0
2230		   || strcmp (buffer, "q") == 0
2231		   || strcmp (buffer, "qx") == 0
2232		   || strcmp (buffer, "qw") == 0
2233		   || strcmp (buffer, "qr") == 0)
2234	    {
2235	      /* The qw (...) construct is not really a string but we
2236		 can treat in the same manner and then pretend it is
2237		 a symbol.  Rationale: Saying "qw (foo bar)" is the
2238		 same as "my @list = ('foo', 'bar'); @list;".  */
2239
2240	      int delim = phase1_getc ();
2241
2242	      while (is_whitespace (delim))
2243		delim = phase2_getc ();
2244
2245	      if (delim == EOF)
2246		{
2247		  tp->type = token_type_eof;
2248		  return;
2249		}
2250	      prefer_division_over_regexp = true;
2251
2252	      if ((delim >= '0' && delim <= '9')
2253		  || (delim >= 'A' && delim <= 'Z')
2254		  || (delim >= 'a' && delim <= 'z'))
2255		{
2256		  /* False positive.  */
2257		  phase2_ungetc (delim);
2258		  tp->type = token_type_symbol;
2259		  tp->sub_type = symbol_type_none;
2260		  tp->string = xstrdup (buffer);
2261		  prefer_division_over_regexp = true;
2262		  return;
2263		}
2264
2265	      extract_quotelike (tp, delim);
2266
2267	      switch (buffer[1])
2268		{
2269		case 'q':
2270		case 'x':
2271		  tp->type = token_type_string;
2272		  tp->sub_type = string_type_qq;
2273		  interpolate_keywords (mlp, tp->string, line_number);
2274		  break;
2275		case 'r':
2276		  drop_reference (tp->comment);
2277		  tp->type = token_type_regex_op;
2278		  break;
2279		case 'w':
2280		  drop_reference (tp->comment);
2281		  tp->type = token_type_symbol;
2282		  tp->sub_type = symbol_type_none;
2283		  break;
2284		case '\0':
2285		  tp->type = token_type_string;
2286		  tp->sub_type = string_type_q;
2287		  break;
2288		default:
2289		  abort ();
2290		}
2291	      return;
2292	    }
2293	  else if (strcmp (buffer, "grep") == 0
2294		   || strcmp (buffer, "split") == 0)
2295	    {
2296	      prefer_division_over_regexp = false;
2297	    }
2298	  tp->type = token_type_symbol;
2299	  tp->sub_type = (strcmp (buffer, "sub") == 0
2300			  ? symbol_type_sub
2301			  : symbol_type_none);
2302	  tp->string = xstrdup (buffer);
2303	  return;
2304
2305	case '"':
2306	  prefer_division_over_regexp = true;
2307	  extract_quotelike (tp, c);
2308	  tp->sub_type = string_type_qq;
2309	  interpolate_keywords (mlp, tp->string, line_number);
2310	  return;
2311
2312	case '`':
2313	  prefer_division_over_regexp = true;
2314	  extract_quotelike (tp, c);
2315	  tp->sub_type = string_type_qq;
2316	  interpolate_keywords (mlp, tp->string, line_number);
2317	  return;
2318
2319	case '\'':
2320	  prefer_division_over_regexp = true;
2321	  extract_quotelike (tp, c);
2322	  tp->sub_type = string_type_q;
2323	  return;
2324
2325	case '(':
2326	  c = phase2_getc ();
2327	  if (c == ')')
2328	    /* Ignore empty list.  */
2329	    continue;
2330	  else
2331	    phase2_ungetc (c);
2332	  tp->type = token_type_lparen;
2333	  prefer_division_over_regexp = false;
2334	  return;
2335
2336	case ')':
2337	  tp->type = token_type_rparen;
2338	  prefer_division_over_regexp = true;
2339	  return;
2340
2341	case '{':
2342	  tp->type = token_type_lbrace;
2343	  prefer_division_over_regexp = false;
2344	  return;
2345
2346	case '}':
2347	  tp->type = token_type_rbrace;
2348	  prefer_division_over_regexp = false;
2349	  return;
2350
2351	case '[':
2352	  tp->type = token_type_lbracket;
2353	  prefer_division_over_regexp = false;
2354	  return;
2355
2356	case ']':
2357	  tp->type = token_type_rbracket;
2358	  prefer_division_over_regexp = false;
2359	  return;
2360
2361	case ';':
2362	  tp->type = token_type_semicolon;
2363	  prefer_division_over_regexp = false;
2364	  return;
2365
2366	case ',':
2367	  tp->type = token_type_comma;
2368	  prefer_division_over_regexp = false;
2369	  return;
2370
2371	case '=':
2372	  /* Check for fat comma.  */
2373	  c = phase1_getc ();
2374	  if (c == '>')
2375	    {
2376	      tp->type = token_type_fat_comma;
2377	      return;
2378	    }
2379	  else if (linepos == 2
2380		   && (last_token == token_type_semicolon
2381		       || last_token == token_type_rbrace)
2382		   && ((c >= 'A' && c <='Z')
2383		       || (c >= 'a' && c <= 'z')))
2384	    {
2385#if DEBUG_PERL
2386	      fprintf (stderr, "%s:%d: start pod section\n",
2387		       real_file_name, line_number);
2388#endif
2389	      skip_pod ();
2390#if DEBUG_PERL
2391	      fprintf (stderr, "%s:%d: end pod section\n",
2392		       real_file_name, line_number);
2393#endif
2394	      continue;
2395	    }
2396	  phase1_ungetc (c);
2397	  tp->type = token_type_other;
2398	  prefer_division_over_regexp = false;
2399	  return;
2400
2401	case '<':
2402	  /* Check for <<EOF and friends.  */
2403	  prefer_division_over_regexp = false;
2404	  c = phase1_getc ();
2405	  if (c == '<')
2406	    {
2407	      c = phase1_getc ();
2408	      if (c == '\'')
2409		{
2410		  char *string;
2411		  extract_quotelike (tp, c);
2412		  string = get_here_document (tp->string);
2413		  free (tp->string);
2414		  tp->string = string;
2415		  tp->type = token_type_string;
2416		  tp->sub_type = string_type_verbatim;
2417		  tp->line_number = line_number + 1;
2418		  return;
2419		}
2420	      else if (c == '"')
2421		{
2422		  char *string;
2423		  extract_quotelike (tp, c);
2424		  string = get_here_document (tp->string);
2425		  free (tp->string);
2426		  tp->string = string;
2427		  tp->type = token_type_string;
2428		  tp->sub_type = string_type_qq;
2429		  tp->line_number = line_number + 1;
2430		  interpolate_keywords (mlp, tp->string, line_number + 1);
2431		  return;
2432		}
2433	      else if ((c >= 'A' && c <= 'Z')
2434		       || (c >= 'a' && c <= 'z')
2435		       || c == '_')
2436		{
2437		  bufpos = 0;
2438		  while ((c >= 'A' && c <= 'Z')
2439			 || (c >= 'a' && c <= 'z')
2440			 || (c >= '0' && c <= '9')
2441			 || c == '_' || c >= 0x80)
2442		    {
2443		      if (bufpos >= bufmax)
2444			{
2445			  bufmax = 2 * bufmax + 10;
2446			  buffer = xrealloc (buffer, bufmax);
2447			}
2448		      buffer[bufpos++] = c;
2449		      c = phase1_getc ();
2450		    }
2451		  if (c == EOF)
2452		    {
2453		      tp->type = token_type_eof;
2454		      return;
2455		    }
2456		  else
2457		    {
2458		      char *string;
2459		      phase1_ungetc (c);
2460		      if (bufpos >= bufmax)
2461			{
2462			  bufmax = 2 * bufmax + 10;
2463			  buffer = xrealloc (buffer, bufmax);
2464			}
2465		      buffer[bufpos++] = '\0';
2466		      string = get_here_document (buffer);
2467		      tp->string = string;
2468		      tp->type = token_type_string;
2469		      tp->sub_type = string_type_qq;
2470		      tp->comment = add_reference (savable_comment);
2471		      tp->line_number = line_number + 1;
2472		      interpolate_keywords (mlp, tp->string, line_number + 1);
2473		      return;
2474		    }
2475		}
2476	      else
2477		{
2478		  tp->type = token_type_other;
2479		  return;
2480		}
2481	    }
2482	  else
2483	    {
2484	      phase1_ungetc (c);
2485	      tp->type = token_type_other;
2486	    }
2487	  return;  /* End of case '>'.  */
2488
2489	case '-':
2490	  /* Check for dereferencing operator.  */
2491	  c = phase1_getc ();
2492	  if (c == '>')
2493	    {
2494	      tp->type = token_type_dereference;
2495	      return;
2496	    }
2497	  else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2498	    {
2499	      /* One of the -X (filetest) functions.  We play safe
2500		 and accept all alphabetical characters here.  */
2501	      tp->type = token_type_other;
2502	      return;
2503	    }
2504	  phase1_ungetc (c);
2505	  tp->type = token_type_other;
2506	  prefer_division_over_regexp = false;
2507	  return;
2508
2509	case '/':
2510	case '?':
2511	  if (!prefer_division_over_regexp)
2512	    {
2513	      extract_quotelike (tp, c);
2514	      interpolate_keywords (mlp, tp->string, line_number);
2515	      free (tp->string);
2516	      drop_reference (tp->comment);
2517	      tp->type = token_type_other;
2518	      prefer_division_over_regexp = true;
2519	      /* Eat the following modifiers.  */
2520	      do
2521		c = phase1_getc ();
2522	      while (c >= 'a' && c <= 'z');
2523	      phase1_ungetc (c);
2524	      return;
2525	    }
2526	  /* FALLTHROUGH */
2527
2528	default:
2529	  /* We could carefully recognize each of the 2 and 3 character
2530	     operators, but it is not necessary, as we only need to recognize
2531	     gettext invocations.  Don't bother.  */
2532	  tp->type = token_type_other;
2533	  prefer_division_over_regexp = false;
2534	  return;
2535	}
2536    }
2537}
2538
2539
2540/* A token stack used as a lookahead buffer.  */
2541
2542typedef struct token_stack_ty token_stack_ty;
2543struct token_stack_ty
2544{
2545  token_ty **items;
2546  size_t nitems;
2547  size_t nitems_max;
2548};
2549
2550static struct token_stack_ty token_stack;
2551
2552#if DEBUG_PERL
2553/* Dumps all resources allocated by stack STACK.  */
2554static int
2555token_stack_dump (token_stack_ty *stack)
2556{
2557  size_t i;
2558
2559  fprintf (stderr, "BEGIN STACK DUMP\n");
2560  for (i = 0; i < stack->nitems; i++)
2561    {
2562      token_ty *token = stack->items[i];
2563      fprintf (stderr, "  [%s]\n", token2string (token));
2564      switch (token->type)
2565	{
2566	case token_type_named_op:
2567	case token_type_string:
2568	case token_type_symbol:
2569	case token_type_variable:
2570	  fprintf (stderr, "    string: %s\n", token->string);
2571	  break;
2572	}
2573    }
2574  fprintf (stderr, "END STACK DUMP\n");
2575  return 0;
2576}
2577#endif
2578
2579/* Pushes the token TOKEN onto the stack STACK.  */
2580static inline void
2581token_stack_push (token_stack_ty *stack, token_ty *token)
2582{
2583  if (stack->nitems >= stack->nitems_max)
2584    {
2585      size_t nbytes;
2586
2587      stack->nitems_max = 2 * stack->nitems_max + 4;
2588      nbytes = stack->nitems_max * sizeof (token_ty *);
2589      stack->items = xrealloc (stack->items, nbytes);
2590    }
2591  stack->items[stack->nitems++] = token;
2592}
2593
2594/* Pops the most recently pushed token from the stack STACK and returns it.
2595   Returns NULL if the stack is empty.  */
2596static inline token_ty *
2597token_stack_pop (token_stack_ty *stack)
2598{
2599  if (stack->nitems > 0)
2600    return stack->items[--(stack->nitems)];
2601  else
2602    return NULL;
2603}
2604
2605/* Return the top of the stack without removing it from the stack, or
2606   NULL if the stack is empty.  */
2607static inline token_ty *
2608token_stack_peek (const token_stack_ty *stack)
2609{
2610  if (stack->nitems > 0)
2611    return stack->items[stack->nitems - 1];
2612  else
2613    return NULL;
2614}
2615
2616/* Frees all resources allocated by stack STACK.  */
2617static inline void
2618token_stack_free (token_stack_ty *stack)
2619{
2620  size_t i;
2621
2622  for (i = 0; i < stack->nitems; i++)
2623    free_token (stack->items[i]);
2624  free (stack->items);
2625}
2626
2627
2628static token_ty *
2629x_perl_lex (message_list_ty *mlp)
2630{
2631#if DEBUG_PERL
2632  int dummy = token_stack_dump (&token_stack);
2633#endif
2634  token_ty *tp = token_stack_pop (&token_stack);
2635
2636  if (!tp)
2637    {
2638      tp = XMALLOC (token_ty);
2639      x_perl_prelex (mlp, tp);
2640#if DEBUG_PERL
2641      fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2642	       real_file_name, line_number, token2string (tp));
2643#endif
2644    }
2645#if DEBUG_PERL
2646  else
2647    {
2648      fprintf (stderr, "%s:%d: %s recycled from stack\n",
2649	       real_file_name, line_number, token2string (tp));
2650    }
2651#endif
2652
2653  /* A symbol followed by a fat comma is really a single-quoted string.
2654     Function definitions or forward declarations also need a special
2655     handling because the dollars and at signs inside the parentheses
2656     must not be interpreted as the beginning of a variable ')'.  */
2657  if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2658    {
2659      token_ty *next = token_stack_peek (&token_stack);
2660
2661      if (!next)
2662	{
2663#if DEBUG_PERL
2664	  fprintf (stderr, "%s:%d: pre-fetching next token\n",
2665		   real_file_name, line_number);
2666#endif
2667	  next = x_perl_lex (mlp);
2668	  x_perl_unlex (next);
2669#if DEBUG_PERL
2670	  fprintf (stderr, "%s:%d: unshifted next token\n",
2671		   real_file_name, line_number);
2672#endif
2673	}
2674
2675#if DEBUG_PERL
2676      fprintf (stderr, "%s:%d: next token is %s\n",
2677	       real_file_name, line_number, token2string (next));
2678#endif
2679
2680      if (next->type == token_type_fat_comma)
2681	{
2682	  tp->type = token_type_string;
2683	  tp->sub_type = string_type_q;
2684	  tp->comment = add_reference (savable_comment);
2685#if DEBUG_PERL
2686	  fprintf (stderr,
2687		   "%s:%d: token %s mutated to token_type_string\n",
2688		   real_file_name, line_number, token2string (tp));
2689#endif
2690	}
2691      else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2692	       && next->type == token_type_symbol)
2693        {
2694	  /* Start of a function declaration or definition.  Mark this
2695	     symbol as a function name, so that we can later eat up
2696	     possible prototype information.  */
2697#if DEBUG_PERL
2698	  fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2699		   real_file_name, line_number, next->string);
2700#endif
2701	  next->sub_type = symbol_type_function;
2702	}
2703      else if (tp->type == token_type_symbol
2704	       && (tp->sub_type == symbol_type_sub
2705		   || tp->sub_type == symbol_type_function)
2706	       && next->type == token_type_lparen)
2707        {
2708	  /* For simplicity we simply consume everything up to the
2709	     closing parenthesis.  Actually only a limited set of
2710	     characters is allowed inside parentheses but we leave
2711	     complaints to the interpreter and are prepared for
2712	     future extensions to the Perl syntax.  */
2713	  int c;
2714
2715#if DEBUG_PERL
2716	  fprintf (stderr, "%s:%d: consuming prototype information\n",
2717		   real_file_name, line_number);
2718#endif
2719
2720	  do
2721	    {
2722	      c = phase1_getc ();
2723#if DEBUG_PERL
2724	      fprintf (stderr, "  consuming character '%c'\n", c);
2725#endif
2726	    }
2727	  while (c != EOF && c != ')');
2728	  phase1_ungetc (c);
2729	}
2730    }
2731
2732  return tp;
2733}
2734
2735static void
2736x_perl_unlex (token_ty *tp)
2737{
2738  token_stack_push (&token_stack, tp);
2739}
2740
2741
2742/* ========================= Extracting strings.  ========================== */
2743
2744/* Assuming TP is a string token, this function accumulates all subsequent
2745   . string2 . string3 ... to the string.  (String concatenation.)  */
2746
2747static char *
2748collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2749{
2750  char *string;
2751  size_t len;
2752
2753  extract_quotelike_pass3 (tp, error_level);
2754  string = xstrdup (tp->string);
2755  len = strlen (tp->string) + 1;
2756
2757  for (;;)
2758    {
2759      int c;
2760
2761      do
2762	c = phase2_getc ();
2763      while (is_whitespace (c));
2764
2765      if (c != '.')
2766	{
2767	  phase2_ungetc (c);
2768	  return string;
2769	}
2770
2771      do
2772	c = phase2_getc ();
2773      while (is_whitespace (c));
2774
2775      phase2_ungetc (c);
2776
2777      if (c == '"' || c == '\'' || c == '`'
2778	  || (!prefer_division_over_regexp && (c == '/' || c == '?'))
2779	  || c == 'q')
2780	{
2781	  token_ty *qstring = x_perl_lex (mlp);
2782	  if (qstring->type != token_type_string)
2783	    {
2784	      /* assert (qstring->type == token_type_symbol) */
2785	      x_perl_unlex (qstring);
2786	      return string;
2787	    }
2788
2789	  extract_quotelike_pass3 (qstring, error_level);
2790	  len += strlen (qstring->string);
2791	  string = xrealloc (string, len);
2792	  strcat (string, qstring->string);
2793	  free_token (qstring);
2794	}
2795    }
2796}
2797
2798/* The file is broken into tokens.  Scan the token stream, looking for
2799   a keyword, followed by a left paren, followed by a string.  When we
2800   see this sequence, we have something to remember.  We assume we are
2801   looking at a valid Perl program, and leave the complaints about
2802   the grammar to the compiler.
2803
2804     Normal handling: Look for
2805       keyword ( ... msgid ... )
2806     Plural handling: Look for
2807       keyword ( ... msgid ... msgid_plural ... )
2808
2809   We use recursion because the arguments before msgid or between msgid
2810   and msgid_plural can contain subexpressions of the same form.
2811
2812   In Perl, parentheses around function arguments can be omitted.
2813
2814   The general rules are:
2815     1) Functions declared with a prototype take exactly the specified number
2816        of arguments.
2817          sub one_arg ($) { ... }
2818          sub two_args ($$) { ... }
2819     2) When a function name is immediately followed by an opening parenthesis,
2820        the argument list ends at the corresponding closing parenthesis.
2821
2822   If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2823   function with an explicit argument list and the wrong number of arguments,
2824   the program is invalid:
2825     sub two_args ($$) { ... }
2826     foo two_args (x), y             - invalid due to rules 1 and 2
2827
2828   Ambiguities are resolved as follows:
2829     3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2830        declared with a prototype of exactly one argument take exactly one
2831        argument:
2832          foo sin x, y  ==>  foo (sin (x), y)
2833          sub one_arg ($) { ... }
2834          foo one_arg x, y, z  ==>  foo (one_arg (x), y, z)
2835     4) Other identifiers, if not immediately followed by an opening
2836        parenthesis, consume the entire remaining argument list:
2837          foo bar x, y  ==>  foo (bar (x, y))
2838          sub two_args ($$) { ... }
2839          foo two_args x, y  ==>  foo (two_args (x, y))
2840
2841   Other series of comma separated expressions without a function name at
2842   the beginning are comma expressions:
2843          sub two_args ($$) { ... }
2844          foo two_args x, (y, z)  ==>  foo (two_args (x, (y, z)))
2845   Note that the evaluation of comma expressions returns a list of values
2846   when in list context (e.g. inside the argument list of a function without
2847   prototype) but only one value when inside the argument list of a function
2848   with a prototype:
2849          sub print3 ($$$) { print @_ }
2850          print3 5, (6, 7), 8  ==>  578
2851          print 5, (6, 7), 8  ==>  5678
2852
2853   Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
2854     sin (x, y)                      - invalid due to rules 2 and 3
2855     sub one_arg ($) { ... }
2856     one_arg (x, y)                  - invalid due to rules 2 and 3
2857     sub two_args ($$) { ... }
2858     foo two_args x, y, z            - invalid due to rules 1 and 4
2859 */
2860
2861/* Extract messages until the next balanced closing parenthesis.
2862   Extracted messages are added to MLP.
2863
2864   DELIM can be either token_type_rbrace, token_type_rbracket,
2865   token_type_rparen.  Additionally, if COMMA_DELIM is true, parsing
2866   stops at the next comma outside parentheses.
2867
2868   ARG is the current argument list position, starts with 1.
2869   ARGPARSER is the corresponding argument list parser.
2870
2871   Returns true for EOF, false otherwise.  */
2872
2873static bool
2874extract_balanced (message_list_ty *mlp,
2875		  token_type_ty delim, bool eat_delim, bool comma_delim,
2876		  flag_context_ty outer_context,
2877		  flag_context_list_iterator_ty context_iter,
2878		  int arg, struct arglist_parser *argparser)
2879{
2880  /* Whether to implicitly assume the next tokens are arguments even without
2881     a '('.  */
2882  bool next_is_argument = false;
2883  /* Parameters of the keyword just seen.  Defined only when next_is_argument
2884     is true.  */
2885  const struct callshapes *next_shapes = NULL;
2886  struct arglist_parser *next_argparser = NULL;
2887
2888  /* Whether to not consider strings until the next comma.  */
2889  bool skip_until_comma = false;
2890
2891  /* Context iterator that will be used if the next token is a '('.  */
2892  flag_context_list_iterator_ty next_context_iter =
2893    passthrough_context_list_iterator;
2894  /* Current context.  */
2895  flag_context_ty inner_context =
2896    inherited_context (outer_context,
2897		       flag_context_list_iterator_advance (&context_iter));
2898
2899#if DEBUG_PERL
2900  static int nesting_level = 0;
2901
2902  ++nesting_level;
2903#endif
2904
2905  last_token = token_type_semicolon;  /* Safe assumption.  */
2906  prefer_division_over_regexp = false;
2907
2908  for (;;)
2909    {
2910      /* The current token.  */
2911      token_ty *tp;
2912
2913      tp = x_perl_lex (mlp);
2914
2915      last_token = tp->type;
2916
2917      if (delim == tp->type)
2918	{
2919	  xgettext_current_source_encoding = po_charset_utf8;
2920	  arglist_parser_done (argparser, arg);
2921	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2922	  if (next_argparser != NULL)
2923	    free (next_argparser);
2924#if DEBUG_PERL
2925	  fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2926		   logical_file_name, tp->line_number, --nesting_level);
2927#endif
2928	  if (eat_delim)
2929	    free_token (tp);
2930	  else
2931	    /* Preserve the delimiter for the caller.  */
2932	    x_perl_unlex (tp);
2933	  return false;
2934	}
2935
2936      if (comma_delim && tp->type == token_type_comma)
2937	{
2938	  xgettext_current_source_encoding = po_charset_utf8;
2939	  arglist_parser_done (argparser, arg);
2940	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2941	  if (next_argparser != NULL)
2942	    free (next_argparser);
2943#if DEBUG_PERL
2944	  fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
2945		   logical_file_name, tp->line_number, --nesting_level);
2946#endif
2947	  x_perl_unlex (tp);
2948	  return false;
2949	}
2950
2951      if (next_is_argument && tp->type != token_type_lparen)
2952	{
2953	  /* An argument list starts, even though there is no '('.  */
2954	  bool next_comma_delim;
2955
2956	  x_perl_unlex (tp);
2957
2958	  if (next_shapes != NULL)
2959	    /* We know something about the function being called.  Assume
2960	       that it consumes only one argument if no argument number or
2961	       total > 1 is specified.  */
2962	    {
2963	      size_t i;
2964
2965	      next_comma_delim = true;
2966	      for (i = 0; i < next_shapes->nshapes; i++)
2967		{
2968		  const struct callshape *shape = &next_shapes->shapes[i];
2969
2970		  if (shape->argnum1 > 1
2971		      || shape->argnum2 > 1
2972		      || shape->argnumc > 1
2973		      || shape->argtotal > 1)
2974		    next_comma_delim = false;
2975		}
2976	    }
2977	  else
2978	    /* We know nothing about the function being called.  It could be
2979	       a function prototyped to take only one argument, or on the other
2980	       hand it could be prototyped to take more than one argument or an
2981	       arbitrary argument list or it could be unprototyped.  Due to
2982	       the way the parser works, assuming the first case gives the
2983	       best results.  */
2984	    next_comma_delim = true;
2985
2986	  if (extract_balanced (mlp, delim, false, next_comma_delim,
2987				inner_context, next_context_iter,
2988				1, next_argparser))
2989	    {
2990	      xgettext_current_source_encoding = po_charset_utf8;
2991	      arglist_parser_done (argparser, arg);
2992	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2993	      return true;
2994	    }
2995
2996	  next_is_argument = false;
2997	  next_argparser = NULL;
2998	  next_context_iter = null_context_list_iterator;
2999	  continue;
3000	}
3001
3002      switch (tp->type)
3003	{
3004	case token_type_symbol:
3005#if DEBUG_PERL
3006	  fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
3007		   logical_file_name, tp->line_number, nesting_level,
3008		   tp->string);
3009#endif
3010
3011	  {
3012	    void *keyword_value;
3013
3014	    if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3015				 &keyword_value) == 0)
3016	      {
3017		const struct callshapes *shapes =
3018		  (const struct callshapes *) keyword_value;
3019
3020		last_token = token_type_keyword_symbol;
3021		next_shapes = shapes;
3022		next_argparser = arglist_parser_alloc (mlp, shapes);
3023	      }
3024	    else
3025	      {
3026		next_shapes = NULL;
3027		next_argparser = arglist_parser_alloc (mlp, NULL);
3028	      }
3029	  }
3030	  next_is_argument = true;
3031	  next_context_iter =
3032	    flag_context_list_iterator (
3033	      flag_context_list_table_lookup (
3034		flag_context_list_table,
3035		tp->string, strlen (tp->string)));
3036	  break;
3037
3038	case token_type_variable:
3039#if DEBUG_PERL
3040	  fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3041		   logical_file_name, tp->line_number, nesting_level, tp->string);
3042#endif
3043	  prefer_division_over_regexp = true;
3044	  next_is_argument = false;
3045	  if (next_argparser != NULL)
3046	    free (next_argparser);
3047	  next_argparser = NULL;
3048	  next_context_iter = null_context_list_iterator;
3049	  break;
3050
3051	case token_type_lparen:
3052#if DEBUG_PERL
3053	  fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3054		   logical_file_name, tp->line_number, nesting_level);
3055#endif
3056	  if (next_is_argument)
3057	    {
3058	      /* Parse the argument list of a function call.  */
3059	      if (extract_balanced (mlp, token_type_rparen, true, false,
3060				    inner_context, next_context_iter,
3061				    1, next_argparser))
3062		{
3063		  xgettext_current_source_encoding = po_charset_utf8;
3064		  arglist_parser_done (argparser, arg);
3065		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3066		  return true;
3067		}
3068	      next_is_argument = false;
3069	      next_argparser = NULL;
3070	    }
3071	  else
3072	    {
3073	      /* Parse a parenthesized expression or comma expression.  */
3074	      if (extract_balanced (mlp, token_type_rparen, true, false,
3075				    inner_context, next_context_iter,
3076				    arg, arglist_parser_clone (argparser)))
3077		{
3078		  xgettext_current_source_encoding = po_charset_utf8;
3079		  arglist_parser_done (argparser, arg);
3080		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3081		  if (next_argparser != NULL)
3082		    free (next_argparser);
3083		  free_token (tp);
3084		  return true;
3085		}
3086	      next_is_argument = false;
3087	      if (next_argparser != NULL)
3088		free (next_argparser);
3089	      next_argparser = NULL;
3090	    }
3091	  skip_until_comma = true;
3092	  next_context_iter = null_context_list_iterator;
3093	  break;
3094
3095	case token_type_rparen:
3096#if DEBUG_PERL
3097	  fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3098		   logical_file_name, tp->line_number, nesting_level);
3099#endif
3100	  next_is_argument = false;
3101	  if (next_argparser != NULL)
3102	    free (next_argparser);
3103	  next_argparser = NULL;
3104	  skip_until_comma = true;
3105	  next_context_iter = null_context_list_iterator;
3106	  break;
3107
3108	case token_type_comma:
3109	case token_type_fat_comma:
3110#if DEBUG_PERL
3111	  fprintf (stderr, "%s:%d: type comma (%d)\n",
3112		   logical_file_name, tp->line_number, nesting_level);
3113#endif
3114	  if (arglist_parser_decidedp (argparser, arg))
3115	    {
3116	      /* We have missed the argument.  */
3117	      xgettext_current_source_encoding = po_charset_utf8;
3118	      arglist_parser_done (argparser, arg);
3119	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3120	      argparser = arglist_parser_alloc (mlp, NULL);
3121	      arg = 0;
3122	    }
3123	  arg++;
3124#if DEBUG_PERL
3125	  fprintf (stderr, "%s:%d: arg: %d\n",
3126		   real_file_name, tp->line_number, arg);
3127#endif
3128	  inner_context =
3129	    inherited_context (outer_context,
3130			       flag_context_list_iterator_advance (
3131				 &context_iter));
3132	  next_is_argument = false;
3133	  if (next_argparser != NULL)
3134	    free (next_argparser);
3135	  next_argparser = NULL;
3136	  skip_until_comma = false;
3137	  next_context_iter = passthrough_context_list_iterator;
3138	  break;
3139
3140	case token_type_string:
3141#if DEBUG_PERL
3142	  fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3143		   logical_file_name, tp->line_number, nesting_level,
3144		   tp->string);
3145#endif
3146
3147	  if (extract_all)
3148	    {
3149	      char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3150	      lex_pos_ty pos;
3151
3152	      pos.file_name = logical_file_name;
3153	      pos.line_number = tp->line_number;
3154	      xgettext_current_source_encoding = po_charset_utf8;
3155	      remember_a_message (mlp, NULL, string, inner_context, &pos,
3156				  tp->comment);
3157	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3158	    }
3159	  else if (!skip_until_comma)
3160	    {
3161	      /* Need to collect the complete string, with error checking,
3162		 only if the argument ARG is used in ARGPARSER.  */
3163	      bool must_collect = false;
3164	      {
3165		size_t nalternatives = argparser->nalternatives;
3166		size_t i;
3167
3168		for (i = 0; i < nalternatives; i++)
3169		  {
3170		    struct partial_call *cp = &argparser->alternative[i];
3171
3172		    if (arg == cp->argnumc
3173			|| arg == cp->argnum1 || arg == cp->argnum2)
3174		      must_collect = true;
3175		  }
3176	      }
3177
3178	      if (must_collect)
3179		{
3180		  char *string = collect_message (mlp, tp, EXIT_FAILURE);
3181
3182		  xgettext_current_source_encoding = po_charset_utf8;
3183		  arglist_parser_remember (argparser, arg,
3184					   string, inner_context,
3185					   logical_file_name, tp->line_number,
3186					   tp->comment);
3187		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3188		}
3189	    }
3190
3191	  if (arglist_parser_decidedp (argparser, arg))
3192	    {
3193	      xgettext_current_source_encoding = po_charset_utf8;
3194	      arglist_parser_done (argparser, arg);
3195	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3196	      argparser = arglist_parser_alloc (mlp, NULL);
3197	    }
3198
3199	  next_is_argument = false;
3200	  if (next_argparser != NULL)
3201	    free (next_argparser);
3202	  next_argparser = NULL;
3203	  next_context_iter = null_context_list_iterator;
3204	  break;
3205
3206	case token_type_eof:
3207#if DEBUG_PERL
3208	  fprintf (stderr, "%s:%d: type EOF (%d)\n",
3209		   logical_file_name, tp->line_number, nesting_level);
3210#endif
3211	  xgettext_current_source_encoding = po_charset_utf8;
3212	  arglist_parser_done (argparser, arg);
3213	  xgettext_current_source_encoding = xgettext_global_source_encoding;
3214	  if (next_argparser != NULL)
3215	    free (next_argparser);
3216	  next_argparser = NULL;
3217	  free_token (tp);
3218	  return true;
3219
3220	case token_type_lbrace:
3221#if DEBUG_PERL
3222	  fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3223		   logical_file_name, tp->line_number, nesting_level);
3224#endif
3225	  if (extract_balanced (mlp, token_type_rbrace, true, false,
3226				null_context, null_context_list_iterator,
3227				1, arglist_parser_alloc (mlp, NULL)))
3228	    {
3229	      xgettext_current_source_encoding = po_charset_utf8;
3230	      arglist_parser_done (argparser, arg);
3231	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3232	      if (next_argparser != NULL)
3233		free (next_argparser);
3234	      free_token (tp);
3235	      return true;
3236	    }
3237	  next_is_argument = false;
3238	  if (next_argparser != NULL)
3239	    free (next_argparser);
3240	  next_argparser = NULL;
3241	  next_context_iter = null_context_list_iterator;
3242	  break;
3243
3244	case token_type_rbrace:
3245#if DEBUG_PERL
3246	  fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3247		   logical_file_name, tp->line_number, nesting_level);
3248#endif
3249	  next_is_argument = false;
3250	  if (next_argparser != NULL)
3251	    free (next_argparser);
3252	  next_argparser = NULL;
3253	  next_context_iter = null_context_list_iterator;
3254	  break;
3255
3256	case token_type_lbracket:
3257#if DEBUG_PERL
3258	  fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3259		   logical_file_name, tp->line_number, nesting_level);
3260#endif
3261	  if (extract_balanced (mlp, token_type_rbracket, true, false,
3262				null_context, null_context_list_iterator,
3263				1, arglist_parser_alloc (mlp, NULL)))
3264	    {
3265	      xgettext_current_source_encoding = po_charset_utf8;
3266	      arglist_parser_done (argparser, arg);
3267	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3268	      if (next_argparser != NULL)
3269		free (next_argparser);
3270	      free_token (tp);
3271	      return true;
3272	    }
3273	  next_is_argument = false;
3274	  if (next_argparser != NULL)
3275	    free (next_argparser);
3276	  next_argparser = NULL;
3277	  next_context_iter = null_context_list_iterator;
3278	  break;
3279
3280	case token_type_rbracket:
3281#if DEBUG_PERL
3282	  fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3283		   logical_file_name, tp->line_number, nesting_level);
3284#endif
3285	  next_is_argument = false;
3286	  if (next_argparser != NULL)
3287	    free (next_argparser);
3288	  next_argparser = NULL;
3289	  next_context_iter = null_context_list_iterator;
3290	  break;
3291
3292	case token_type_semicolon:
3293#if DEBUG_PERL
3294	  fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3295		   logical_file_name, tp->line_number, nesting_level);
3296#endif
3297
3298	  /* The ultimate sign.  */
3299	  xgettext_current_source_encoding = po_charset_utf8;
3300	  arglist_parser_done (argparser, arg);
3301	  xgettext_current_source_encoding = xgettext_global_source_encoding;
3302	  argparser = arglist_parser_alloc (mlp, NULL);
3303
3304	  /* FIXME: Instead of resetting outer_context here, it may be better
3305	     to recurse in the next_is_argument handling above, waiting for
3306	     the next semicolon or other statement terminator.  */
3307	  outer_context = null_context;
3308	  context_iter = null_context_list_iterator;
3309	  next_is_argument = false;
3310	  if (next_argparser != NULL)
3311	    free (next_argparser);
3312	  next_argparser = NULL;
3313	  next_context_iter = passthrough_context_list_iterator;
3314	  inner_context =
3315	    inherited_context (outer_context,
3316			       flag_context_list_iterator_advance (
3317				 &context_iter));
3318	  break;
3319
3320	case token_type_dereference:
3321#if DEBUG_PERL
3322	  fprintf (stderr, "%s:%d: type dereference (%d)\n",
3323		   logical_file_name, tp->line_number, nesting_level);
3324#endif
3325	  next_is_argument = false;
3326	  if (next_argparser != NULL)
3327	    free (next_argparser);
3328	  next_argparser = NULL;
3329	  next_context_iter = null_context_list_iterator;
3330	  break;
3331
3332	case token_type_dot:
3333#if DEBUG_PERL
3334	  fprintf (stderr, "%s:%d: type dot (%d)\n",
3335		   logical_file_name, tp->line_number, nesting_level);
3336#endif
3337	  next_is_argument = false;
3338	  if (next_argparser != NULL)
3339	    free (next_argparser);
3340	  next_argparser = NULL;
3341	  next_context_iter = null_context_list_iterator;
3342	  break;
3343
3344	case token_type_named_op:
3345#if DEBUG_PERL
3346	  fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3347		   logical_file_name, tp->line_number, nesting_level,
3348		   tp->string);
3349#endif
3350	  next_is_argument = false;
3351	  if (next_argparser != NULL)
3352	    free (next_argparser);
3353	  next_argparser = NULL;
3354	  next_context_iter = null_context_list_iterator;
3355	  break;
3356
3357	case token_type_regex_op:
3358#if DEBUG_PERL
3359	  fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3360		   logical_file_name, tp->line_number, nesting_level);
3361#endif
3362	  next_is_argument = false;
3363	  if (next_argparser != NULL)
3364	    free (next_argparser);
3365	  next_argparser = NULL;
3366	  next_context_iter = null_context_list_iterator;
3367	  break;
3368
3369	case token_type_other:
3370#if DEBUG_PERL
3371	  fprintf (stderr, "%s:%d: type other (%d)\n",
3372		   logical_file_name, tp->line_number, nesting_level);
3373#endif
3374	  next_is_argument = false;
3375	  if (next_argparser != NULL)
3376	    free (next_argparser);
3377	  next_argparser = NULL;
3378	  next_context_iter = null_context_list_iterator;
3379	  break;
3380
3381	default:
3382	  fprintf (stderr, "%s:%d: unknown token type %d\n",
3383		   real_file_name, tp->line_number, tp->type);
3384	  abort ();
3385	}
3386
3387      free_token (tp);
3388    }
3389}
3390
3391void
3392extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3393	      flag_context_list_table_ty *flag_table,
3394	      msgdomain_list_ty *mdlp)
3395{
3396  message_list_ty *mlp = mdlp->item[0]->messages;
3397
3398  fp = f;
3399  real_file_name = real_filename;
3400  logical_file_name = xstrdup (logical_filename);
3401  line_number = 0;
3402
3403  last_comment_line = -1;
3404  last_non_comment_line = -1;
3405
3406  flag_context_list_table = flag_table;
3407
3408  init_keywords ();
3409
3410  token_stack.items = NULL;
3411  token_stack.nitems = 0;
3412  token_stack.nitems_max = 0;
3413  linesize = 0;
3414  linepos = 0;
3415  here_eaten = 0;
3416  end_of_file = false;
3417
3418  /* Eat tokens until eof is seen.  When extract_balanced returns
3419     due to an unbalanced closing brace, just restart it.  */
3420  while (!extract_balanced (mlp, token_type_rbrace, true, false,
3421			    null_context, null_context_list_iterator,
3422			    1, arglist_parser_alloc (mlp, NULL)))
3423    ;
3424
3425  fp = NULL;
3426  real_file_name = NULL;
3427  free (logical_file_name);
3428  logical_file_name = NULL;
3429  line_number = 0;
3430  last_token = token_type_semicolon;
3431  token_stack_free (&token_stack);
3432  here_eaten = 0;
3433  end_of_file = true;
3434}
3435