1/* xgettext Python backend.
2   Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
24#include <assert.h>
25#include <errno.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "message.h"
32#include "xgettext.h"
33#include "x-python.h"
34#include "error.h"
35#include "error-progname.h"
36#include "progname.h"
37#include "basename.h"
38#include "xerror.h"
39#include "xvasprintf.h"
40#include "xalloc.h"
41#include "exit.h"
42#include "c-strstr.h"
43#include "c-ctype.h"
44#include "po-charset.h"
45#include "uniname.h"
46#include "utf16-ucs4.h"
47#include "utf8-ucs4.h"
48#include "ucs4-utf8.h"
49#include "gettext.h"
50
/* Shorthand for translating a message through gettext ().  */
#define _(s) gettext(s)

/* Larger of two values.  Each argument may be evaluated twice, so do not
   pass expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of an array object (not of a pointer).  The argument
   is parenthesized before indexing, so that any array-valued expression may
   be passed.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
56
57
58/* The Python syntax is defined in the Python Reference Manual
59   /usr/share/doc/packages/python/html/ref/index.html.
60   See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
61   Python-2.0/Objects/unicodeobject.c.  */
62
63
64/* ====================== Keyword set customization.  ====================== */
65
/* If true extract all strings.  Set by x_python_extract_all ().  */
static bool extract_all = false;

/* Table of keywords; created and filled by x_python_keyword ().  */
static hash_table keywords;
/* True until x_python_keyword (NULL) is called or init_keywords () has
   inserted the built-in keywords.  */
static bool default_keywords = true;
71
72
/* Request extraction of all strings by setting the extract_all flag.  */
void
x_python_extract_all ()
{
  extract_all = true;
}
78
79
80void
81x_python_keyword (const char *name)
82{
83  if (name == NULL)
84    default_keywords = false;
85  else
86    {
87      const char *end;
88      struct callshape shape;
89      const char *colon;
90
91      if (keywords.table == NULL)
92	hash_init (&keywords, 100);
93
94      split_keywordspec (name, &end, &shape);
95
96      /* The characters between name and end should form a valid C identifier.
97	 A colon means an invalid parse in split_keywordspec().  */
98      colon = strchr (name, ':');
99      if (colon == NULL || colon >= end)
100	insert_keyword_callshape (&keywords, name, end - name, &shape);
101    }
102}
103
104/* Finish initializing the keywords hash table.
105   Called after argument processing, before each file is processed.  */
106static void
107init_keywords ()
108{
109  if (default_keywords)
110    {
111      /* When adding new keywords here, also update the documentation in
112	 xgettext.texi!  */
113      x_python_keyword ("gettext");
114      x_python_keyword ("ugettext");
115      x_python_keyword ("dgettext:2");
116      x_python_keyword ("ngettext:1,2");
117      x_python_keyword ("ungettext:1,2");
118      x_python_keyword ("dngettext:2,3");
119      x_python_keyword ("_");
120      default_keywords = false;
121    }
122}
123
/* Record the format-string flags of the built-in Python keywords, one
   xgettext_record_flag () call per "keyword:argument:flag" entry.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
      /* "%:1:python-format" is deliberately not recorded:
	 % is an infix operator!  */
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
139
140
141/* ======================== Reading of characters.  ======================== */
142
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is incremented by phase1_getc () and decremented by
   phase1_ungetc () for each newline that passes through them.  */
static char *logical_file_name;
static int line_number;

/* The input file stream, read by phase1_getc ().  */
static FILE *fp;
152
153
154/* 1. line_number handling.  */
155
/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes: phase1_getc () returns the most recently
   pushed byte first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently held in phase1_pushback.  */
static int phase1_pushback_length;
160
161/* Read the next single byte from the input file.  */
162static int
163phase1_getc ()
164{
165  int c;
166
167  if (phase1_pushback_length)
168    c = phase1_pushback[--phase1_pushback_length];
169  else
170    {
171      c = getc (fp);
172
173      if (c == EOF)
174	{
175	  if (ferror (fp))
176	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
177		   real_file_name);
178	  return EOF;
179	}
180    }
181
182  if (c == '\n')
183    ++line_number;
184
185  return c;
186}
187
188/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
189static void
190phase1_ungetc (int c)
191{
192  if (c != EOF)
193    {
194      if (c == '\n')
195	--line_number;
196
197      if (phase1_pushback_length == SIZEOF (phase1_pushback))
198	abort ();
199      phase1_pushback[phase1_pushback_length++] = c;
200    }
201}
202
203
204/* Phase 2: Conversion to Unicode.
205   This is done early because PEP 0263 specifies that conversion to Unicode
206   conceptually occurs before tokenization.  A test case where it matters
207   is with encodings like BIG5: when a double-byte character ending in 0x5C
208   is followed by '\' or 'u0021', the tokenizer must not treat the second
209   half of the double-byte character as a backslash.  */
210
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Stack of characters pushed back by phase2_ungetc (); phase2_getc ()
   returns the most recently pushed character first.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
/* Number of characters currently held in phase2_pushback.  */
static int phase2_pushback_length;
216
/* Read the next Unicode UCS-4 character from the input file.
   A character handed to phase2_ungetc () is returned first.  Otherwise
   bytes are taken from phase1_getc () and decoded according to
   xgettext_current_source_encoding:
     - ASCII: bytes are passed through; a non-ASCII byte is a fatal error.
     - UTF-8: a sequence of up to 6 bytes is collected and decoded.
     - anything else: bytes are fed to iconv one at a time until exactly
       one character has been converted; conversion problems are fatal.
   Returns UEOF at end of input.  */
static int
phase2_getc ()
{
  /* Pushed-back characters take precedence over new input.  */
  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  if (xgettext_current_source_encoding == po_charset_ascii)
    {
      int c = phase1_getc ();
      if (c == EOF)
	return UEOF;
      if (!c_isascii (c))
	{
	  /* Holds ":" followed by the decimal line number.  */
	  char buffer[21];
	  sprintf (buffer, ":%ld", (long) line_number);
	  multiline_error (xstrdup (""),
			   xasprintf (_("\
Non-ASCII string at %s%s.\n\
Please specify the source encoding through --from-code or through a comment\n\
as specified in http://www.python.org/peps/pep-0263.html.\n"),
			   real_file_name, buffer));
	  exit (EXIT_FAILURE);
	}
      return c;
    }
  else if (xgettext_current_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      /* Use iconv on an increasing number of bytes.  Read only as many bytes
	 through phase1_getc as needed.  This is needed to give reasonable
	 interactive behaviour when fp is connected to an interactive tty.  */
      unsigned char buf[MAX_PHASE1_PUSHBACK];
      size_t bufcount;
      int c = phase1_getc ();
      if (c == EOF)
	return UEOF;
      buf[0] = (unsigned char) c;
      bufcount = 1;

      /* Each iteration retries the conversion on buf[0..bufcount-1]; the
	 loop is left only by a return or a fatal error.  */
      for (;;)
	{
	  unsigned char scratchbuf[6];
	  const char *inptr = (const char *) &buf[0];
	  size_t insize = bufcount;
	  char *outptr = (char *) &scratchbuf[0];
	  size_t outsize = sizeof (scratchbuf);

	  size_t res = iconv (xgettext_current_source_iconv,
			      (ICONV_CONST char **) &inptr, &insize,
			      &outptr, &outsize);
	  /* We expect that a character has been produced if and only if
	     some input bytes have been consumed.  */
	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
	    abort ();
	  if (outsize == sizeof (scratchbuf))
	    {
	      /* No character has been produced.  Must be an error.  */
	      if (res != (size_t)(-1))
		abort ();

	      if (errno == EILSEQ)
		{
		  /* An invalid multibyte sequence was encountered.  */
		  multiline_error (xstrdup (""),
				   xasprintf (_("\
%s:%d: Invalid multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
				   real_file_name, line_number));
		  exit (EXIT_FAILURE);
		}
	      else if (errno == EINVAL)
		{
		  /* An incomplete multibyte character.  */
		  /* Note: this c shadows the c declared at the top of this
		     branch.  */
		  int c;

		  if (bufcount == MAX_PHASE1_PUSHBACK)
		    {
		      /* An overlong incomplete multibyte sequence was
			 encountered.  */
		      multiline_error (xstrdup (""),
				       xasprintf (_("\
%s:%d: Long incomplete multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
				       real_file_name, line_number));
		      exit (EXIT_FAILURE);
		    }

		  /* Read one more byte and retry iconv.  */
		  c = phase1_getc ();
		  if (c == EOF)
		    {
		      multiline_error (xstrdup (""),
				       xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of file.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
				       real_file_name, line_number));
		      exit (EXIT_FAILURE);
		    }
		  if (c == '\n')
		    {
		      /* phase1_getc has already counted this newline, hence
			 the "- 1" in the reported line number.  */
		      multiline_error (xstrdup (""),
				       xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of line.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
				       real_file_name, line_number - 1));
		      exit (EXIT_FAILURE);
		    }
		  buf[bufcount++] = (unsigned char) c;
		}
	      else
		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
		       real_file_name, line_number);
	    }
	  else
	    {
	      size_t outbytes = sizeof (scratchbuf) - outsize;
	      size_t bytes = bufcount - insize;
	      unsigned int uc;

	      /* We expect that one character has been produced.  */
	      if (bytes == 0)
		abort ();
	      if (outbytes == 0)
		abort ();
	      /* Push back the unused bytes.  */
	      while (insize > 0)
		phase1_ungetc (buf[--insize]);
	      /* Convert the character from UTF-8 to UCS-4.  */
	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
		{
		  /* scratchbuf contains an out-of-range Unicode character
		     (> 0x10ffff).  */
		  multiline_error (xstrdup (""),
				   xasprintf (_("\
%s:%d: Invalid multibyte sequence.\n\
Please specify the source encoding through --from-code or through a comment\n\
as specified in http://www.python.org/peps/pep-0263.html.\n"),
				   real_file_name, line_number));
		  exit (EXIT_FAILURE);
		}
	      return uc;
	    }
	}
#else
      /* If we don't have iconv(), the only supported values for
	 xgettext_global_source_encoding and thus also for
	 xgettext_current_source_encoding are ASCII and UTF-8.  */
      abort ();
#endif
    }
  else
    {
      /* Read an UTF-8 encoded character.  */
      /* The lead byte determines how many bytes are wanted.  A further byte
	 is read only while every byte after the lead byte read so far is a
	 continuation byte, i.e. lies in 0x80..0xBF, which is what
	 (b ^ 0x80) < 0x40 tests.  If the input ends inside a sequence, UEOF
	 is returned and the bytes read so far are dropped.  */
      unsigned char buf[6];
      unsigned int count;
      int c;
      unsigned int uc;

      c = phase1_getc ();
      if (c == EOF)
	return UEOF;
      buf[0] = c;
      count = 1;

      if (buf[0] >= 0xc0)
	{
	  c = phase1_getc ();
	  if (c == EOF)
	    return UEOF;
	  buf[1] = c;
	  count = 2;
	}

      if (buf[0] >= 0xe0
	  && ((buf[1] ^ 0x80) < 0x40))
	{
	  c = phase1_getc ();
	  if (c == EOF)
	    return UEOF;
	  buf[2] = c;
	  count = 3;
	}

      if (buf[0] >= 0xf0
	  && ((buf[1] ^ 0x80) < 0x40)
	  && ((buf[2] ^ 0x80) < 0x40))
	{
	  c = phase1_getc ();
	  if (c == EOF)
	    return UEOF;
	  buf[3] = c;
	  count = 4;
	}

      if (buf[0] >= 0xf8
	  && ((buf[1] ^ 0x80) < 0x40)
	  && ((buf[2] ^ 0x80) < 0x40)
	  && ((buf[3] ^ 0x80) < 0x40))
	{
	  c = phase1_getc ();
	  if (c == EOF)
	    return UEOF;
	  buf[4] = c;
	  count = 5;
	}

      if (buf[0] >= 0xfc
	  && ((buf[1] ^ 0x80) < 0x40)
	  && ((buf[2] ^ 0x80) < 0x40)
	  && ((buf[3] ^ 0x80) < 0x40)
	  && ((buf[4] ^ 0x80) < 0x40))
	{
	  c = phase1_getc ();
	  if (c == EOF)
	    return UEOF;
	  buf[5] = c;
	  count = 6;
	}

      /* NOTE(review): the return value of u8_mbtouc is ignored here;
	 presumably it stores a replacement character in uc when the bytes
	 are not valid UTF-8 -- confirm against its documentation.  */
      u8_mbtouc (&uc, buf, count);
      return uc;
    }
}
445
446/* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
447static void
448phase2_ungetc (int c)
449{
450  if (c != UEOF)
451    {
452      if (phase2_pushback_length == SIZEOF (phase2_pushback))
453	abort ();
454      phase2_pushback[phase2_pushback_length++] = c;
455    }
456}
457
458
459/* ========================= Accumulating strings.  ======================== */
460
/* A string buffer type that allows appending Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct unicode_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  char *utf8_buffer;		/* storage, or NULL before first use */
  size_t utf8_buflen;		/* number of bytes in use */
  size_t utf8_allocated;	/* number of bytes allocated */
};
471
472/* Initialize a 'struct unicode_string_buffer' to empty.  */
473static inline void
474init_unicode_string_buffer (struct unicode_string_buffer *bp)
475{
476  bp->utf8_buffer = NULL;
477  bp->utf8_buflen = 0;
478  bp->utf8_allocated = 0;
479}
480
481/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
482static inline void
483unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
484					   size_t count)
485{
486  if (bp->utf8_buflen + count > bp->utf8_allocated)
487    {
488      size_t new_allocated = 2 * bp->utf8_allocated + 10;
489      if (new_allocated < bp->utf8_buflen + count)
490	new_allocated = bp->utf8_buflen + count;
491      bp->utf8_allocated = new_allocated;
492      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
493    }
494}
495
496/* Auxiliary function: Append a Unicode character to bp->utf8.
497   uc must be < 0x110000.  */
498static inline void
499unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
500				      unsigned int uc)
501{
502  unsigned char utf8buf[6];
503  int count = u8_uctomb (utf8buf, uc, 6);
504
505  if (count < 0)
506    /* The caller should have ensured that uc is not out-of-range.  */
507    abort ();
508
509  unicode_string_buffer_append_unicode_grow (bp, count);
510  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
511  bp->utf8_buflen += count;
512}
513
514/* Return the string buffer's contents.  */
515static char *
516unicode_string_buffer_result (struct unicode_string_buffer *bp)
517{
518  /* NUL-terminate it.  */
519  unicode_string_buffer_append_unicode_grow (bp, 1);
520  bp->utf8_buffer[bp->utf8_buflen] = '\0';
521  /* Return it.  */
522  return bp->utf8_buffer;
523}
524
/* Free the memory pointed to by a 'struct unicode_string_buffer'.
   Only utf8_buffer is released; the fields of *bp are left unchanged.  */
static inline void
free_unicode_string_buffer (struct unicode_string_buffer *bp)
{
  free (bp->utf8_buffer);
}
531
532
533/* ======================== Accumulating comments.  ======================== */
534
535
/* Accumulating a single comment line.  */

/* Buffer shared by comment_start (), comment_at_start (), comment_add ()
   and comment_line_end ().  */
static struct unicode_string_buffer comment_buffer;
539
/* Begin a new comment line: mark comment_buffer as empty.  The storage
   already attached to it is kept.  */
static inline void
comment_start ()
{
  comment_buffer.utf8_buflen = 0;
}
545
/* Return true if nothing has been added to the current comment line yet.  */
static inline bool
comment_at_start ()
{
  return (comment_buffer.utf8_buflen == 0);
}
551
/* Append the Unicode character c to the current comment line.  */
static inline void
comment_add (int c)
{
  unicode_string_buffer_append_unicode (&comment_buffer, c);
}
557
558static inline const char *
559comment_line_end ()
560{
561  char *buffer = unicode_string_buffer_result (&comment_buffer);
562  size_t buflen = strlen (buffer);
563
564  while (buflen >= 1
565	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
566    --buflen;
567  buffer[buflen] = '\0';
568  savable_comment_add (buffer);
569  return buffer;
570}
571
572
/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is set by phase3_getc () to the line on
   which a comment starts.  */
static int last_comment_line;
static int last_non_comment_line;
577
578
579/* ======================== Recognizing comments.  ======================== */
580
581
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {space or tab}* {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - contains no other tokens except this comment.  */
589
/* Canonicalized encoding name for the current input file.
   Assigned by set_current_file_source_encoding ().  */
static const char *xgettext_current_file_source_encoding;

#if HAVE_ICONV
/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
   ASCII or UTF-8, when this conversion is a no-op).  */
static iconv_t xgettext_current_file_source_iconv;
#endif
598
599static inline void
600set_current_file_source_encoding (const char *canon_encoding)
601{
602  xgettext_current_file_source_encoding = canon_encoding;
603
604  if (xgettext_current_file_source_encoding != po_charset_ascii
605      && xgettext_current_file_source_encoding != po_charset_utf8)
606    {
607#if HAVE_ICONV
608      iconv_t cd;
609
610      /* Avoid glibc-2.1 bug with EUC-KR.  */
611# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
612      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
613	cd = (iconv_t)(-1);
614      else
615# endif
616      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
617      if (cd == (iconv_t)(-1))
618	error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
619Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
620and iconv() does not support this conversion."),
621	       xgettext_current_file_source_encoding, po_charset_utf8,
622	       basename (program_name));
623      xgettext_current_file_source_iconv = cd;
624#else
625      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
626Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
627This version was built without iconv()."),
628	     xgettext_global_source_encoding, po_charset_utf8,
629	     basename (program_name));
630#endif
631    }
632
633  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
634#if HAVE_ICONV
635  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
636#endif
637}
638
639static inline void
640try_to_extract_coding (const char *comment)
641{
642  const char *p = c_strstr (comment, "coding");
643
644  if (p != NULL)
645    {
646      p += 6;
647      if (*p == ':' || *p == '=')
648	{
649	  p++;
650	  while (*p == ' ' || *p == '\t')
651	    p++;
652	  {
653	    const char *encoding_start = p;
654
655	    while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
656	      p++;
657	    {
658	      const char *encoding_end = p;
659
660	      if (encoding_end > encoding_start)
661		{
662		  /* Extract the encoding string.  */
663		  size_t encoding_len = encoding_end - encoding_start;
664		  char *encoding = (char *) xmalloc (encoding_len + 1);
665
666		  memcpy (encoding, encoding_start, encoding_len);
667		  encoding[encoding_len] = '\0';
668
669		  {
670		    /* Canonicalize it.  */
671		    const char *canon_encoding = po_charset_canonicalize (encoding);
672		    if (canon_encoding == NULL)
673		      {
674			error_at_line (0, 0,
675				       logical_file_name, line_number - 1, _("\
676Unknown encoding \"%s\". Proceeding with ASCII instead."),
677				       encoding);
678		        canon_encoding = po_charset_ascii;
679		      }
680
681		    /* Activate it.  */
682		    set_current_file_source_encoding (canon_encoding);
683		  }
684
685		  free (encoding);
686		}
687	    }
688	  }
689	}
690    }
691}
692
/* Tracking whether the current line is a continuation line or contains a
   non-blank character.  Maintained by phase3_getc (): set after a
   backslash-newline or a character other than space, tab and form feed,
   and cleared at a newline or after a comment.  */
static bool continuation_or_nonblank_line = false;
696
697
698/* Phase 3: Outside strings, replace backslash-newline with nothing and a
699   comment with nothing.  */
700
701static int
702phase3_getc ()
703{
704  int c;
705
706  for (;;)
707    {
708      c = phase2_getc ();
709      if (c == '\\')
710	{
711	  c = phase2_getc ();
712	  if (c != '\n')
713	    {
714	      phase2_ungetc (c);
715	      /* This shouldn't happen usually, because "A backslash is
716		 illegal elsewhere on a line outside a string literal."  */
717	      return '\\';
718	    }
719	  /* Eat backslash-newline.  */
720	  continuation_or_nonblank_line = true;
721	}
722      else if (c == '#')
723	{
724	  /* Eat a comment.  */
725	  const char *comment;
726
727	  last_comment_line = line_number;
728	  comment_start ();
729	  for (;;)
730	    {
731	      c = phase2_getc ();
732	      if (c == UEOF || c == '\n')
733		break;
734	      /* We skip all leading white space, but not EOLs.  */
735	      if (!(comment_at_start () && (c == ' ' || c == '\t')))
736		comment_add (c);
737	    }
738	  comment = comment_line_end ();
739	  if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
740	    try_to_extract_coding (comment);
741	  continuation_or_nonblank_line = false;
742	  return c;
743	}
744      else
745	{
746	  if (c == '\n')
747	    continuation_or_nonblank_line = false;
748	  else if (!(c == ' ' || c == '\t' || c == '\f'))
749	    continuation_or_nonblank_line = true;
750	  return c;
751	}
752    }
753}
754
/* Supports only one pushback character.
   The character is handed to phase2_ungetc (), so it is seen again by the
   next phase2_getc () call.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
761
762
763/* ========================= Accumulating strings.  ======================== */
764
/* Return value of phase7_getuc when EOF is reached.  */
#define P7_EOF (-1)
/* Return value of phase7_getuc when the end of the string is reached.  */
#define P7_STRING_END (-2)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getuc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
780
/* A string buffer type that allows appending bytes (in the
   xgettext_current_source_encoding) or Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct mixed_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character.
     0 when no first half is pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.
     It is converted and moved to utf8_buffer by
     mixed_string_buffer_flush_curr_buffer ().  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
798
799/* Initialize a 'struct mixed_string_buffer' to empty.  */
800static inline void
801init_mixed_string_buffer (struct mixed_string_buffer *bp)
802{
803  bp->utf8_buffer = NULL;
804  bp->utf8_buflen = 0;
805  bp->utf8_allocated = 0;
806  bp->utf16_surr = 0;
807  bp->curr_buffer = NULL;
808  bp->curr_buflen = 0;
809  bp->curr_allocated = 0;
810}
811
812/* Auxiliary function: Append a byte to bp->curr.  */
813static inline void
814mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
815{
816  if (bp->curr_buflen == bp->curr_allocated)
817    {
818      bp->curr_allocated = 2 * bp->curr_allocated + 10;
819      bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
820    }
821  bp->curr_buffer[bp->curr_buflen++] = c;
822}
823
824/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
825static inline void
826mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
827{
828  if (bp->utf8_buflen + count > bp->utf8_allocated)
829    {
830      size_t new_allocated = 2 * bp->utf8_allocated + 10;
831      if (new_allocated < bp->utf8_buflen + count)
832	new_allocated = bp->utf8_buflen + count;
833      bp->utf8_allocated = new_allocated;
834      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
835    }
836}
837
838/* Auxiliary function: Append a Unicode character to bp->utf8.
839   uc must be < 0x110000.  */
840static inline void
841mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
842{
843  unsigned char utf8buf[6];
844  int count = u8_uctomb (utf8buf, uc, 6);
845
846  if (count < 0)
847    /* The caller should have ensured that uc is not out-of-range.  */
848    abort ();
849
850  mixed_string_buffer_append_unicode_grow (bp, count);
851  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
852  bp->utf8_buflen += count;
853}
854
855/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
856static inline void
857mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
858{
859  if (bp->utf16_surr != 0)
860    {
861      /* A half surrogate is invalid, therefore use U+FFFD instead.  */
862      mixed_string_buffer_append_unicode (bp, 0xfffd);
863      bp->utf16_surr = 0;
864    }
865}
866
867/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
868static inline void
869mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
870{
871  if (bp->curr_buflen > 0)
872    {
873      char *curr;
874      size_t count;
875
876      mixed_string_buffer_append_byte (bp, '\0');
877
878      /* Convert from the source encoding to UTF-8.  */
879      curr = from_current_source_encoding (bp->curr_buffer,
880					   logical_file_name, lineno);
881
882      /* Append it to bp->utf8_buffer.  */
883      count = strlen (curr);
884      mixed_string_buffer_append_unicode_grow (bp, count);
885      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
886      bp->utf8_buflen += count;
887
888      if (curr != bp->curr_buffer)
889	free (curr);
890      bp->curr_buflen = 0;
891    }
892}
893
894/* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
895static void
896mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
897{
898  if (IS_UNICODE (c))
899    {
900      /* Append a Unicode character.  */
901
902      /* Switch from multibyte character mode to Unicode character mode.  */
903      mixed_string_buffer_flush_curr_buffer (bp, line_number);
904
905      /* Test whether this character and the previous one form a Unicode
906	 surrogate character pair.  */
907      if (bp->utf16_surr != 0
908	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
909	{
910	  unsigned short utf16buf[2];
911	  unsigned int uc;
912
913	  utf16buf[0] = bp->utf16_surr;
914	  utf16buf[1] = UNICODE_VALUE (c);
915	  if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
916	    abort ();
917
918	  mixed_string_buffer_append_unicode (bp, uc);
919	  bp->utf16_surr = 0;
920	}
921      else
922	{
923	  mixed_string_buffer_flush_utf16_surr (bp);
924
925	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
926	    bp->utf16_surr = UNICODE_VALUE (c);
927	  else
928	    mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
929	}
930    }
931  else
932    {
933      /* Append a single byte.  */
934
935      /* Switch from Unicode character mode to multibyte character mode.  */
936      mixed_string_buffer_flush_utf16_surr (bp);
937
938      /* When a newline is seen, convert the accumulated multibyte sequence.
939	 This ensures a correct line number in the error message in case of
940	 a conversion error.  The "- 1" is to account for the newline.  */
941      if (c == '\n')
942	mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
943
944      mixed_string_buffer_append_byte (bp, (unsigned char) c);
945    }
946}
947
948/* Return the string buffer's contents.  */
949static char *
950mixed_string_buffer_result (struct mixed_string_buffer *bp)
951{
952  /* Flush all into bp->utf8_buffer.  */
953  mixed_string_buffer_flush_utf16_surr (bp);
954  mixed_string_buffer_flush_curr_buffer (bp, line_number);
955  /* NUL-terminate it.  */
956  mixed_string_buffer_append_unicode_grow (bp, 1);
957  bp->utf8_buffer[bp->utf8_buflen] = '\0';
958  /* Return it.  */
959  return bp->utf8_buffer;
960}
961
/* Free the memory pointed to by a 'struct mixed_string_buffer'.
   Both the UTF-8 part and the source-encoding part are released; the
   fields of *bp are left unchanged.  */
static inline void
free_mixed_string_buffer (struct mixed_string_buffer *bp)
{
  free (bp->utf8_buffer);
  free (bp->curr_buffer);
}
969
970
971/* ========================== Reading of tokens.  ========================== */
972
973
/* The kinds of token.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_string,		/* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,		/* symbol, number */
  token_type_other		/* misc. operator */
};
typedef enum token_type_ty token_type_ty;
985
/* A token: its kind plus the data that only some kinds carry.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_string, token_type_symbol */
  refcounted_string_list_ty *comment;	/* for token_type_string */
  int line_number;
};
994
995
996/* There are two different input syntaxes for strings, "abc" and r"abc",
997   and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
998   Which escape sequences are understood, i.e. what is interpreted specially
999   after backslash?
1000    "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1001    r"abc"
1002    u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1003    ur"abc"                                           \unnnn
1004   The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1005   \unnnn items.  The \ooo and \xnn values are in the current source encoding.
1006 */
1007
/* Return the next element of the string literal currently being read.
   QUOTE_CHAR is the delimiter character of the literal, TRIPLE tells
   whether the literal is triple-quoted.  INTERPRET_ANSIC enables the
   backslash-newline, backslash-backslash, quote, a b f n r t v, octal and
   hexadecimal escapes.  INTERPRET_UNICODE enables the u escape and, only
   together with INTERPRET_ANSIC, the U and N{...} escapes.
   *BACKSLASH_COUNTER counts the backslashes returned in a row: it is
   incremented whenever a backslash is returned and reset to 0 when a
   newline, another non-quote character or a decoded escape sequence is
   returned.  Its parity decides whether a quote or newline is treated as
   escaped when INTERPRET_ANSIC is false.
   Return value:
     P7_EOF         at end of input,
     P7_STRING_END  at the closing delimiter, or (after a warning) at a
                    newline that ends a non-triple-quoted string,
     UNICODE (c)    for an ordinary character or a decoded escape,
     a value 0..255 not wrapped in UNICODE for octal and hex escapes.  */
static int
phase7_getuc (int quote_char,
	      bool triple, bool interpret_ansic, bool interpret_unicode,
	      unsigned int *backslash_counter)
{
  int c;

  for (;;)
    {
      /* Use phase 2, because phase 3 elides comments.  */
      c = phase2_getc ();

      if (c == UEOF)
	return P7_EOF;

      /* A quote character ends the string, unless escape processing is
	 off and it is preceded by an odd number of backslashes.  */
      if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
	{
	  if (triple)
	    {
	      /* Only three quote characters in a row end a triple-quoted
		 string; otherwise push back the lookahead and return the
		 quote as an ordinary character.  */
	      int c1 = phase2_getc ();
	      if (c1 == quote_char)
		{
		  int c2 = phase2_getc ();
		  if (c2 == quote_char)
		    return P7_STRING_END;
		  phase2_ungetc (c2);
		}
	      phase2_ungetc (c1);
	      return UNICODE (c);
	    }
	  else
	    return P7_STRING_END;
	}

      if (c == '\n')
	{
	  if (triple)
	    {
	      *backslash_counter = 0;
	      return UNICODE ('\n');
	    }
	  /* In r"..." and ur"..." strings, newline is only allowed
	     immediately after an odd number of backslashes (although the
	     backslashes are not interpreted!).  */
	  if (!(interpret_ansic || (*backslash_counter & 1) == 0))
	    {
	      *backslash_counter = 0;
	      return UNICODE ('\n');
	    }
	  /* Unescaped newline in a single-line string: leave the newline in
	     the input, warn, and report the end of the string.  */
	  phase2_ungetc (c);
	  error_with_progname = false;
	  error (0, 0, _("%s:%d: warning: unterminated string"),
		 logical_file_name, line_number);
	  error_with_progname = true;
	  return P7_STRING_END;
	}

      if (c != '\\')
	{
	  *backslash_counter = 0;
	  return UNICODE (c);
	}

      /* Backslash handling.  */

      /* With both kinds of escape processing disabled, a backslash is just
	 a character; only its count matters.  */
      if (!interpret_ansic && !interpret_unicode)
	{
	  ++*backslash_counter;
	  return UNICODE ('\\');
	}

      /* Dispatch according to the character following the backslash.  */
      c = phase2_getc ();
      if (c == UEOF)
	{
	  ++*backslash_counter;
	  return UNICODE ('\\');
	}

      if (interpret_ansic)
	switch (c)
	  {
	  case '\n':
	    /* Backslash-newline: both characters are dropped.  */
	    continue;
	  case '\\':
	    ++*backslash_counter;
	    return UNICODE (c);
	  case '\'': case '"':
	    *backslash_counter = 0;
	    return UNICODE (c);
	  case 'a':
	    *backslash_counter = 0;
	    return UNICODE ('\a');
	  case 'b':
	    *backslash_counter = 0;
	    return UNICODE ('\b');
	  case 'f':
	    *backslash_counter = 0;
	    return UNICODE ('\f');
	  case 'n':
	    *backslash_counter = 0;
	    return UNICODE ('\n');
	  case 'r':
	    *backslash_counter = 0;
	    return UNICODE ('\r');
	  case 't':
	    *backslash_counter = 0;
	    return UNICODE ('\t');
	  case 'v':
	    *backslash_counter = 0;
	    return UNICODE ('\v');
	  case '0': case '1': case '2': case '3': case '4':
	  case '5': case '6': case '7':
	    /* Octal escape: one to three octal digits.  */
	    {
	      int n = c - '0';

	      c = phase2_getc ();
	      if (c != UEOF)
		{
		  if (c >= '0' && c <= '7')
		    {
		      n = (n << 3) + (c - '0');
		      c = phase2_getc ();
		      if (c != UEOF)
			{
			  if (c >= '0' && c <= '7')
			    n = (n << 3) + (c - '0');
			  else
			    phase2_ungetc (c);
			}
		    }
		  else
		    phase2_ungetc (c);
		}
	      *backslash_counter = 0;
	      /* Returned as a plain byte value, not wrapped in UNICODE ();
		 values above 0377 are truncated by the cast.  */
	      return (unsigned char) n;
	    }
	  case 'x':
	    /* Hexadecimal escape: exactly two hex digits are required.  */
	    {
	      int c1 = phase2_getc ();
	      int n1;

	      if (c1 >= '0' && c1 <= '9')
		n1 = c1 - '0';
	      else if (c1 >= 'A' && c1 <= 'F')
		n1 = c1 - 'A' + 10;
	      else if (c1 >= 'a' && c1 <= 'f')
		n1 = c1 - 'a' + 10;
	      else
		n1 = -1;

	      if (n1 >= 0)
		{
		  int c2 = phase2_getc ();
		  int n2;

		  if (c2 >= '0' && c2 <= '9')
		    n2 = c2 - '0';
		  else if (c2 >= 'A' && c2 <= 'F')
		    n2 = c2 - 'A' + 10;
		  else if (c2 >= 'a' && c2 <= 'f')
		    n2 = c2 - 'a' + 10;
		  else
		    n2 = -1;

		  if (n2 >= 0)
		    {
		      *backslash_counter = 0;
		      /* Plain byte value, not wrapped in UNICODE ().  */
		      return (unsigned char) ((n1 << 4) + n2);
		    }

		  phase2_ungetc (c2);
		}
	      /* Not a valid escape: push back what was read, in reverse
		 order, and return the backslash as an ordinary character.  */
	      phase2_ungetc (c1);
	      phase2_ungetc (c);
	      ++*backslash_counter;
	      return UNICODE ('\\');
	    }
	  }

      if (interpret_unicode)
	{
	  if (c == 'u')
	    {
	      /* Exactly four hex digits are required.  BUF remembers the
		 digits read so far so that they can be pushed back.  */
	      unsigned char buf[4];
	      unsigned int n = 0;
	      int i;

	      for (i = 0; i < 4; i++)
		{
		  int c1 = phase2_getc ();

		  if (c1 >= '0' && c1 <= '9')
		    n = (n << 4) + (c1 - '0');
		  else if (c1 >= 'A' && c1 <= 'F')
		    n = (n << 4) + (c1 - 'A' + 10);
		  else if (c1 >= 'a' && c1 <= 'f')
		    n = (n << 4) + (c1 - 'a' + 10);
		  else
		    {
		      /* Too few digits: undo the reads and return the
			 backslash as an ordinary character.  */
		      phase2_ungetc (c1);
		      while (--i >= 0)
			phase2_ungetc (buf[i]);
		      phase2_ungetc (c);
		      ++*backslash_counter;
		      return UNICODE ('\\');
		    }

		  buf[i] = c1;
		}
	      *backslash_counter = 0;
	      return UNICODE (n);
	    }

	  /* The remaining Unicode escapes are not recognized when ANSI C
	     escape processing is off.  */
	  if (interpret_ansic)
	    {
	      if (c == 'U')
		{
		  /* Exactly eight hex digits are required.  */
		  unsigned char buf[8];
		  unsigned int n = 0;
		  int i;

		  for (i = 0; i < 8; i++)
		    {
		      int c1 = phase2_getc ();

		      if (c1 >= '0' && c1 <= '9')
			n = (n << 4) + (c1 - '0');
		      else if (c1 >= 'A' && c1 <= 'F')
			n = (n << 4) + (c1 - 'A' + 10);
		      else if (c1 >= 'a' && c1 <= 'f')
			n = (n << 4) + (c1 - 'a' + 10);
		      else
			{
			  phase2_ungetc (c1);
			  while (--i >= 0)
			    phase2_ungetc (buf[i]);
			  phase2_ungetc (c);
			  ++*backslash_counter;
			  return UNICODE ('\\');
			}

		      buf[i] = c1;
		    }
		  if (n < 0x110000)
		    {
		      *backslash_counter = 0;
		      return UNICODE (n);
		    }

		  /* Eight digits were read but the value is not below
		     0x110000: warn, then treat the sequence as ordinary
		     characters.  */
		  error_with_progname = false;
		  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
			 logical_file_name, line_number);
		  error_with_progname = true;

		  while (--i >= 0)
		    phase2_ungetc (buf[i]);
		  phase2_ungetc (c);
		  ++*backslash_counter;
		  return UNICODE ('\\');
		}

	      if (c == 'N')
		{
		  int c1 = phase2_getc ();
		  if (c1 == '{')
		    {
		      /* Collect the character name, at most UNINAME_MAX
			 printable ASCII characters, up to the closing
			 brace.  */
		      unsigned char buf[UNINAME_MAX + 1];
		      int i;
		      unsigned int n;

		      for (i = 0; i < UNINAME_MAX; i++)
			{
			  int c2 = phase2_getc ();
			  if (!(c2 >= ' ' && c2 <= '~'))
			    {
			      phase2_ungetc (c2);
			      while (--i >= 0)
				phase2_ungetc (buf[i]);
			      phase2_ungetc (c1);
			      phase2_ungetc (c);
			      ++*backslash_counter;
			      return UNICODE ('\\');
			    }
			  if (c2 == '}')
			    break;
			  buf[i] = c2;
			}
		      buf[i] = '\0';

		      n = unicode_name_character ((char *) buf);
		      if (n != UNINAME_INVALID)
			{
			  *backslash_counter = 0;
			  return UNICODE (n);
			}

		      /* Unknown name: push the brace and the name back.
			 NOTE(review): if the loop above ended because
			 UNINAME_MAX characters were read, no closing brace
			 was consumed, yet one is pushed back here.  */
		      phase2_ungetc ('}');
		      while (--i >= 0)
			phase2_ungetc (buf[i]);
		    }
		  phase2_ungetc (c1);
		  phase2_ungetc (c);
		  ++*backslash_counter;
		  return UNICODE ('\\');
		}
	    }
	}

      /* Unrecognized escape: keep the following character in the input and
	 return the backslash as an ordinary character.  */
      phase2_ungetc (c);
      ++*backslash_counter;
      return UNICODE ('\\');
    }
}
1322
1323
1324/* Combine characters into tokens.  Discard whitespace except newlines at
1325   the end of logical lines.  */
1326
/* Number of pending open parentheses/braces/brackets.
   While it is positive, phase5_get skips newlines (implicit line
   joining).  */
static int open_pbb;

/* Tokens handed back through phase5_unget, to be returned again by the
   next phase5_get.  The array has room for a single token;
   phase5_pushback_length is the number of tokens currently stored.  */
static token_ty phase5_pushback[1];
static int phase5_pushback_length;
1332
/* Read the next token into *TP.
   A token previously handed to phase5_unget is returned first.  Otherwise
   characters are read through phase3_getc.  Space, tab and form feed are
   skipped.  A newline is skipped while open_pbb > 0 and returned as
   token_type_other otherwise.  Identifiers and numbers become
   token_type_symbol with a freshly allocated TP->string.  String literals,
   optionally prefixed by r, u or ur in either case, become
   token_type_string with a freshly allocated TP->string and a reference to
   savable_comment in TP->comment.  '(' ')' and ',' have token types of
   their own; everything else is token_type_other.  End of input yields
   token_type_eof.  TP->line_number is set from line_number just before the
   first character of the token is read.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }

  for (;;)
    {
      tp->line_number = line_number;
      c = phase3_getc ();

      switch (c)
	{
	case UEOF:
	  tp->type = token_type_eof;
	  return;

	case ' ':
	case '\t':
	case '\f':
	  /* Ignore whitespace and comments.  */
	  continue;

	case '\n':
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* Ignore newline if and only if it is used for implicit line
	     joining.  */
	  if (open_pbb > 0)
	    continue;
	  tp->type = token_type_other;
	  return;
	}

      /* Anything else starts a real token on this line.  */
      last_non_comment_line = tp->line_number;

      switch (c)
	{
	case '.':
	  /* A dot starts a number only if a digit follows; the lookahead
	     character is always pushed back.  */
	  {
	    int c1 = phase3_getc ();
	    phase3_ungetc (c1);
	    if (!(c1 >= '0' && c1 <= '9'))
	      {

		tp->type = token_type_other;
		return;
	      }
	  }
	  /* FALLTHROUGH */
	/* The letters R, U, r and u are missing from this list because they
	   may start a prefixed string literal; they are handled below and
	   jump to the label 'symbol' when no quote follows.  */
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
	case 'M': case 'N': case 'O': case 'P': case 'Q':
	case 'S': case 'T':           case 'V': case 'W': case 'X':
	case 'Y': case 'Z':
	case '_':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
	case 'm': case 'n': case 'o': case 'p': case 'q':
	case 's': case 't':           case 'v': case 'w': case 'x':
	case 'y': case 'z':
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	symbol:
	  /* Symbol, or part of a number.  */
	  {
	    /* Scratch buffer, kept and grown across calls.  */
	    static char *buffer;
	    static int bufmax;
	    int bufpos;

	    bufpos = 0;
	    for (;;)
	      {
		if (bufpos >= bufmax)
		  {
		    bufmax = 2 * bufmax + 10;
		    buffer = xrealloc (buffer, bufmax);
		  }
		buffer[bufpos++] = c;
		c = phase3_getc ();
		switch (c)
		  {
		  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
		  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
		  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
		  case 'Y': case 'Z':
		  case '_':
		  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
		  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
		  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
		  case 'y': case 'z':
		  case '0': case '1': case '2': case '3': case '4':
		  case '5': case '6': case '7': case '8': case '9':
		    continue;
		  default:
		    /* First character that does not belong to the symbol.  */
		    phase3_ungetc (c);
		    break;
		  }
		break;
	      }
	    /* Make room for the terminating NUL.  */
	    if (bufpos >= bufmax)
	      {
		bufmax = 2 * bufmax + 10;
		buffer = xrealloc (buffer, bufmax);
	      }
	    buffer[bufpos] = '\0';
	    tp->string = xstrdup (buffer);
	    tp->type = token_type_symbol;
	    return;
	  }

	/* Strings.  */
	  {
	    struct mixed_string_buffer literal;
	    int quote_char;
	    bool interpret_ansic;
	    bool interpret_unicode;
	    bool triple;
	    unsigned int backslash_counter;

	    /* Prefix r or R: a string without escape processing if a quote
	       follows, otherwise an ordinary symbol.  The lookahead is read
	       with phase2_getc, the same reader phase7_getuc uses for the
	       string contents.  */
	    case 'R': case 'r':
	      {
		int c1 = phase2_getc ();
		if (c1 == '"' || c1 == '\'')
		  {
		    quote_char = c1;
		    interpret_ansic = false;
		    interpret_unicode = false;
		    goto string;
		  }
		phase2_ungetc (c1);
		goto symbol;
	      }

	    /* Prefix u or U, optionally followed by r or R.  */
	    case 'U': case 'u':
	      {
		int c1 = phase2_getc ();
		if (c1 == '"' || c1 == '\'')
		  {
		    quote_char = c1;
		    interpret_ansic = true;
		    interpret_unicode = true;
		    goto string;
		  }
		if (c1 == 'R' || c1 == 'r')
		  {
		    int c2 = phase2_getc ();
		    if (c2 == '"' || c2 == '\'')
		      {
			quote_char = c2;
			interpret_ansic = false;
			interpret_unicode = true;
			goto string;
		      }
		    phase2_ungetc (c2);
		  }
		phase2_ungetc (c1);
		goto symbol;
	      }

	    case '"': case '\'':
	      quote_char = c;
	      interpret_ansic = true;
	      interpret_unicode = false;
	    string:
	      /* The opening quote has been consumed.  Two more quote
		 characters make the literal triple-quoted; otherwise the
		 lookahead is pushed back.  */
	      triple = false;
	      {
		int c1 = phase2_getc ();
		if (c1 == quote_char)
		  {
		    int c2 = phase2_getc ();
		    if (c2 == quote_char)
		      triple = true;
		    else
		      {
			phase2_ungetc (c2);
			phase2_ungetc (c1);
		      }
		  }
		else
		  phase2_ungetc (c1);
	      }
	      backslash_counter = 0;
	      /* Start accumulating the string.  */
	      init_mixed_string_buffer (&literal);
	      for (;;)
		{
		  int uc = phase7_getuc (quote_char, triple, interpret_ansic,
					 interpret_unicode, &backslash_counter);

		  if (uc == P7_EOF || uc == P7_STRING_END)
		    break;

		  if (IS_UNICODE (uc))
		    assert (UNICODE_VALUE (uc) >= 0
			    && UNICODE_VALUE (uc) < 0x110000);

		  mixed_string_buffer_append (&literal, uc);
		}
	      tp->string = xstrdup (mixed_string_buffer_result (&literal));
	      free_mixed_string_buffer (&literal);
	      /* The token carries its own reference to the comment.  */
	      tp->comment = add_reference (savable_comment);
	      tp->type = token_type_string;
	      return;
	  }

	case '(':
	  open_pbb++;
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  /* Never let the count go negative on unbalanced input.  */
	  if (open_pbb > 0)
	    open_pbb--;
	  tp->type = token_type_rparen;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '[': case '{':
	  open_pbb++;
	  tp->type = token_type_other;
	  return;

	case ']': case '}':
	  if (open_pbb > 0)
	    open_pbb--;
	  tp->type = token_type_other;
	  return;

	default:
	  /* We could carefully recognize each of the 2 and 3 character
	     operators, but it is not necessary, as we only need to recognize
	     gettext invocations.  Don't bother.  */
	  tp->type = token_type_other;
	  return;
	}
    }
}
1581
1582/* Supports only one pushback token.  */
1583static void
1584phase5_unget (token_ty *tp)
1585{
1586  if (tp->type != token_type_eof)
1587    {
1588      if (phase5_pushback_length == SIZEOF (phase5_pushback))
1589	abort ();
1590      phase5_pushback[phase5_pushback_length++] = *tp;
1591    }
1592}
1593
1594
1595/* Combine adjacent strings to form a single string.  Note that the end
1596   of a logical line appears as a token of its own, therefore strings that
1597   belong to different logical lines will not be concatenated.  */
1598
1599static void
1600x_python_lex (token_ty *tp)
1601{
1602  phase5_get (tp);
1603  if (tp->type != token_type_string)
1604    return;
1605  for (;;)
1606    {
1607      token_ty tmp;
1608      size_t len;
1609
1610      phase5_get (&tmp);
1611      if (tmp.type != token_type_string)
1612	{
1613	  phase5_unget (&tmp);
1614	  return;
1615	}
1616      len = strlen (tp->string);
1617      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1618      strcpy (tp->string + len, tmp.string);
1619      free (tmp.string);
1620    }
1621}
1622
1623
1624/* ========================= Extracting strings.  ========================== */
1625
1626
/* Context lookup table.
   Set by extract_python from its flag_table argument and consulted by
   extract_parenthesized for every symbol token, to obtain the context
   iterator used if an argument list follows that symbol.  */
static flag_context_list_table_ty *flag_context_list_table;
1629
1630
1631/* The file is broken into tokens.  Scan the token stream, looking for
1632   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1636
1637     Normal handling: Look for
1638       keyword ( ... msgid ... )
1639     Plural handling: Look for
1640       keyword ( ... msgid ... msgid_plural ... )
1641
1642   We use recursion because the arguments before msgid or between msgid
1643   and msgid_plural can contain subexpressions of the same form.  */
1644
1645
1646/* Extract messages until the next balanced closing parenthesis.
1647   Extracted messages are added to MLP.
1648   Return true upon eof, false upon closing parenthesis.  */
1649static bool
1650extract_parenthesized (message_list_ty *mlp,
1651		       flag_context_ty outer_context,
1652		       flag_context_list_iterator_ty context_iter,
1653		       struct arglist_parser *argparser)
1654{
1655  /* Current argument number.  */
1656  int arg = 1;
1657  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1658  int state;
1659  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1660  const struct callshapes *next_shapes = NULL;
1661  /* Context iterator that will be used if the next token is a '('.  */
1662  flag_context_list_iterator_ty next_context_iter =
1663    passthrough_context_list_iterator;
1664  /* Current context.  */
1665  flag_context_ty inner_context =
1666    inherited_context (outer_context,
1667		       flag_context_list_iterator_advance (&context_iter));
1668
1669  /* Start state is 0.  */
1670  state = 0;
1671
1672  for (;;)
1673    {
1674      token_ty token;
1675
1676      x_python_lex (&token);
1677      switch (token.type)
1678	{
1679	case token_type_symbol:
1680	  {
1681	    void *keyword_value;
1682
1683	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
1684				 &keyword_value)
1685		== 0)
1686	      {
1687		next_shapes = (const struct callshapes *) keyword_value;
1688		state = 1;
1689	      }
1690	    else
1691	      state = 0;
1692	  }
1693	  next_context_iter =
1694	    flag_context_list_iterator (
1695	      flag_context_list_table_lookup (
1696		flag_context_list_table,
1697		token.string, strlen (token.string)));
1698	  free (token.string);
1699	  continue;
1700
1701	case token_type_lparen:
1702	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1703				     arglist_parser_alloc (mlp,
1704							   state ? next_shapes : NULL)))
1705	    {
1706	      xgettext_current_source_encoding = po_charset_utf8;
1707	      arglist_parser_done (argparser, arg);
1708	      xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1709	      return true;
1710	    }
1711	  next_context_iter = null_context_list_iterator;
1712	  state = 0;
1713	  continue;
1714
1715	case token_type_rparen:
1716	  xgettext_current_source_encoding = po_charset_utf8;
1717	  arglist_parser_done (argparser, arg);
1718	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1719	  return false;
1720
1721	case token_type_comma:
1722	  arg++;
1723	  inner_context =
1724	    inherited_context (outer_context,
1725			       flag_context_list_iterator_advance (
1726				 &context_iter));
1727	  next_context_iter = passthrough_context_list_iterator;
1728	  state = 0;
1729	  continue;
1730
1731	case token_type_string:
1732	  {
1733	    lex_pos_ty pos;
1734	    pos.file_name = logical_file_name;
1735	    pos.line_number = token.line_number;
1736
1737	    xgettext_current_source_encoding = po_charset_utf8;
1738	    if (extract_all)
1739	      remember_a_message (mlp, NULL, token.string, inner_context,
1740				  &pos, token.comment);
1741	    else
1742	      arglist_parser_remember (argparser, arg, token.string,
1743				       inner_context,
1744				       pos.file_name, pos.line_number,
1745				       token.comment);
1746	    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1747	  }
1748	  drop_reference (token.comment);
1749	  next_context_iter = null_context_list_iterator;
1750	  state = 0;
1751	  continue;
1752
1753	case token_type_eof:
1754	  xgettext_current_source_encoding = po_charset_utf8;
1755	  arglist_parser_done (argparser, arg);
1756	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1757	  return true;
1758
1759	case token_type_other:
1760	  next_context_iter = null_context_list_iterator;
1761	  state = 0;
1762	  continue;
1763
1764	default:
1765	  abort ();
1766	}
1767    }
1768}
1769
1770
1771void
1772extract_python (FILE *f,
1773		const char *real_filename, const char *logical_filename,
1774		flag_context_list_table_ty *flag_table,
1775		msgdomain_list_ty *mdlp)
1776{
1777  message_list_ty *mlp = mdlp->item[0]->messages;
1778
1779  fp = f;
1780  real_file_name = real_filename;
1781  logical_file_name = xstrdup (logical_filename);
1782  line_number = 1;
1783
1784  last_comment_line = -1;
1785  last_non_comment_line = -1;
1786
1787  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1788#if HAVE_ICONV
1789  xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1790#endif
1791
1792  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1793#if HAVE_ICONV
1794  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1795#endif
1796
1797  continuation_or_nonblank_line = false;
1798
1799  open_pbb = 0;
1800
1801  flag_context_list_table = flag_table;
1802
1803  init_keywords ();
1804
1805  /* Eat tokens until eof is seen.  When extract_parenthesized returns
1806     due to an unbalanced closing parenthesis, just restart it.  */
1807  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1808				 arglist_parser_alloc (mlp, NULL)))
1809    ;
1810
1811  fp = NULL;
1812  real_file_name = NULL;
1813  logical_file_name = NULL;
1814  line_number = 0;
1815}
1816