• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/src/
1/* xgettext Python backend.
2   Copyright (C) 2002-2003, 2005-2007 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23/* Specification.  */
24#include "x-python.h"
25
26#include <assert.h>
27#include <errno.h>
28#include <stdbool.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33#include "message.h"
34#include "xgettext.h"
35#include "x-python.h"
36#include "error.h"
37#include "error-progname.h"
38#include "progname.h"
39#include "basename.h"
40#include "xerror.h"
41#include "xvasprintf.h"
42#include "xalloc.h"
43#include "c-strstr.h"
44#include "c-ctype.h"
45#include "po-charset.h"
46#include "uniname.h"
47#include "unistr.h"
48#include "gettext.h"
49
/* Shorthand for translating a message through gettext.  */
#define _(s) gettext(s)

/* Larger of A and B.  Each argument may be evaluated twice, so do not
   pass expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
55
56
57/* The Python syntax is defined in the Python Reference Manual
58   /usr/share/doc/packages/python/html/ref/index.html.
59   See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
60   Python-2.0/Objects/unicodeobject.c.  */
61
62
63/* ====================== Keyword set customization.  ====================== */
64
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of keyword call shapes; initialized lazily and filled by
   x_python_keyword.  */
static hash_table keywords;
/* True while the built-in keywords still have to be installed by
   init_keywords.  Cleared by x_python_keyword (NULL) and by init_keywords
   itself once the defaults have been added.  */
static bool default_keywords = true;
70
71
72void
73x_python_extract_all ()
74{
75  extract_all = true;
76}
77
78
79void
80x_python_keyword (const char *name)
81{
82  if (name == NULL)
83    default_keywords = false;
84  else
85    {
86      const char *end;
87      struct callshape shape;
88      const char *colon;
89
90      if (keywords.table == NULL)
91	hash_init (&keywords, 100);
92
93      split_keywordspec (name, &end, &shape);
94
95      /* The characters between name and end should form a valid C identifier.
96	 A colon means an invalid parse in split_keywordspec().  */
97      colon = strchr (name, ':');
98      if (colon == NULL || colon >= end)
99	insert_keyword_callshape (&keywords, name, end - name, &shape);
100    }
101}
102
103/* Finish initializing the keywords hash table.
104   Called after argument processing, before each file is processed.  */
105static void
106init_keywords ()
107{
108  if (default_keywords)
109    {
110      /* When adding new keywords here, also update the documentation in
111	 xgettext.texi!  */
112      x_python_keyword ("gettext");
113      x_python_keyword ("ugettext");
114      x_python_keyword ("dgettext:2");
115      x_python_keyword ("ngettext:1,2");
116      x_python_keyword ("ungettext:1,2");
117      x_python_keyword ("dngettext:2,3");
118      x_python_keyword ("_");
119      default_keywords = false;
120    }
121}
122
/* Record the pass-python-format flag for each argument position of the
   built-in gettext-style functions.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
      /* No "%:1:python-format" entry: % is an infix operator!  */
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
138
139
140/* ======================== Reading of characters.  ======================== */
141
142/* Real filename, used in error messages about the input file.  */
143static const char *real_file_name;
144
145/* Logical filename and line number, used to label the extracted messages.  */
146static char *logical_file_name;
147static int line_number;
148
149/* The input file stream.  */
150static FILE *fp;
151
152
153/* 1. line_number handling.  */
154
/* Capacity of the phase 1 pushback stack.
   Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Bytes pushed back by phase1_ungetc; phase1_getc hands them out again in
   last-in-first-out order before reading from the file.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
159
160/* Read the next single byte from the input file.  */
161static int
162phase1_getc ()
163{
164  int c;
165
166  if (phase1_pushback_length)
167    c = phase1_pushback[--phase1_pushback_length];
168  else
169    {
170      c = getc (fp);
171
172      if (c == EOF)
173	{
174	  if (ferror (fp))
175	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
176		   real_file_name);
177	  return EOF;
178	}
179    }
180
181  if (c == '\n')
182    ++line_number;
183
184  return c;
185}
186
187/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
188static void
189phase1_ungetc (int c)
190{
191  if (c != EOF)
192    {
193      if (c == '\n')
194	--line_number;
195
196      if (phase1_pushback_length == SIZEOF (phase1_pushback))
197	abort ();
198      phase1_pushback[phase1_pushback_length++] = c;
199    }
200}
201
202
203/* Phase 2: Conversion to Unicode.
204   This is done early because PEP 0263 specifies that conversion to Unicode
205   conceptually occurs before tokenization.  A test case where it matters
206   is with encodings like BIG5: when a double-byte character ending in 0x5C
207   is followed by '\' or 'u0021', the tokenizer must not treat the second
208   half of the double-byte character as a backslash.  */
209
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Characters pushed back by phase2_ungetc; phase2_getc returns them in
   last-in-first-out order before decoding any new input.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
/* Number of characters currently stored in phase2_pushback.  */
static int phase2_pushback_length;
215
216/* Read the next Unicode UCS-4 character from the input file.  */
217static int
218phase2_getc ()
219{
220  if (phase2_pushback_length)
221    return phase2_pushback[--phase2_pushback_length];
222
223  if (xgettext_current_source_encoding == po_charset_ascii)
224    {
225      int c = phase1_getc ();
226      if (c == EOF)
227	return UEOF;
228      if (!c_isascii (c))
229	{
230	  char buffer[21];
231	  sprintf (buffer, ":%ld", (long) line_number);
232	  multiline_error (xstrdup (""),
233			   xasprintf (_("\
234Non-ASCII string at %s%s.\n\
235Please specify the source encoding through --from-code or through a comment\n\
236as specified in http://www.python.org/peps/pep-0263.html.\n"),
237			   real_file_name, buffer));
238	  exit (EXIT_FAILURE);
239	}
240      return c;
241    }
242  else if (xgettext_current_source_encoding != po_charset_utf8)
243    {
244#if HAVE_ICONV
245      /* Use iconv on an increasing number of bytes.  Read only as many bytes
246	 through phase1_getc as needed.  This is needed to give reasonable
247	 interactive behaviour when fp is connected to an interactive tty.  */
248      unsigned char buf[MAX_PHASE1_PUSHBACK];
249      size_t bufcount;
250      int c = phase1_getc ();
251      if (c == EOF)
252	return UEOF;
253      buf[0] = (unsigned char) c;
254      bufcount = 1;
255
256      for (;;)
257	{
258	  unsigned char scratchbuf[6];
259	  const char *inptr = (const char *) &buf[0];
260	  size_t insize = bufcount;
261	  char *outptr = (char *) &scratchbuf[0];
262	  size_t outsize = sizeof (scratchbuf);
263
264	  size_t res = iconv (xgettext_current_source_iconv,
265			      (ICONV_CONST char **) &inptr, &insize,
266			      &outptr, &outsize);
267	  /* We expect that a character has been produced if and only if
268	     some input bytes have been consumed.  */
269	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
270	    abort ();
271	  if (outsize == sizeof (scratchbuf))
272	    {
273	      /* No character has been produced.  Must be an error.  */
274	      if (res != (size_t)(-1))
275		abort ();
276
277	      if (errno == EILSEQ)
278		{
279		  /* An invalid multibyte sequence was encountered.  */
280		  multiline_error (xstrdup (""),
281				   xasprintf (_("\
282%s:%d: Invalid multibyte sequence.\n\
283Please specify the correct source encoding through --from-code or through a\n\
284comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
285				   real_file_name, line_number));
286		  exit (EXIT_FAILURE);
287		}
288	      else if (errno == EINVAL)
289		{
290		  /* An incomplete multibyte character.  */
291		  int c;
292
293		  if (bufcount == MAX_PHASE1_PUSHBACK)
294		    {
295		      /* An overlong incomplete multibyte sequence was
296			 encountered.  */
297		      multiline_error (xstrdup (""),
298				       xasprintf (_("\
299%s:%d: Long incomplete multibyte sequence.\n\
300Please specify the correct source encoding through --from-code or through a\n\
301comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
302				       real_file_name, line_number));
303		      exit (EXIT_FAILURE);
304		    }
305
306		  /* Read one more byte and retry iconv.  */
307		  c = phase1_getc ();
308		  if (c == EOF)
309		    {
310		      multiline_error (xstrdup (""),
311				       xasprintf (_("\
312%s:%d: Incomplete multibyte sequence at end of file.\n\
313Please specify the correct source encoding through --from-code or through a\n\
314comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
315				       real_file_name, line_number));
316		      exit (EXIT_FAILURE);
317		    }
318		  if (c == '\n')
319		    {
320		      multiline_error (xstrdup (""),
321				       xasprintf (_("\
322%s:%d: Incomplete multibyte sequence at end of line.\n\
323Please specify the correct source encoding through --from-code or through a\n\
324comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
325				       real_file_name, line_number - 1));
326		      exit (EXIT_FAILURE);
327		    }
328		  buf[bufcount++] = (unsigned char) c;
329		}
330	      else
331		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
332		       real_file_name, line_number);
333	    }
334	  else
335	    {
336	      size_t outbytes = sizeof (scratchbuf) - outsize;
337	      size_t bytes = bufcount - insize;
338	      unsigned int uc;
339
340	      /* We expect that one character has been produced.  */
341	      if (bytes == 0)
342		abort ();
343	      if (outbytes == 0)
344		abort ();
345	      /* Push back the unused bytes.  */
346	      while (insize > 0)
347		phase1_ungetc (buf[--insize]);
348	      /* Convert the character from UTF-8 to UCS-4.  */
349	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
350		{
351		  /* scratchbuf contains an out-of-range Unicode character
352		     (> 0x10ffff).  */
353		  multiline_error (xstrdup (""),
354				   xasprintf (_("\
355%s:%d: Invalid multibyte sequence.\n\
356Please specify the source encoding through --from-code or through a comment\n\
357as specified in http://www.python.org/peps/pep-0263.html.\n"),
358				   real_file_name, line_number));
359		  exit (EXIT_FAILURE);
360		}
361	      return uc;
362	    }
363	}
364#else
365      /* If we don't have iconv(), the only supported values for
366	 xgettext_global_source_encoding and thus also for
367	 xgettext_current_source_encoding are ASCII and UTF-8.  */
368      abort ();
369#endif
370    }
371  else
372    {
373      /* Read an UTF-8 encoded character.  */
374      unsigned char buf[6];
375      unsigned int count;
376      int c;
377      unsigned int uc;
378
379      c = phase1_getc ();
380      if (c == EOF)
381	return UEOF;
382      buf[0] = c;
383      count = 1;
384
385      if (buf[0] >= 0xc0)
386	{
387	  c = phase1_getc ();
388	  if (c == EOF)
389	    return UEOF;
390	  buf[1] = c;
391	  count = 2;
392	}
393
394      if (buf[0] >= 0xe0
395	  && ((buf[1] ^ 0x80) < 0x40))
396	{
397	  c = phase1_getc ();
398	  if (c == EOF)
399	    return UEOF;
400	  buf[2] = c;
401	  count = 3;
402	}
403
404      if (buf[0] >= 0xf0
405	  && ((buf[1] ^ 0x80) < 0x40)
406	  && ((buf[2] ^ 0x80) < 0x40))
407	{
408	  c = phase1_getc ();
409	  if (c == EOF)
410	    return UEOF;
411	  buf[3] = c;
412	  count = 4;
413	}
414
415      if (buf[0] >= 0xf8
416	  && ((buf[1] ^ 0x80) < 0x40)
417	  && ((buf[2] ^ 0x80) < 0x40)
418	  && ((buf[3] ^ 0x80) < 0x40))
419	{
420	  c = phase1_getc ();
421	  if (c == EOF)
422	    return UEOF;
423	  buf[4] = c;
424	  count = 5;
425	}
426
427      if (buf[0] >= 0xfc
428	  && ((buf[1] ^ 0x80) < 0x40)
429	  && ((buf[2] ^ 0x80) < 0x40)
430	  && ((buf[3] ^ 0x80) < 0x40)
431	  && ((buf[4] ^ 0x80) < 0x40))
432	{
433	  c = phase1_getc ();
434	  if (c == EOF)
435	    return UEOF;
436	  buf[5] = c;
437	  count = 6;
438	}
439
440      u8_mbtouc (&uc, buf, count);
441      return uc;
442    }
443}
444
445/* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
446static void
447phase2_ungetc (int c)
448{
449  if (c != UEOF)
450    {
451      if (phase2_pushback_length == SIZEOF (phase2_pushback))
452	abort ();
453      phase2_pushback[phase2_pushback_length++] = c;
454    }
455}
456
457
458/* ========================= Accumulating strings.  ======================== */
459
460/* A string buffer type that allows appending Unicode characters.
461   Returns the entire string in UTF-8 encoding.  */
462
struct unicode_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  /* Storage, grown with xrealloc; NULL while nothing is allocated.  */
  char *utf8_buffer;
  /* Number of bytes of utf8_buffer currently in use.  */
  size_t utf8_buflen;
  /* Number of bytes allocated for utf8_buffer.  */
  size_t utf8_allocated;
};
470
471/* Initialize a 'struct unicode_string_buffer' to empty.  */
472static inline void
473init_unicode_string_buffer (struct unicode_string_buffer *bp)
474{
475  bp->utf8_buffer = NULL;
476  bp->utf8_buflen = 0;
477  bp->utf8_allocated = 0;
478}
479
480/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
481static inline void
482unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
483					   size_t count)
484{
485  if (bp->utf8_buflen + count > bp->utf8_allocated)
486    {
487      size_t new_allocated = 2 * bp->utf8_allocated + 10;
488      if (new_allocated < bp->utf8_buflen + count)
489	new_allocated = bp->utf8_buflen + count;
490      bp->utf8_allocated = new_allocated;
491      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
492    }
493}
494
495/* Auxiliary function: Append a Unicode character to bp->utf8.
496   uc must be < 0x110000.  */
497static inline void
498unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
499				      unsigned int uc)
500{
501  unsigned char utf8buf[6];
502  int count = u8_uctomb (utf8buf, uc, 6);
503
504  if (count < 0)
505    /* The caller should have ensured that uc is not out-of-range.  */
506    abort ();
507
508  unicode_string_buffer_append_unicode_grow (bp, count);
509  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
510  bp->utf8_buflen += count;
511}
512
513/* Return the string buffer's contents.  */
514static char *
515unicode_string_buffer_result (struct unicode_string_buffer *bp)
516{
517  /* NUL-terminate it.  */
518  unicode_string_buffer_append_unicode_grow (bp, 1);
519  bp->utf8_buffer[bp->utf8_buflen] = '\0';
520  /* Return it.  */
521  return bp->utf8_buffer;
522}
523
524/* Free the memory pointed to by a 'struct unicode_string_buffer'.  */
525static inline void
526free_unicode_string_buffer (struct unicode_string_buffer *bp)
527{
528  free (bp->utf8_buffer);
529}
530
531
532/* ======================== Accumulating comments.  ======================== */
533
534
/* Accumulating a single comment line.  */

/* Buffer holding the text of the comment line being read.  As a static
   object it starts out zero-initialized, i.e. empty; comment_start only
   resets its length, so the storage is reused from line to line.  */
static struct unicode_string_buffer comment_buffer;
538
539static inline void
540comment_start ()
541{
542  comment_buffer.utf8_buflen = 0;
543}
544
545static inline bool
546comment_at_start ()
547{
548  return (comment_buffer.utf8_buflen == 0);
549}
550
551static inline void
552comment_add (int c)
553{
554  unicode_string_buffer_append_unicode (&comment_buffer, c);
555}
556
557static inline const char *
558comment_line_end ()
559{
560  char *buffer = unicode_string_buffer_result (&comment_buffer);
561  size_t buflen = strlen (buffer);
562
563  while (buflen >= 1
564	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
565    --buflen;
566  buffer[buflen] = '\0';
567  savable_comment_add (buffer);
568  return buffer;
569}
570
571
572/* These are for tracking whether comments count as immediately before
573   keyword.  */
574static int last_comment_line;
575static int last_non_comment_line;
576
577
578/* ======================== Recognizing comments.  ======================== */
579
580
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {blank}* {alphanumeric or "-" or "_" or "."}+
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - contains no other tokens except this comment.  */
588
589/* Canonicalized encoding name for the current input file.  */
590static const char *xgettext_current_file_source_encoding;
591
592#if HAVE_ICONV
593/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
594   ASCII or UTF-8, when this conversion is a no-op).  */
595static iconv_t xgettext_current_file_source_iconv;
596#endif
597
598static inline void
599set_current_file_source_encoding (const char *canon_encoding)
600{
601  xgettext_current_file_source_encoding = canon_encoding;
602
603  if (xgettext_current_file_source_encoding != po_charset_ascii
604      && xgettext_current_file_source_encoding != po_charset_utf8)
605    {
606#if HAVE_ICONV
607      iconv_t cd;
608
609      /* Avoid glibc-2.1 bug with EUC-KR.  */
610# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
611      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
612	cd = (iconv_t)(-1);
613      else
614# endif
615      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
616      if (cd == (iconv_t)(-1))
617	error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
618Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
619and iconv() does not support this conversion."),
620	       xgettext_current_file_source_encoding, po_charset_utf8,
621	       basename (program_name));
622      xgettext_current_file_source_iconv = cd;
623#else
624      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
625Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
626This version was built without iconv()."),
627	     xgettext_global_source_encoding, po_charset_utf8,
628	     basename (program_name));
629#endif
630    }
631
632  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
633#if HAVE_ICONV
634  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
635#endif
636}
637
638static inline void
639try_to_extract_coding (const char *comment)
640{
641  const char *p = c_strstr (comment, "coding");
642
643  if (p != NULL)
644    {
645      p += 6;
646      if (*p == ':' || *p == '=')
647	{
648	  p++;
649	  while (*p == ' ' || *p == '\t')
650	    p++;
651	  {
652	    const char *encoding_start = p;
653
654	    while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
655	      p++;
656	    {
657	      const char *encoding_end = p;
658
659	      if (encoding_end > encoding_start)
660		{
661		  /* Extract the encoding string.  */
662		  size_t encoding_len = encoding_end - encoding_start;
663		  char *encoding = XNMALLOC (encoding_len + 1, char);
664
665		  memcpy (encoding, encoding_start, encoding_len);
666		  encoding[encoding_len] = '\0';
667
668		  {
669		    /* Canonicalize it.  */
670		    const char *canon_encoding = po_charset_canonicalize (encoding);
671		    if (canon_encoding == NULL)
672		      {
673			error_at_line (0, 0,
674				       logical_file_name, line_number - 1, _("\
675Unknown encoding \"%s\". Proceeding with ASCII instead."),
676				       encoding);
677		        canon_encoding = po_charset_ascii;
678		      }
679
680		    /* Activate it.  */
681		    set_current_file_source_encoding (canon_encoding);
682		  }
683
684		  free (encoding);
685		}
686	    }
687	  }
688	}
689    }
690}
691
692/* Tracking whether the current line is a continuation line or contains a
693   non-blank character.  */
694static bool continuation_or_nonblank_line = false;
695
696
697/* Phase 3: Outside strings, replace backslash-newline with nothing and a
698   comment with nothing.  */
699
700static int
701phase3_getc ()
702{
703  int c;
704
705  for (;;)
706    {
707      c = phase2_getc ();
708      if (c == '\\')
709	{
710	  c = phase2_getc ();
711	  if (c != '\n')
712	    {
713	      phase2_ungetc (c);
714	      /* This shouldn't happen usually, because "A backslash is
715		 illegal elsewhere on a line outside a string literal."  */
716	      return '\\';
717	    }
718	  /* Eat backslash-newline.  */
719	  continuation_or_nonblank_line = true;
720	}
721      else if (c == '#')
722	{
723	  /* Eat a comment.  */
724	  const char *comment;
725
726	  last_comment_line = line_number;
727	  comment_start ();
728	  for (;;)
729	    {
730	      c = phase2_getc ();
731	      if (c == UEOF || c == '\n')
732		break;
733	      /* We skip all leading white space, but not EOLs.  */
734	      if (!(comment_at_start () && (c == ' ' || c == '\t')))
735		comment_add (c);
736	    }
737	  comment = comment_line_end ();
738	  if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
739	    try_to_extract_coding (comment);
740	  continuation_or_nonblank_line = false;
741	  return c;
742	}
743      else
744	{
745	  if (c == '\n')
746	    continuation_or_nonblank_line = false;
747	  else if (!(c == ' ' || c == '\t' || c == '\f'))
748	    continuation_or_nonblank_line = true;
749	  return c;
750	}
751    }
752}
753
754/* Supports only one pushback character.  */
/* Push back the character C.  Phase 3 has no pushback storage of its own;
   the character goes straight to the phase 2 pushback.  */
static void
phase3_ungetc (int c)
{
  int pushed = c;

  phase2_ungetc (pushed);
}
760
761
762/* ========================= Accumulating strings.  ======================== */
763
764/* Return value of phase7_getuc when EOF is reached.  */
765#define P7_EOF (-1)
766#define P7_STRING_END (-2)
767
768/* Convert an UTF-16 or UTF-32 code point to a return value that can be
769   distinguished from a single-byte return value.  */
770#define UNICODE(code) (0x100 + (code))
771
772/* Test a return value of phase7_getuc whether it designates an UTF-16 or
773   UTF-32 code point.  */
774#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
775
776/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
777   IS_UNICODE.  */
778#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
779
780/* A string buffer type that allows appending bytes (in the
781   xgettext_current_source_encoding) or Unicode characters.
782   Returns the entire string in UTF-8 encoding.  */
783
struct mixed_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  /* Storage, bytes in use, and bytes allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character.  */
  /* Zero when no half surrogate is pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.  */
  /* Storage, bytes in use, and bytes allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
797
798/* Initialize a 'struct mixed_string_buffer' to empty.  */
799static inline void
800init_mixed_string_buffer (struct mixed_string_buffer *bp)
801{
802  bp->utf8_buffer = NULL;
803  bp->utf8_buflen = 0;
804  bp->utf8_allocated = 0;
805  bp->utf16_surr = 0;
806  bp->curr_buffer = NULL;
807  bp->curr_buflen = 0;
808  bp->curr_allocated = 0;
809}
810
811/* Auxiliary function: Append a byte to bp->curr.  */
812static inline void
813mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
814{
815  if (bp->curr_buflen == bp->curr_allocated)
816    {
817      bp->curr_allocated = 2 * bp->curr_allocated + 10;
818      bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
819    }
820  bp->curr_buffer[bp->curr_buflen++] = c;
821}
822
823/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
824static inline void
825mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
826{
827  if (bp->utf8_buflen + count > bp->utf8_allocated)
828    {
829      size_t new_allocated = 2 * bp->utf8_allocated + 10;
830      if (new_allocated < bp->utf8_buflen + count)
831	new_allocated = bp->utf8_buflen + count;
832      bp->utf8_allocated = new_allocated;
833      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
834    }
835}
836
837/* Auxiliary function: Append a Unicode character to bp->utf8.
838   uc must be < 0x110000.  */
839static inline void
840mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
841{
842  unsigned char utf8buf[6];
843  int count = u8_uctomb (utf8buf, uc, 6);
844
845  if (count < 0)
846    /* The caller should have ensured that uc is not out-of-range.  */
847    abort ();
848
849  mixed_string_buffer_append_unicode_grow (bp, count);
850  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
851  bp->utf8_buflen += count;
852}
853
854/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
855static inline void
856mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
857{
858  if (bp->utf16_surr != 0)
859    {
860      /* A half surrogate is invalid, therefore use U+FFFD instead.  */
861      mixed_string_buffer_append_unicode (bp, 0xfffd);
862      bp->utf16_surr = 0;
863    }
864}
865
866/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
867static inline void
868mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
869{
870  if (bp->curr_buflen > 0)
871    {
872      char *curr;
873      size_t count;
874
875      mixed_string_buffer_append_byte (bp, '\0');
876
877      /* Convert from the source encoding to UTF-8.  */
878      curr = from_current_source_encoding (bp->curr_buffer,
879					   logical_file_name, lineno);
880
881      /* Append it to bp->utf8_buffer.  */
882      count = strlen (curr);
883      mixed_string_buffer_append_unicode_grow (bp, count);
884      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
885      bp->utf8_buflen += count;
886
887      if (curr != bp->curr_buffer)
888	free (curr);
889      bp->curr_buflen = 0;
890    }
891}
892
893/* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
894static void
895mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
896{
897  if (IS_UNICODE (c))
898    {
899      /* Append a Unicode character.  */
900
901      /* Switch from multibyte character mode to Unicode character mode.  */
902      mixed_string_buffer_flush_curr_buffer (bp, line_number);
903
904      /* Test whether this character and the previous one form a Unicode
905	 surrogate character pair.  */
906      if (bp->utf16_surr != 0
907	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
908	{
909	  unsigned short utf16buf[2];
910	  unsigned int uc;
911
912	  utf16buf[0] = bp->utf16_surr;
913	  utf16buf[1] = UNICODE_VALUE (c);
914	  if (u16_mbtouc (&uc, utf16buf, 2) != 2)
915	    abort ();
916
917	  mixed_string_buffer_append_unicode (bp, uc);
918	  bp->utf16_surr = 0;
919	}
920      else
921	{
922	  mixed_string_buffer_flush_utf16_surr (bp);
923
924	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
925	    bp->utf16_surr = UNICODE_VALUE (c);
926	  else
927	    mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
928	}
929    }
930  else
931    {
932      /* Append a single byte.  */
933
934      /* Switch from Unicode character mode to multibyte character mode.  */
935      mixed_string_buffer_flush_utf16_surr (bp);
936
937      /* When a newline is seen, convert the accumulated multibyte sequence.
938	 This ensures a correct line number in the error message in case of
939	 a conversion error.  The "- 1" is to account for the newline.  */
940      if (c == '\n')
941	mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
942
943      mixed_string_buffer_append_byte (bp, (unsigned char) c);
944    }
945}
946
947/* Return the string buffer's contents.  */
948static char *
949mixed_string_buffer_result (struct mixed_string_buffer *bp)
950{
951  /* Flush all into bp->utf8_buffer.  */
952  mixed_string_buffer_flush_utf16_surr (bp);
953  mixed_string_buffer_flush_curr_buffer (bp, line_number);
954  /* NUL-terminate it.  */
955  mixed_string_buffer_append_unicode_grow (bp, 1);
956  bp->utf8_buffer[bp->utf8_buflen] = '\0';
957  /* Return it.  */
958  return bp->utf8_buffer;
959}
960
961/* Free the memory pointed to by a 'struct mixed_string_buffer'.  */
962static inline void
963free_mixed_string_buffer (struct mixed_string_buffer *bp)
964{
965  free (bp->utf8_buffer);
966  free (bp->curr_buffer);
967}
968
969
970/* ========================== Reading of tokens.  ========================== */
971
972
/* The kinds of tokens distinguished by the tokenizer.  */
enum token_type_ty
{
  token_type_eof,		/* end of input */
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_string,		/* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,		/* symbol, number */
  token_type_other		/* misc. operator */
};
typedef enum token_type_ty token_type_ty;
984
/* A token: its kind plus the data that accompanies some kinds.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_string, token_type_symbol */
  refcounted_string_list_ty *comment;	/* for token_type_string */
  /* NOTE(review): presumably the source line the token was read from;
     confirm against the tokenizer, which is outside this excerpt.  */
  int line_number;
};
993
994
995/* There are two different input syntaxes for strings, "abc" and r"abc",
996   and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
997   Which escape sequences are understood, i.e. what is interpreted specially
998   after backslash?
999    "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1000    r"abc"
1001    u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1002    ur"abc"                                           \unnnn
1003   The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1004   \unnnn items.  The \ooo and \xnn values are in the current source encoding
1005   for byte strings, and Unicode code points for Unicode strings.
1006 */
1007
1008static int
1009phase7_getuc (int quote_char,
1010	      bool triple, bool interpret_ansic, bool interpret_unicode,
1011	      unsigned int *backslash_counter)
1012{
1013  int c;
1014
1015  for (;;)
1016    {
1017      /* Use phase 2, because phase 3 elides comments.  */
1018      c = phase2_getc ();
1019
1020      if (c == UEOF)
1021	return P7_EOF;
1022
1023      if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
1024	{
1025	  if (triple)
1026	    {
1027	      int c1 = phase2_getc ();
1028	      if (c1 == quote_char)
1029		{
1030		  int c2 = phase2_getc ();
1031		  if (c2 == quote_char)
1032		    return P7_STRING_END;
1033		  phase2_ungetc (c2);
1034		}
1035	      phase2_ungetc (c1);
1036	      return UNICODE (c);
1037	    }
1038	  else
1039	    return P7_STRING_END;
1040	}
1041
1042      if (c == '\n')
1043	{
1044	  if (triple)
1045	    {
1046	      *backslash_counter = 0;
1047	      return UNICODE ('\n');
1048	    }
1049	  /* In r"..." and ur"..." strings, newline is only allowed
1050	     immediately after an odd number of backslashes (although the
1051	     backslashes are not interpreted!).  */
1052	  if (!(interpret_ansic || (*backslash_counter & 1) == 0))
1053	    {
1054	      *backslash_counter = 0;
1055	      return UNICODE ('\n');
1056	    }
1057	  phase2_ungetc (c);
1058	  error_with_progname = false;
1059	  error (0, 0, _("%s:%d: warning: unterminated string"),
1060		 logical_file_name, line_number);
1061	  error_with_progname = true;
1062	  return P7_STRING_END;
1063	}
1064
1065      if (c != '\\')
1066	{
1067	  *backslash_counter = 0;
1068	  return UNICODE (c);
1069	}
1070
1071      /* Backslash handling.  */
1072
1073      if (!interpret_ansic && !interpret_unicode)
1074	{
1075	  ++*backslash_counter;
1076	  return UNICODE ('\\');
1077	}
1078
1079      /* Dispatch according to the character following the backslash.  */
1080      c = phase2_getc ();
1081      if (c == UEOF)
1082	{
1083	  ++*backslash_counter;
1084	  return UNICODE ('\\');
1085	}
1086
1087      if (interpret_ansic)
1088	switch (c)
1089	  {
1090	  case '\n':
1091	    continue;
1092	  case '\\':
1093	    ++*backslash_counter;
1094	    return UNICODE (c);
1095	  case '\'': case '"':
1096	    *backslash_counter = 0;
1097	    return UNICODE (c);
1098	  case 'a':
1099	    *backslash_counter = 0;
1100	    return UNICODE ('\a');
1101	  case 'b':
1102	    *backslash_counter = 0;
1103	    return UNICODE ('\b');
1104	  case 'f':
1105	    *backslash_counter = 0;
1106	    return UNICODE ('\f');
1107	  case 'n':
1108	    *backslash_counter = 0;
1109	    return UNICODE ('\n');
1110	  case 'r':
1111	    *backslash_counter = 0;
1112	    return UNICODE ('\r');
1113	  case 't':
1114	    *backslash_counter = 0;
1115	    return UNICODE ('\t');
1116	  case 'v':
1117	    *backslash_counter = 0;
1118	    return UNICODE ('\v');
1119	  case '0': case '1': case '2': case '3': case '4':
1120	  case '5': case '6': case '7':
1121	    {
1122	      int n = c - '0';
1123
1124	      c = phase2_getc ();
1125	      if (c != UEOF)
1126		{
1127		  if (c >= '0' && c <= '7')
1128		    {
1129		      n = (n << 3) + (c - '0');
1130		      c = phase2_getc ();
1131		      if (c != UEOF)
1132			{
1133			  if (c >= '0' && c <= '7')
1134			    n = (n << 3) + (c - '0');
1135			  else
1136			    phase2_ungetc (c);
1137			}
1138		    }
1139		  else
1140		    phase2_ungetc (c);
1141		}
1142	      *backslash_counter = 0;
1143	      if (interpret_unicode)
1144		return UNICODE (n);
1145	      else
1146		return (unsigned char) n;
1147	    }
1148	  case 'x':
1149	    {
1150	      int c1 = phase2_getc ();
1151	      int n1;
1152
1153	      if (c1 >= '0' && c1 <= '9')
1154		n1 = c1 - '0';
1155	      else if (c1 >= 'A' && c1 <= 'F')
1156		n1 = c1 - 'A' + 10;
1157	      else if (c1 >= 'a' && c1 <= 'f')
1158		n1 = c1 - 'a' + 10;
1159	      else
1160		n1 = -1;
1161
1162	      if (n1 >= 0)
1163		{
1164		  int c2 = phase2_getc ();
1165		  int n2;
1166
1167		  if (c2 >= '0' && c2 <= '9')
1168		    n2 = c2 - '0';
1169		  else if (c2 >= 'A' && c2 <= 'F')
1170		    n2 = c2 - 'A' + 10;
1171		  else if (c2 >= 'a' && c2 <= 'f')
1172		    n2 = c2 - 'a' + 10;
1173		  else
1174		    n2 = -1;
1175
1176		  if (n2 >= 0)
1177		    {
1178		      int n = (n1 << 4) + n2;
1179		      *backslash_counter = 0;
1180		      if (interpret_unicode)
1181			return UNICODE (n);
1182		      else
1183			return (unsigned char) n;
1184		    }
1185
1186		  phase2_ungetc (c2);
1187		}
1188	      phase2_ungetc (c1);
1189	      phase2_ungetc (c);
1190	      ++*backslash_counter;
1191	      return UNICODE ('\\');
1192	    }
1193	  }
1194
1195      if (interpret_unicode)
1196	{
1197	  if (c == 'u')
1198	    {
1199	      unsigned char buf[4];
1200	      unsigned int n = 0;
1201	      int i;
1202
1203	      for (i = 0; i < 4; i++)
1204		{
1205		  int c1 = phase2_getc ();
1206
1207		  if (c1 >= '0' && c1 <= '9')
1208		    n = (n << 4) + (c1 - '0');
1209		  else if (c1 >= 'A' && c1 <= 'F')
1210		    n = (n << 4) + (c1 - 'A' + 10);
1211		  else if (c1 >= 'a' && c1 <= 'f')
1212		    n = (n << 4) + (c1 - 'a' + 10);
1213		  else
1214		    {
1215		      phase2_ungetc (c1);
1216		      while (--i >= 0)
1217			phase2_ungetc (buf[i]);
1218		      phase2_ungetc (c);
1219		      ++*backslash_counter;
1220		      return UNICODE ('\\');
1221		    }
1222
1223		  buf[i] = c1;
1224		}
1225	      *backslash_counter = 0;
1226	      return UNICODE (n);
1227	    }
1228
1229	  if (interpret_ansic)
1230	    {
1231	      if (c == 'U')
1232		{
1233		  unsigned char buf[8];
1234		  unsigned int n = 0;
1235		  int i;
1236
1237		  for (i = 0; i < 8; i++)
1238		    {
1239		      int c1 = phase2_getc ();
1240
1241		      if (c1 >= '0' && c1 <= '9')
1242			n = (n << 4) + (c1 - '0');
1243		      else if (c1 >= 'A' && c1 <= 'F')
1244			n = (n << 4) + (c1 - 'A' + 10);
1245		      else if (c1 >= 'a' && c1 <= 'f')
1246			n = (n << 4) + (c1 - 'a' + 10);
1247		      else
1248			{
1249			  phase2_ungetc (c1);
1250			  while (--i >= 0)
1251			    phase2_ungetc (buf[i]);
1252			  phase2_ungetc (c);
1253			  ++*backslash_counter;
1254			  return UNICODE ('\\');
1255			}
1256
1257		      buf[i] = c1;
1258		    }
1259		  if (n < 0x110000)
1260		    {
1261		      *backslash_counter = 0;
1262		      return UNICODE (n);
1263		    }
1264
1265		  error_with_progname = false;
1266		  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1267			 logical_file_name, line_number);
1268		  error_with_progname = true;
1269
1270		  while (--i >= 0)
1271		    phase2_ungetc (buf[i]);
1272		  phase2_ungetc (c);
1273		  ++*backslash_counter;
1274		  return UNICODE ('\\');
1275		}
1276
1277	      if (c == 'N')
1278		{
1279		  int c1 = phase2_getc ();
1280		  if (c1 == '{')
1281		    {
1282		      unsigned char buf[UNINAME_MAX + 1];
1283		      int i;
1284		      unsigned int n;
1285
1286		      for (i = 0; i < UNINAME_MAX; i++)
1287			{
1288			  int c2 = phase2_getc ();
1289			  if (!(c2 >= ' ' && c2 <= '~'))
1290			    {
1291			      phase2_ungetc (c2);
1292			      while (--i >= 0)
1293				phase2_ungetc (buf[i]);
1294			      phase2_ungetc (c1);
1295			      phase2_ungetc (c);
1296			      ++*backslash_counter;
1297			      return UNICODE ('\\');
1298			    }
1299			  if (c2 == '}')
1300			    break;
1301			  buf[i] = c2;
1302			}
1303		      buf[i] = '\0';
1304
1305		      n = unicode_name_character ((char *) buf);
1306		      if (n != UNINAME_INVALID)
1307			{
1308			  *backslash_counter = 0;
1309			  return UNICODE (n);
1310			}
1311
1312		      phase2_ungetc ('}');
1313		      while (--i >= 0)
1314			phase2_ungetc (buf[i]);
1315		    }
1316		  phase2_ungetc (c1);
1317		  phase2_ungetc (c);
1318		  ++*backslash_counter;
1319		  return UNICODE ('\\');
1320		}
1321	    }
1322	}
1323
1324      phase2_ungetc (c);
1325      ++*backslash_counter;
1326      return UNICODE ('\\');
1327    }
1328}
1329
1330
1331/* Combine characters into tokens.  Discard whitespace except newlines at
1332   the end of logical lines.  */
1333
1334/* Number of pending open parentheses/braces/brackets.  */
1335static int open_pbb;
1336
1337static token_ty phase5_pushback[1];
1338static int phase5_pushback_length;
1339
1340static void
1341phase5_get (token_ty *tp)
1342{
1343  int c;
1344
1345  if (phase5_pushback_length)
1346    {
1347      *tp = phase5_pushback[--phase5_pushback_length];
1348      return;
1349    }
1350
1351  for (;;)
1352    {
1353      tp->line_number = line_number;
1354      c = phase3_getc ();
1355
1356      switch (c)
1357	{
1358	case UEOF:
1359	  tp->type = token_type_eof;
1360	  return;
1361
1362	case ' ':
1363	case '\t':
1364	case '\f':
1365	  /* Ignore whitespace and comments.  */
1366	  continue;
1367
1368	case '\n':
1369	  if (last_non_comment_line > last_comment_line)
1370	    savable_comment_reset ();
1371	  /* Ignore newline if and only if it is used for implicit line
1372	     joining.  */
1373	  if (open_pbb > 0)
1374	    continue;
1375	  tp->type = token_type_other;
1376	  return;
1377	}
1378
1379      last_non_comment_line = tp->line_number;
1380
1381      switch (c)
1382	{
1383	case '.':
1384	  {
1385	    int c1 = phase3_getc ();
1386	    phase3_ungetc (c1);
1387	    if (!(c1 >= '0' && c1 <= '9'))
1388	      {
1389
1390		tp->type = token_type_other;
1391		return;
1392	      }
1393	  }
1394	  /* FALLTHROUGH */
1395	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1396	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1397	case 'M': case 'N': case 'O': case 'P': case 'Q':
1398	case 'S': case 'T':           case 'V': case 'W': case 'X':
1399	case 'Y': case 'Z':
1400	case '_':
1401	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1402	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1403	case 'm': case 'n': case 'o': case 'p': case 'q':
1404	case 's': case 't':           case 'v': case 'w': case 'x':
1405	case 'y': case 'z':
1406	case '0': case '1': case '2': case '3': case '4':
1407	case '5': case '6': case '7': case '8': case '9':
1408	symbol:
1409	  /* Symbol, or part of a number.  */
1410	  {
1411	    static char *buffer;
1412	    static int bufmax;
1413	    int bufpos;
1414
1415	    bufpos = 0;
1416	    for (;;)
1417	      {
1418		if (bufpos >= bufmax)
1419		  {
1420		    bufmax = 2 * bufmax + 10;
1421		    buffer = xrealloc (buffer, bufmax);
1422		  }
1423		buffer[bufpos++] = c;
1424		c = phase3_getc ();
1425		switch (c)
1426		  {
1427		  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1428		  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1429		  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1430		  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1431		  case 'Y': case 'Z':
1432		  case '_':
1433		  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1434		  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1435		  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1436		  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1437		  case 'y': case 'z':
1438		  case '0': case '1': case '2': case '3': case '4':
1439		  case '5': case '6': case '7': case '8': case '9':
1440		    continue;
1441		  default:
1442		    phase3_ungetc (c);
1443		    break;
1444		  }
1445		break;
1446	      }
1447	    if (bufpos >= bufmax)
1448	      {
1449		bufmax = 2 * bufmax + 10;
1450		buffer = xrealloc (buffer, bufmax);
1451	      }
1452	    buffer[bufpos] = '\0';
1453	    tp->string = xstrdup (buffer);
1454	    tp->type = token_type_symbol;
1455	    return;
1456	  }
1457
1458	/* Strings.  */
1459	  {
1460	    struct mixed_string_buffer literal;
1461	    int quote_char;
1462	    bool interpret_ansic;
1463	    bool interpret_unicode;
1464	    bool triple;
1465	    unsigned int backslash_counter;
1466
1467	    case 'R': case 'r':
1468	      {
1469		int c1 = phase2_getc ();
1470		if (c1 == '"' || c1 == '\'')
1471		  {
1472		    quote_char = c1;
1473		    interpret_ansic = false;
1474		    interpret_unicode = false;
1475		    goto string;
1476		  }
1477		phase2_ungetc (c1);
1478		goto symbol;
1479	      }
1480
1481	    case 'U': case 'u':
1482	      {
1483		int c1 = phase2_getc ();
1484		if (c1 == '"' || c1 == '\'')
1485		  {
1486		    quote_char = c1;
1487		    interpret_ansic = true;
1488		    interpret_unicode = true;
1489		    goto string;
1490		  }
1491		if (c1 == 'R' || c1 == 'r')
1492		  {
1493		    int c2 = phase2_getc ();
1494		    if (c2 == '"' || c2 == '\'')
1495		      {
1496			quote_char = c2;
1497			interpret_ansic = false;
1498			interpret_unicode = true;
1499			goto string;
1500		      }
1501		    phase2_ungetc (c2);
1502		  }
1503		phase2_ungetc (c1);
1504		goto symbol;
1505	      }
1506
1507	    case '"': case '\'':
1508	      quote_char = c;
1509	      interpret_ansic = true;
1510	      interpret_unicode = false;
1511	    string:
1512	      triple = false;
1513	      {
1514		int c1 = phase2_getc ();
1515		if (c1 == quote_char)
1516		  {
1517		    int c2 = phase2_getc ();
1518		    if (c2 == quote_char)
1519		      triple = true;
1520		    else
1521		      {
1522			phase2_ungetc (c2);
1523			phase2_ungetc (c1);
1524		      }
1525		  }
1526		else
1527		  phase2_ungetc (c1);
1528	      }
1529	      backslash_counter = 0;
1530	      /* Start accumulating the string.  */
1531	      init_mixed_string_buffer (&literal);
1532	      for (;;)
1533		{
1534		  int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1535					 interpret_unicode, &backslash_counter);
1536
1537		  if (uc == P7_EOF || uc == P7_STRING_END)
1538		    break;
1539
1540		  if (IS_UNICODE (uc))
1541		    assert (UNICODE_VALUE (uc) >= 0
1542			    && UNICODE_VALUE (uc) < 0x110000);
1543
1544		  mixed_string_buffer_append (&literal, uc);
1545		}
1546	      tp->string = xstrdup (mixed_string_buffer_result (&literal));
1547	      free_mixed_string_buffer (&literal);
1548	      tp->comment = add_reference (savable_comment);
1549	      tp->type = token_type_string;
1550	      return;
1551	  }
1552
1553	case '(':
1554	  open_pbb++;
1555	  tp->type = token_type_lparen;
1556	  return;
1557
1558	case ')':
1559	  if (open_pbb > 0)
1560	    open_pbb--;
1561	  tp->type = token_type_rparen;
1562	  return;
1563
1564	case ',':
1565	  tp->type = token_type_comma;
1566	  return;
1567
1568	case '[': case '{':
1569	  open_pbb++;
1570	  tp->type = token_type_other;
1571	  return;
1572
1573	case ']': case '}':
1574	  if (open_pbb > 0)
1575	    open_pbb--;
1576	  tp->type = token_type_other;
1577	  return;
1578
1579	default:
1580	  /* We could carefully recognize each of the 2 and 3 character
1581	     operators, but it is not necessary, as we only need to recognize
1582	     gettext invocations.  Don't bother.  */
1583	  tp->type = token_type_other;
1584	  return;
1585	}
1586    }
1587}
1588
1589/* Supports only one pushback token.  */
1590static void
1591phase5_unget (token_ty *tp)
1592{
1593  if (tp->type != token_type_eof)
1594    {
1595      if (phase5_pushback_length == SIZEOF (phase5_pushback))
1596	abort ();
1597      phase5_pushback[phase5_pushback_length++] = *tp;
1598    }
1599}
1600
1601
1602/* Combine adjacent strings to form a single string.  Note that the end
1603   of a logical line appears as a token of its own, therefore strings that
1604   belong to different logical lines will not be concatenated.  */
1605
1606static void
1607x_python_lex (token_ty *tp)
1608{
1609  phase5_get (tp);
1610  if (tp->type != token_type_string)
1611    return;
1612  for (;;)
1613    {
1614      token_ty tmp;
1615      size_t len;
1616
1617      phase5_get (&tmp);
1618      if (tmp.type != token_type_string)
1619	{
1620	  phase5_unget (&tmp);
1621	  return;
1622	}
1623      len = strlen (tp->string);
1624      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1625      strcpy (tp->string + len, tmp.string);
1626      free (tmp.string);
1627    }
1628}
1629
1630
1631/* ========================= Extracting strings.  ========================== */
1632
1633
1634/* Context lookup table.  */
1635static flag_context_list_table_ty *flag_context_list_table;
1636
1637
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1643
1644     Normal handling: Look for
1645       keyword ( ... msgid ... )
1646     Plural handling: Look for
1647       keyword ( ... msgid ... msgid_plural ... )
1648
1649   We use recursion because the arguments before msgid or between msgid
1650   and msgid_plural can contain subexpressions of the same form.  */
1651
1652
1653/* Extract messages until the next balanced closing parenthesis.
1654   Extracted messages are added to MLP.
1655   Return true upon eof, false upon closing parenthesis.  */
1656static bool
1657extract_parenthesized (message_list_ty *mlp,
1658		       flag_context_ty outer_context,
1659		       flag_context_list_iterator_ty context_iter,
1660		       struct arglist_parser *argparser)
1661{
1662  /* Current argument number.  */
1663  int arg = 1;
1664  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1665  int state;
1666  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1667  const struct callshapes *next_shapes = NULL;
1668  /* Context iterator that will be used if the next token is a '('.  */
1669  flag_context_list_iterator_ty next_context_iter =
1670    passthrough_context_list_iterator;
1671  /* Current context.  */
1672  flag_context_ty inner_context =
1673    inherited_context (outer_context,
1674		       flag_context_list_iterator_advance (&context_iter));
1675
1676  /* Start state is 0.  */
1677  state = 0;
1678
1679  for (;;)
1680    {
1681      token_ty token;
1682
1683      x_python_lex (&token);
1684      switch (token.type)
1685	{
1686	case token_type_symbol:
1687	  {
1688	    void *keyword_value;
1689
1690	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
1691				 &keyword_value)
1692		== 0)
1693	      {
1694		next_shapes = (const struct callshapes *) keyword_value;
1695		state = 1;
1696	      }
1697	    else
1698	      state = 0;
1699	  }
1700	  next_context_iter =
1701	    flag_context_list_iterator (
1702	      flag_context_list_table_lookup (
1703		flag_context_list_table,
1704		token.string, strlen (token.string)));
1705	  free (token.string);
1706	  continue;
1707
1708	case token_type_lparen:
1709	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1710				     arglist_parser_alloc (mlp,
1711							   state ? next_shapes : NULL)))
1712	    {
1713	      xgettext_current_source_encoding = po_charset_utf8;
1714	      arglist_parser_done (argparser, arg);
1715	      xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1716	      return true;
1717	    }
1718	  next_context_iter = null_context_list_iterator;
1719	  state = 0;
1720	  continue;
1721
1722	case token_type_rparen:
1723	  xgettext_current_source_encoding = po_charset_utf8;
1724	  arglist_parser_done (argparser, arg);
1725	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1726	  return false;
1727
1728	case token_type_comma:
1729	  arg++;
1730	  inner_context =
1731	    inherited_context (outer_context,
1732			       flag_context_list_iterator_advance (
1733				 &context_iter));
1734	  next_context_iter = passthrough_context_list_iterator;
1735	  state = 0;
1736	  continue;
1737
1738	case token_type_string:
1739	  {
1740	    lex_pos_ty pos;
1741	    pos.file_name = logical_file_name;
1742	    pos.line_number = token.line_number;
1743
1744	    xgettext_current_source_encoding = po_charset_utf8;
1745	    if (extract_all)
1746	      remember_a_message (mlp, NULL, token.string, inner_context,
1747				  &pos, token.comment);
1748	    else
1749	      arglist_parser_remember (argparser, arg, token.string,
1750				       inner_context,
1751				       pos.file_name, pos.line_number,
1752				       token.comment);
1753	    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1754	  }
1755	  drop_reference (token.comment);
1756	  next_context_iter = null_context_list_iterator;
1757	  state = 0;
1758	  continue;
1759
1760	case token_type_eof:
1761	  xgettext_current_source_encoding = po_charset_utf8;
1762	  arglist_parser_done (argparser, arg);
1763	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1764	  return true;
1765
1766	case token_type_other:
1767	  next_context_iter = null_context_list_iterator;
1768	  state = 0;
1769	  continue;
1770
1771	default:
1772	  abort ();
1773	}
1774    }
1775}
1776
1777
1778void
1779extract_python (FILE *f,
1780		const char *real_filename, const char *logical_filename,
1781		flag_context_list_table_ty *flag_table,
1782		msgdomain_list_ty *mdlp)
1783{
1784  message_list_ty *mlp = mdlp->item[0]->messages;
1785
1786  fp = f;
1787  real_file_name = real_filename;
1788  logical_file_name = xstrdup (logical_filename);
1789  line_number = 1;
1790
1791  last_comment_line = -1;
1792  last_non_comment_line = -1;
1793
1794  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1795#if HAVE_ICONV
1796  xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1797#endif
1798
1799  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1800#if HAVE_ICONV
1801  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1802#endif
1803
1804  continuation_or_nonblank_line = false;
1805
1806  open_pbb = 0;
1807
1808  flag_context_list_table = flag_table;
1809
1810  init_keywords ();
1811
1812  /* Eat tokens until eof is seen.  When extract_parenthesized returns
1813     due to an unbalanced closing parenthesis, just restart it.  */
1814  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1815				 arglist_parser_alloc (mlp, NULL)))
1816    ;
1817
1818  fp = NULL;
1819  real_file_name = NULL;
1820  logical_file_name = NULL;
1821  line_number = 0;
1822}
1823