1/* xgettext C# backend.
2   Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#ifdef HAVE_CONFIG_H
19# include "config.h"
20#endif
21
22/* Specification.  */
23#include "x-csharp.h"
24
25#include <errno.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "message.h"
32#include "xgettext.h"
33#include "x-csharp.h"
34#include "c-ctype.h"
35#include "error.h"
36#include "error-progname.h"
37#include "xalloc.h"
38#include "xerror.h"
39#include "xvasprintf.h"
40#include "hash.h"
41#include "po-charset.h"
42#include "unistr.h"
43#include "gettext.h"
44
/* Translate the message S through gettext.  */
#define _(s) gettext(s)

/* Number of elements of the array A.
   The argument is fully parenthesized, so that any expression of array
   type can be passed.  A must be a real array, not a pointer.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48
49
50/* The C# syntax is defined in ECMA-334, second edition.  */
51
52
53/* ====================== Keyword set customization.  ====================== */
54
/* If true extract all strings.  Set by x_csharp_extract_all ().  */
static bool extract_all = false;

/* Hash table of the keyword call shapes registered through
   x_csharp_keyword (), which also initializes it on first use.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be added by
   init_keywords ().  Cleared by x_csharp_keyword (NULL) and by
   init_keywords () once it has added them.  */
static bool default_keywords = true;
60
61
62void
63x_csharp_extract_all ()
64{
65  extract_all = true;
66}
67
68
69/* Processes a --keyword option.
70   Non-ASCII function names can be used if given in UTF-8 encoding.  */
71void
72x_csharp_keyword (const char *name)
73{
74  if (name == NULL)
75    default_keywords = false;
76  else
77    {
78      const char *end;
79      struct callshape shape;
80      const char *colon;
81
82      if (keywords.table == NULL)
83	hash_init (&keywords, 100);
84
85      split_keywordspec (name, &end, &shape);
86
87      /* The characters between name and end should form a valid C#
88	 identifier sequence with dots.
89	 A colon means an invalid parse in split_keywordspec().  */
90      colon = strchr (name, ':');
91      if (colon == NULL || colon >= end)
92	insert_keyword_callshape (&keywords, name, end - name, &shape);
93    }
94}
95
96/* Finish initializing the keywords hash table.
97   Called after argument processing, before each file is processed.  */
98static void
99init_keywords ()
100{
101  if (default_keywords)
102    {
103      /* When adding new keywords here, also update the documentation in
104	 xgettext.texi!  */
105      x_csharp_keyword ("GetString");	/* Resource{Manager,Set}.GetString */
106      x_csharp_keyword ("GetPluralString:1,2");	/* GettextResource{Manager,Set}.GetPluralString */
107      x_csharp_keyword ("GetParticularString:1c,2"); /* Resource{Manager,Set}.GetParticularString */
108      x_csharp_keyword ("GetParticularPluralString:1c,2,3"); /* Resource{Manager,Set}.GetParticularPluralString */
109      default_keywords = false;
110    }
111}
112
/* Record the built-in format string flags of the C# backend by passing each
   "function:argument-number:flag" specification to
   xgettext_record_flag (), in the order listed.  */
void
init_flag_table_csharp ()
{
  static const char *const flag_specs[] =
    {
      "GetString:1:pass-csharp-format",
      "GetPluralString:1:pass-csharp-format",
      "GetPluralString:2:pass-csharp-format",
      "GetParticularString:2:pass-csharp-format",
      "GetParticularPluralString:2:pass-csharp-format",
      "GetParticularPluralString:3:pass-csharp-format",
      "String.Format:1:csharp-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
124
125
126/* ======================== Reading of characters.  ======================== */
127
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is maintained by phase 1: phase1_getc increments it for
   every newline byte it returns, and phase1_ungetc decrements it when a
   newline byte is pushed back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream, read byte by byte through phase1_getc.  */
static FILE *fp;
137
138
139/* Phase 1: line_number handling.  */
140
/* Maximum number of bytes that can be pushed back in phase 1; roughly a
   safer MB_LEN_MAX.  The iconv based decoder of phase 2 uses the same
   value as the size of its input byte buffer.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes: phase1_ungetc pushes, phase1_getc pops the
   most recently pushed byte first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently held in phase1_pushback.  */
static int phase1_pushback_length;
145
146/* Read the next single byte from the input file.  */
147static int
148phase1_getc ()
149{
150  int c;
151
152  if (phase1_pushback_length)
153    {
154      c = phase1_pushback[--phase1_pushback_length];
155      if (c == '\n')
156	++line_number;
157      return c;
158    }
159
160  c = getc (fp);
161  if (c == EOF)
162    {
163      if (ferror (fp))
164	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
165	       real_file_name);
166      return EOF;
167    }
168
169  if (c == '\n')
170    ++line_number;
171  return c;
172}
173
174/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
175static void
176phase1_ungetc (int c)
177{
178  if (c != EOF)
179    {
180      if (c == '\n')
181	--line_number;
182      if (phase1_pushback_length == SIZEOF (phase1_pushback))
183	abort ();
184      phase1_pushback[phase1_pushback_length++] = c;
185    }
186}
187
188
189/* Phase 2: Conversion to Unicode.
190   This is done early because ECMA-334 section 9.1. says that the source is
191   "an ordered sequence of Unicode characters", and because the recognition
192   of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
193   prior conversion to Unicode.  */
194
/* End-of-file indicator for functions returning a UCS-4 character.  */
#define UEOF -1

/* Newline Unicode character.  */
#define UNL 0x000a

/* One-element pushback stack of phase 2: filled by phase2_ungetc and
   emptied by phase2_getc before any new input is decoded.  */
static int phase2_pushback[1];
static int phase2_pushback_length;
203
204/* Read the next Unicode UCS-4 character from the input file.  */
205static int
206phase2_getc ()
207{
208  if (phase2_pushback_length)
209    return phase2_pushback[--phase2_pushback_length];
210
211  if (xgettext_current_source_encoding == po_charset_ascii)
212    {
213      int c = phase1_getc ();
214      if (c == EOF)
215	return UEOF;
216      if (!c_isascii (c))
217	{
218	  char buffer[21];
219	  sprintf (buffer, ":%ld", (long) line_number);
220	  multiline_error (xstrdup (""),
221			   xasprintf (_("\
222Non-ASCII string at %s%s.\n\
223Please specify the source encoding through --from-code.\n"),
224			   real_file_name, buffer));
225	  exit (EXIT_FAILURE);
226	}
227      return c;
228    }
229  else if (xgettext_current_source_encoding != po_charset_utf8)
230    {
231#if HAVE_ICONV
232      /* Use iconv on an increasing number of bytes.  Read only as many bytes
233	 through phase1_getc as needed.  This is needed to give reasonable
234	 interactive behaviour when fp is connected to an interactive tty.  */
235      unsigned char buf[MAX_PHASE1_PUSHBACK];
236      size_t bufcount;
237      int c = phase1_getc ();
238      if (c == EOF)
239	return UEOF;
240      buf[0] = (unsigned char) c;
241      bufcount = 1;
242
243      for (;;)
244	{
245	  unsigned char scratchbuf[6];
246	  const char *inptr = (const char *) &buf[0];
247	  size_t insize = bufcount;
248	  char *outptr = (char *) &scratchbuf[0];
249	  size_t outsize = sizeof (scratchbuf);
250
251	  size_t res = iconv (xgettext_current_source_iconv,
252			      (ICONV_CONST char **) &inptr, &insize,
253			      &outptr, &outsize);
254	  /* We expect that a character has been produced if and only if
255	     some input bytes have been consumed.  */
256	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
257	    abort ();
258	  if (outsize == sizeof (scratchbuf))
259	    {
260	      /* No character has been produced.  Must be an error.  */
261	      if (res != (size_t)(-1))
262		abort ();
263
264	      if (errno == EILSEQ)
265		{
266		  /* An invalid multibyte sequence was encountered.  */
267		  multiline_error (xstrdup (""),
268				   xasprintf (_("\
269%s:%d: Invalid multibyte sequence.\n\
270Please specify the correct source encoding through --from-code.\n"),
271				   real_file_name, line_number));
272		  exit (EXIT_FAILURE);
273		}
274	      else if (errno == EINVAL)
275		{
276		  /* An incomplete multibyte character.  */
277		  int c;
278
279		  if (bufcount == MAX_PHASE1_PUSHBACK)
280		    {
281		      /* An overlong incomplete multibyte sequence was
282			 encountered.  */
283		      multiline_error (xstrdup (""),
284				       xasprintf (_("\
285%s:%d: Long incomplete multibyte sequence.\n\
286Please specify the correct source encoding through --from-code.\n"),
287				       real_file_name, line_number));
288		      exit (EXIT_FAILURE);
289		    }
290
291		  /* Read one more byte and retry iconv.  */
292		  c = phase1_getc ();
293		  if (c == EOF)
294		    {
295		      multiline_error (xstrdup (""),
296				       xasprintf (_("\
297%s:%d: Incomplete multibyte sequence at end of file.\n\
298Please specify the correct source encoding through --from-code.\n"),
299				       real_file_name, line_number));
300		      exit (EXIT_FAILURE);
301		    }
302		  if (c == '\n')
303		    {
304		      multiline_error (xstrdup (""),
305				       xasprintf (_("\
306%s:%d: Incomplete multibyte sequence at end of line.\n\
307Please specify the correct source encoding through --from-code.\n"),
308				       real_file_name, line_number - 1));
309		      exit (EXIT_FAILURE);
310		    }
311		  buf[bufcount++] = (unsigned char) c;
312		}
313	      else
314		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
315		       real_file_name, line_number);
316	    }
317	  else
318	    {
319	      size_t outbytes = sizeof (scratchbuf) - outsize;
320	      size_t bytes = bufcount - insize;
321	      unsigned int uc;
322
323	      /* We expect that one character has been produced.  */
324	      if (bytes == 0)
325		abort ();
326	      if (outbytes == 0)
327		abort ();
328	      /* Push back the unused bytes.  */
329	      while (insize > 0)
330		phase1_ungetc (buf[--insize]);
331	      /* Convert the character from UTF-8 to UCS-4.  */
332	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
333		{
334		  /* scratchbuf contains an out-of-range Unicode character
335		     (> 0x10ffff).  */
336		  multiline_error (xstrdup (""),
337				   xasprintf (_("\
338%s:%d: Invalid multibyte sequence.\n\
339Please specify the source encoding through --from-code.\n"),
340				   real_file_name, line_number));
341		  exit (EXIT_FAILURE);
342		}
343	      return uc;
344	    }
345	}
346#else
347      /* If we don't have iconv(), the only supported values for
348	 xgettext_global_source_encoding and thus also for
349	 xgettext_current_source_encoding are ASCII and UTF-8.  */
350      abort ();
351#endif
352    }
353  else
354    {
355      /* Read an UTF-8 encoded character.  */
356      unsigned char buf[6];
357      unsigned int count;
358      int c;
359      unsigned int uc;
360
361      c = phase1_getc ();
362      if (c == EOF)
363	return UEOF;
364      buf[0] = c;
365      count = 1;
366
367      if (buf[0] >= 0xc0)
368	{
369	  c = phase1_getc ();
370	  if (c == EOF)
371	    return UEOF;
372	  buf[1] = c;
373	  count = 2;
374	}
375
376      if (buf[0] >= 0xe0
377	  && ((buf[1] ^ 0x80) < 0x40))
378	{
379	  c = phase1_getc ();
380	  if (c == EOF)
381	    return UEOF;
382	  buf[2] = c;
383	  count = 3;
384	}
385
386      if (buf[0] >= 0xf0
387	  && ((buf[1] ^ 0x80) < 0x40)
388	  && ((buf[2] ^ 0x80) < 0x40))
389	{
390	  c = phase1_getc ();
391	  if (c == EOF)
392	    return UEOF;
393	  buf[3] = c;
394	  count = 4;
395	}
396
397      if (buf[0] >= 0xf8
398	  && ((buf[1] ^ 0x80) < 0x40)
399	  && ((buf[2] ^ 0x80) < 0x40)
400	  && ((buf[3] ^ 0x80) < 0x40))
401	{
402	  c = phase1_getc ();
403	  if (c == EOF)
404	    return UEOF;
405	  buf[4] = c;
406	  count = 5;
407	}
408
409      if (buf[0] >= 0xfc
410	  && ((buf[1] ^ 0x80) < 0x40)
411	  && ((buf[2] ^ 0x80) < 0x40)
412	  && ((buf[3] ^ 0x80) < 0x40)
413	  && ((buf[4] ^ 0x80) < 0x40))
414	{
415	  c = phase1_getc ();
416	  if (c == EOF)
417	    return UEOF;
418	  buf[5] = c;
419	  count = 6;
420	}
421
422      u8_mbtouc (&uc, buf, count);
423      return uc;
424    }
425}
426
427/* Supports only one pushback character.  */
428static void
429phase2_ungetc (int c)
430{
431  if (c != UEOF)
432    {
433      if (phase2_pushback_length == SIZEOF (phase2_pushback))
434	abort ();
435      phase2_pushback[phase2_pushback_length++] = c;
436    }
437}
438
439
440/* Phase 3: Convert all line terminators to LF.
441   See ECMA-334 section 9.3.1.  */
442
/* Line number defined in terms of phase3: phase3_getc increments it for
   every line terminator it returns, and phase3_ungetc decrements it when a
   line terminator is pushed back.  */
static int logical_line_number;

/* Pushback stack of phase 3; phase3_ungetc aborts when it is full.  */
static int phase3_pushback[9];
static int phase3_pushback_length;
448
449/* Read the next Unicode UCS-4 character from the input file, mapping
450   all line terminators to U+000A, and dropping U+001A at the end of file.  */
451static int
452phase3_getc ()
453{
454  int c;
455
456  if (phase3_pushback_length)
457    {
458      c = phase3_pushback[--phase3_pushback_length];
459      if (c == UNL)
460	++logical_line_number;
461      return c;
462    }
463
464  c = phase2_getc ();
465
466  if (c == 0x000d)
467    {
468      int c1 = phase2_getc ();
469
470      if (c1 != UEOF && c1 != 0x000a)
471	phase2_ungetc (c1);
472
473      /* Seen line terminator CR or CR/LF.  */
474      ++logical_line_number;
475      return UNL;
476    }
477
478  if (c == 0x0085 || c == 0x2028 || c == 0x2029)
479    {
480      /* Seen Unicode word processor newline.  */
481      ++logical_line_number;
482      return UNL;
483    }
484
485  if (c == 0x001a)
486    {
487      int c1 = phase2_getc ();
488
489      if (c1 == UEOF)
490	/* Seen U+001A right before the end of file.  */
491	return UEOF;
492
493      phase2_ungetc (c1);
494    }
495
496  if (c == UNL)
497    ++logical_line_number;
498  return c;
499}
500
501/* Supports 9 characters of pushback.  */
502static void
503phase3_ungetc (int c)
504{
505  if (c != UEOF)
506    {
507      if (c == UNL)
508	--logical_line_number;
509      if (phase3_pushback_length == SIZEOF (phase3_pushback))
510	abort ();
511      phase3_pushback[phase3_pushback_length++] = c;
512    }
513}
514
515
516/* ========================= Accumulating strings.  ======================== */
517
518/* A string buffer type that allows appending Unicode characters.
519   Returns the entire string in UTF-8 encoding.  */
520
struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.
     Storage is obtained through xrealloc; NULL in a freshly initialized
     buffer.  */
  char *utf8_buffer;
  /* Number of bytes of utf8_buffer in use.  The terminating NUL stored by
     string_buffer_result is not counted.  */
  size_t utf8_buflen;
  /* Number of bytes allocated for utf8_buffer.  */
  size_t utf8_allocated;
};
528
529/* Initialize a 'struct string_buffer' to empty.  */
530static inline void
531init_string_buffer (struct string_buffer *bp)
532{
533  bp->utf8_buffer = NULL;
534  bp->utf8_buflen = 0;
535  bp->utf8_allocated = 0;
536}
537
538/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
539static inline void
540string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
541{
542  if (bp->utf8_buflen + count > bp->utf8_allocated)
543    {
544      size_t new_allocated = 2 * bp->utf8_allocated + 10;
545      if (new_allocated < bp->utf8_buflen + count)
546	new_allocated = bp->utf8_buflen + count;
547      bp->utf8_allocated = new_allocated;
548      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
549    }
550}
551
552/* Auxiliary function: Append a Unicode character to bp->utf8.
553   uc must be < 0x110000.  */
554static inline void
555string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
556{
557  unsigned char utf8buf[6];
558  int count = u8_uctomb (utf8buf, uc, 6);
559
560  if (count < 0)
561    /* The caller should have ensured that uc is not out-of-range.  */
562    abort ();
563
564  string_buffer_append_unicode_grow (bp, count);
565  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
566  bp->utf8_buflen += count;
567}
568
569/* Return the string buffer's contents.  */
570static char *
571string_buffer_result (struct string_buffer *bp)
572{
573  /* NUL-terminate it.  */
574  string_buffer_append_unicode_grow (bp, 1);
575  bp->utf8_buffer[bp->utf8_buflen] = '\0';
576  /* Return it.  */
577  return bp->utf8_buffer;
578}
579
580/* Free the memory pointed to by a 'struct string_buffer'.  */
581static inline void
582free_string_buffer (struct string_buffer *bp)
583{
584  free (bp->utf8_buffer);
585}
586
587
588/* ======================== Accumulating comments.  ======================== */
589
590
/* Accumulating a single comment line.  */

/* Buffer holding the text of the comment line being accumulated.  As a
   static object it starts out zeroed, which is the same empty state that
   init_string_buffer sets up.  */
static struct string_buffer comment_buffer;
594
595static inline void
596comment_start ()
597{
598  comment_buffer.utf8_buflen = 0;
599}
600
601static inline bool
602comment_at_start ()
603{
604  return (comment_buffer.utf8_buflen == 0);
605}
606
607static inline void
608comment_add (int c)
609{
610  string_buffer_append_unicode (&comment_buffer, c);
611}
612
613static inline void
614comment_line_end (size_t chars_to_remove)
615{
616  char *buffer = string_buffer_result (&comment_buffer);
617  size_t buflen = strlen (buffer);
618
619  buflen -= chars_to_remove;
620  while (buflen >= 1
621	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
622    --buflen;
623  buffer[buflen] = '\0';
624  savable_comment_add (buffer);
625}
626
627
/* These are for tracking whether comments count as immediately before
   keyword.  phase4_getc sets last_comment_line to the value of
   logical_line_number whenever it consumes a comment.  */
static int last_comment_line;
static int last_non_comment_line;
632
633
634/* Phase 4: Replace each comment that is not inside a character constant or
635   string literal with a space or newline character.
636   See ECMA-334 section 9.3.2.  */
637
638static int
639phase4_getc ()
640{
641  int c0;
642  int c;
643  bool last_was_star;
644
645  c0 = phase3_getc ();
646  if (c0 != '/')
647    return c0;
648  c = phase3_getc ();
649  switch (c)
650    {
651    default:
652      phase3_ungetc (c);
653      return c0;
654
655    case '*':
656      /* C style comment.  */
657      comment_start ();
658      last_was_star = false;
659      for (;;)
660	{
661	  c = phase3_getc ();
662	  if (c == UEOF)
663	    break;
664	  /* We skip all leading white space, but not EOLs.  */
665	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
666	    comment_add (c);
667	  switch (c)
668	    {
669	    case UNL:
670	      comment_line_end (1);
671	      comment_start ();
672	      last_was_star = false;
673	      continue;
674
675	    case '*':
676	      last_was_star = true;
677	      continue;
678
679	    case '/':
680	      if (last_was_star)
681		{
682		  comment_line_end (2);
683		  break;
684		}
685	      /* FALLTHROUGH */
686
687	    default:
688	      last_was_star = false;
689	      continue;
690	    }
691	  break;
692	}
693      last_comment_line = logical_line_number;
694      return ' ';
695
696    case '/':
697      /* C++ style comment.  */
698      last_comment_line = logical_line_number;
699      comment_start ();
700      for (;;)
701	{
702	  c = phase3_getc ();
703	  if (c == UNL || c == UEOF)
704	    break;
705	  /* We skip all leading white space, but not EOLs.  */
706	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
707	    comment_add (c);
708	}
709      phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
710      comment_line_end (0);
711      phase3_getc (); /* read the newline again */
712      return UNL;
713    }
714}
715
/* Push the character C back onto the phase 4 input by handing it to
   phase3_ungetc ().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
722
723
724/* ======================= Character classification.  ====================== */
725
726
/* Return true if a given character is white space.
   See ECMA-334 section 9.3.3.
   White space is the Unicode character class Zs, as of Unicode 4.0
   (grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt).  Every other value,
   negative ones included, yields false.  */
static bool
is_whitespace (int c)
{
  /* The one contiguous run of the class: U+2000..U+200B.  */
  if (c >= 0x2000 && c <= 0x200b)
    return true;

  /* The isolated members of the class.  */
  switch (c)
    {
    case 0x0020:
    case 0x00a0:
    case 0x1680:
    case 0x180e:
    case 0x202f:
    case 0x205f:
    case 0x3000:
      return true;
    default:
      return false;
    }
}
750
751
752/* C# allows identifiers containing many Unicode characters.  We recognize
753   them; to use an identifier with Unicode characters in a --keyword option,
754   it must be specified in UTF-8.  */
755
/* Test whether the character UC is a member of the set described by TABLE.
   TABLE is read as an array of int cells forming a three-level bitmap:
   cell 0 holds the number of level 1 entries; the level 1 entry for
   uc >> 16 is the cell index of a level 2 block; the entry (uc >> 9) & 0x7f
   of that block is the cell index of a level 3 block; the word
   (uc >> 5) & 0xf of that block carries the membership bit at position
   uc & 0x1f.  A negative entry at level 1 or 2 means that no character of
   the range is a member.
   Returns 1 if UC is a member, 0 otherwise.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *cells = (const int *) table;
  unsigned int plane = uc >> 16;
  int level2_base;
  int level3_base;
  unsigned int word;

  if (plane >= (unsigned int) cells[0])
    return 0;

  level2_base = cells[1 + plane];
  if (level2_base < 0)
    return 0;

  level3_base = cells[level2_base + ((uc >> 9) & 0x7f)];
  if (level3_base < 0)
    return 0;

  word = cells[level3_base + ((uc >> 5) & 0xf)];
  return (word >> (uc & 0x1f)) & 1;
}
778
/* Set of the characters that may start a C# identifier: Unicode character
   classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0, plus the underscore.
   The members lie back to back and are read by bitmap_lookup () as one
   array of int cells:
     header[0]  number of entries in level1,
     level1     indexed by uc >> 16; cell index of a level2 block, or -1,
     level2     indexed by (uc >> 9) & 0x7f inside a block; cell index of a
                level3 block, or -1,
     level3     indexed by (uc >> 5) & 0xf inside a block; bit uc & 0x1f of
                the word tells whether uc belongs to the set.
   All cell indices count from the start of the structure.  */
static const
struct
  {
    int header[1];
    int level1[3];
    int level2[3 << 7];
    /* Bit words; bitmap_lookup () treats them as unsigned.  */
    int level3[34 << 4];
  }
table_identifier_start =
{
  .header = { 3 },
  .level1 = {     4,   132,   260 },
  .level2 =
  {
      388,   404,   420,   436,   452,   468,   484,   500,
      516,   532,   548,   564,   580,    -1,   596,   612,
      628,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      644,    -1,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   676,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   692,
      660,   660,   708,    -1,    -1,    -1,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   724,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   740,   756,   772,   788,
      804,   820,   836,    -1,   852,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,   868,   884,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   900,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   660,   916,    -1,    -1
  },
  .level3 =
  {
    0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0x00000000, 0x00000000, 0x00000000, 0x04000000,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
    0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
    0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
    0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
    0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
    0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
    0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
    0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
    0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
    0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
    0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
    0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
    0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
    0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
    0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
    0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
    0x00000F00, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
    0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x00000000, 0x00000000, 0x00000000, 0x80020000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
    0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000
  }
};
982
/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
   as of Unicode 4.0.
   This is the table that is_identifier_part hands to bitmap_lookup.
   The entries of level1 and level2 look like offsets counted in 'int'
   units from the start of the struct: 16 = 1 + 15 is where level2 begins
   and 528 = 16 + (4 << 7) is where level3 begins; -1 appears to mark an
   absent sub-table.  level3 holds 36 groups of 16 bit-mask words.
   NOTE(review): confirm the exact indexing against bitmap_lookup.  */
static const
struct
  {
    int header[1];
    int level1[15];
    int level2[4 << 7];
    /*unsigned*/ int level3[36 << 4];
  }
table_identifier_part =
{
  /* header: equals the number of level1 entries.  */
  { 15 },
  /* level1: only four of the fifteen slots refer to a level2 group.  */
  {
       16,   144,   272,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,   400
  },
  /* level2: four groups of 128 entries; many entries share the offset 800.  */
  {
      528,   544,   560,   576,   592,   608,   624,   640,
      656,   672,   688,   704,   720,    -1,   736,   752,
      768,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      784,    -1,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   816,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   832,
      800,   800,   848,    -1,    -1,    -1,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   864,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   880,   896,   912,   928,
      944,   960,   976,    -1,   992,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
     1008,    -1,  1024,  1040,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,  1056,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   800,  1072,    -1,    -1,
     1088,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1
  },
  /* level3: the bit masks themselves, four words per line.  */
  {
    0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
    0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
    0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
    0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
    0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
    0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
    0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
    0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
    0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
    0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
    0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
    0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
    0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
    0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
    0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
    0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
    0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
    0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
    0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
    0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
    0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
    0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
  }
};
1213
1214/* Return true if a given character can occur as first character of an
1215   identifier.  See ECMA-334 section 9.4.2.  */
1216static bool
1217is_identifier_start (int c)
1218{
1219  return bitmap_lookup (&table_identifier_start, c);
1220  /* In ASCII only this would be:
1221     return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1222   */
1223}
1224
1225/* Return true if a given character can occur as character of an identifier.
1226   See ECMA-334 section 9.4.2.  */
1227static bool
1228is_identifier_part (int c)
1229{
1230  return bitmap_lookup (&table_identifier_part, c);
1231  /* In ASCII only this would be:
1232     return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1233             || (c >= '0' && c <= '9') || c == '_');
1234   */
1235}
1236
/* Predicate that accepts every character.  It is passed to
   do_getc_unicode_escaped when no restriction on the escaped character
   applies; the argument C is deliberately not looked at.  */
static bool
is_any_character (int c)
{
  (void) c;
  return true;
}
1242
1243
1244/* ======================= Preprocessor directives.  ======================= */
1245
1246
1247/* Phase 5: Remove preprocessor lines.  See ECMA-334 section 9.5.
1248   As a side effect, this also removes initial whitespace on every line;
1249   this whitespace doesn't matter.  */
1250
1251static int phase5_pushback[10];
1252static int phase5_pushback_length;
1253
1254static int
1255phase5_getc ()
1256{
1257  int c;
1258
1259  if (phase5_pushback_length)
1260    return phase5_pushback[--phase5_pushback_length];
1261
1262  c = phase4_getc ();
1263  if (c != UNL)
1264    return c;
1265
1266  do
1267    c = phase3_getc ();
1268  while (c != UEOF && is_whitespace (c));
1269
1270  if (c == '#')
1271    {
1272      /* Ignore the entire line containing the preprocessor directive
1273	 (including the // comment if it contains one).  */
1274      do
1275	c = phase3_getc ();
1276      while (c != UEOF && c != UNL);
1277      return c;
1278    }
1279  else
1280    {
1281      phase3_ungetc (c);
1282      return UNL;
1283    }
1284}
1285
#ifdef unused
/* Store C so that the next phase5_getc call returns it.  UEOF is never
   stored.  Aborts when all SIZEOF (phase5_pushback) slots are in use.  */
static void
phase5_ungetc (int c)
{
  if (c == UEOF)
    return;

  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();

  phase5_pushback[phase5_pushback_length] = c;
  phase5_pushback_length++;
}
#endif
1298
1299
1300/* ========================== Reading of tokens.  ========================== */
1301
/* Kinds of tokens produced by the tokenizer (phase 6).  */
typedef enum token_type_ty
{
  /* End of input.  */
  token_type_eof,
  /* ( */
  token_type_lparen,
  /* ) */
  token_type_rparen,
  /* { */
  token_type_lbrace,
  /* } */
  token_type_rbrace,
  /* , */
  token_type_comma,
  /* . */
  token_type_dot,
  /* "abc", @"abc" */
  token_type_string_literal,
  /* 1.23 */
  token_type_number,
  /* identifier, keyword, null */
  token_type_symbol,
  /* + */
  token_type_plus,
  /* character literal, misc. operator */
  token_type_other
} token_type_ty;
1318
1319typedef struct token_ty token_ty;
1320struct token_ty
1321{
1322  token_type_ty type;
1323  char *string;		/* for token_type_string_literal, token_type_symbol */
1324  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
1325  int line_number;
1326  int logical_line_number;
1327};
1328
1329
1330/* Free the memory pointed to by a 'struct token_ty'.  */
1331static inline void
1332free_token (token_ty *tp)
1333{
1334  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
1335    free (tp->string);
1336  if (tp->type == token_type_string_literal)
1337    drop_reference (tp->comment);
1338}
1339
1340
1341/* Read a Unicode escape sequence outside string/character literals.
1342   Reject Unicode escapes that don't fulfill the given predicate.
1343   See ECMA-334 section 9.4.2.  */
1344static int
1345do_getc_unicode_escaped (bool (*predicate) (int))
1346{
1347  int c;
1348
1349  /* Use phase 3, because phase 4 elides comments.  */
1350  c = phase3_getc ();
1351  if (c == UEOF)
1352    return '\\';
1353  if (c == 'u' || c == 'U')
1354    {
1355      unsigned char buf[8];
1356      int expect;
1357      unsigned int n;
1358      int i;
1359
1360      expect = (c == 'U' ? 8 : 4);
1361      n = 0;
1362      for (i = 0; i < expect; i++)
1363	{
1364	  int c1 = phase3_getc ();
1365
1366	  if (c1 >= '0' && c1 <= '9')
1367	    n = (n << 4) + (c1 - '0');
1368	  else if (c1 >= 'A' && c1 <= 'F')
1369	    n = (n << 4) + (c1 - 'A' + 10);
1370	  else if (c1 >= 'a' && c1 <= 'f')
1371	    n = (n << 4) + (c1 - 'a' + 10);
1372	  else
1373	    {
1374	      phase3_ungetc (c1);
1375	      while (--i >= 0)
1376		phase3_ungetc (buf[i]);
1377	      phase3_ungetc (c);
1378	      return '\\';
1379	    }
1380
1381	  buf[i] = c1;
1382	}
1383
1384      if (n >= 0x110000)
1385	{
1386	  error_with_progname = false;
1387	  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1388		 logical_file_name, line_number);
1389	  error_with_progname = true;
1390	}
1391      else if (predicate (n))
1392	return n;
1393
1394      while (--i >= 0)
1395	phase3_ungetc (buf[i]);
1396    }
1397  phase3_ungetc (c);
1398  return '\\';
1399}
1400
1401
1402/* Read an escape sequence inside a string literal or character literal.
1403   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1404static int
1405do_getc_escaped ()
1406{
1407  int c;
1408  int n;
1409  int i;
1410
1411  /* Use phase 3, because phase 4 elides comments.  */
1412  c = phase3_getc ();
1413  if (c == UEOF)
1414    return '\\';
1415  switch (c)
1416    {
1417    case 'a':
1418      return 0x0007;
1419    case 'b':
1420      return 0x0008;
1421    case 't':
1422      return 0x0009;
1423    case 'n':
1424      return 0x000a;
1425    case 'v':
1426      return 0x000b;
1427    case 'f':
1428      return 0x000c;
1429    case 'r':
1430      return 0x000d;
1431    case '"':
1432      return '"';
1433    case '\'':
1434      return '\'';
1435    case '\\':
1436      return '\\';
1437    case '0':
1438      return 0x0000;
1439    case 'x':
1440      c = phase3_getc ();
1441      switch (c)
1442	{
1443	default:
1444	  phase3_ungetc (c);
1445	  phase3_ungetc ('x');
1446	  return '\\';
1447
1448	case '0': case '1': case '2': case '3': case '4':
1449	case '5': case '6': case '7': case '8': case '9':
1450	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1451	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1452	  break;
1453	}
1454      n = 0;
1455      for (i = 0;; i++)
1456	{
1457	  switch (c)
1458	    {
1459	    default:
1460	      phase3_ungetc (c);
1461	      return n;
1462	    case '0': case '1': case '2': case '3': case '4':
1463	    case '5': case '6': case '7': case '8': case '9':
1464	      n = n * 16 + c - '0';
1465	      break;
1466	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1467	      n = n * 16 + 10 + c - 'A';
1468	      break;
1469	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1470	      n = n * 16 + 10 + c - 'a';
1471	      break;
1472	    }
1473	  if (i == 3)
1474	    break;
1475	  c = phase3_getc ();
1476	}
1477      return n;
1478    case 'u': case 'U':
1479      phase3_ungetc (c);
1480      return do_getc_unicode_escaped (is_any_character);
1481    default:
1482      /* Invalid escape sequence.  */
1483      phase3_ungetc (c);
1484      return '\\';
1485    }
1486}
1487
1488/* Read a regular string literal or character literal.
1489   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1490static void
1491accumulate_escaped (struct string_buffer *literal, int delimiter)
1492{
1493  int c;
1494
1495  for (;;)
1496    {
1497      /* Use phase 3, because phase 4 elides comments.  */
1498      c = phase3_getc ();
1499      if (c == UEOF || c == delimiter)
1500	break;
1501      if (c == UNL)
1502	{
1503	  phase3_ungetc (c);
1504	  error_with_progname = false;
1505	  if (delimiter == '\'')
1506	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
1507		   logical_file_name, line_number);
1508	  else
1509	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
1510		   logical_file_name, line_number);
1511	  error_with_progname = true;
1512	  break;
1513	}
1514      if (c == '\\')
1515	c = do_getc_escaped ();
1516      string_buffer_append_unicode (literal, c);
1517    }
1518}
1519
1520
/* Combine characters into tokens.  Discard whitespace.  */

/* Maximum used guaranteed to be < 4.  */
static token_ty phase6_pushback[4];
static int phase6_pushback_length;

/* Store the next token into *TP.
   Tokens saved by phase6_unget are returned first, most recently saved
   first.  Otherwise characters are read from phase 5 and combined.
   For string literal and symbol tokens, tp->string is a fresh xstrdup'ed
   copy; for string literals, tp->comment additionally holds a reference
   to savable_comment.  For all other freshly read tokens tp->string is
   NULL and tp->comment is left unset.  */
static void
phase6_get (token_ty *tp)
{
  int c;

  if (phase6_pushback_length)
    {
      *tp = phase6_pushback[--phase6_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      /* Record the position before reading, so that the token carries
	 the line on which it starts.  */
      tp->line_number = line_number;
      tp->logical_line_number = logical_line_number;
      c = phase5_getc ();

      if (c == UEOF)
	{
	  tp->type = token_type_eof;
	  return;
	}

      switch (c)
	{
	case UNL:
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* FALLTHROUGH */
	case ' ':
	case '\t':
	case '\f':
	  /* Ignore whitespace and comments.  */
	  continue;
	}

      last_non_comment_line = tp->logical_line_number;

      switch (c)
	{
	case '(':
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  tp->type = token_type_rparen;
	  return;

	case '{':
	  tp->type = token_type_lbrace;
	  return;

	case '}':
	  tp->type = token_type_rbrace;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '.':
	  /* A dot followed by a digit starts a number; otherwise it is a
	     token of its own.  */
	  c = phase4_getc ();
	  if (!(c >= '0' && c <= '9'))
	    {
	      phase4_ungetc (c);
	      tp->type = token_type_dot;
	      return;
	    }
	  /* FALLTHROUGH */

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	  {
	    /* Don't need to verify the complicated syntax of integers and
	       floating-point numbers.  We assume a valid C# input.
	       The simplified syntax that we recognize as number is: any
	       sequence of alphanumeric characters, additionally '+' and '-'
	       immediately after 'e' or 'E' except in hexadecimal numbers.  */
	    bool hexadecimal = false;

	    for (;;)
	      {
		c = phase4_getc ();
		if (c >= '0' && c <= '9')
		  continue;
		if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
		  {
		    if (c == 'X' || c == 'x')
		      hexadecimal = true;
		    if ((c == 'E' || c == 'e') && !hexadecimal)
		      {
			/* Swallow the sign of an exponent, if present.  */
			c = phase4_getc ();
			if (!(c == '+' || c == '-'))
			  phase4_ungetc (c);
		      }
		    continue;
		  }
		if (c == '.')
		  continue;
		break;
	      }
	    /* C is the first character that is not part of the number.  */
	    phase4_ungetc (c);
	    tp->type = token_type_number;
	    return;
	  }

	case '"':
	  /* Regular string literal.  */
	  {
	    struct string_buffer literal;

	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '"');
	    tp->string = xstrdup (string_buffer_result (&literal));
	    free_string_buffer (&literal);
	    tp->comment = add_reference (savable_comment);
	    tp->type = token_type_string_literal;
	    return;
	  }

	case '\'':
	  /* Character literal.  */
	  {
	    struct string_buffer literal;

	    /* The contents are read only to find the end of the literal;
	       they are discarded.  */
	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '\'');
	    free_string_buffer (&literal);
	    tp->type = token_type_other;
	    return;
	  }

	case '+':
	  c = phase4_getc ();
	  if (c == '+')
	    /* Operator ++ */
	    tp->type = token_type_other;
	  else if (c == '=')
	    /* Operator += */
	    tp->type = token_type_other;
	  else
	    {
	      /* Operator + */
	      phase4_ungetc (c);
	      tp->type = token_type_plus;
	    }
	  return;

	case '@':
	  c = phase4_getc ();
	  if (c == '"')
	    {
	      /* Verbatim string literal.  */
	      struct string_buffer literal;

	      init_string_buffer (&literal);
	      for (;;)
		{
		  /* Use phase 2, because phase 4 elides comments and phase 3
		     mixes up the newline characters.  */
		  c = phase2_getc ();
		  if (c == UEOF)
		    break;
		  if (c == '"')
		    {
		      /* A doubled quote stands for one quote character; a
			 single quote ends the literal.  */
		      c = phase2_getc ();
		      if (c != '"')
			{
			  phase2_ungetc (c);
			  break;
			}
		    }
		  /* No special treatment of newline and backslash here.  */
		  string_buffer_append_unicode (&literal, c);
		}
	      tp->string = xstrdup (string_buffer_result (&literal));
	      free_string_buffer (&literal);
	      tp->comment = add_reference (savable_comment);
	      tp->type = token_type_string_literal;
	      return;
	    }
	  /* FALLTHROUGH, so that @identifier is recognized.  */
	  /* At this point C is the character that followed the '@'.  */

	default:
	  if (c == '\\')
	    c = do_getc_unicode_escaped (is_identifier_start);
	  if (is_identifier_start (c))
	    {
	      /* The buffer is static and reused across calls; resetting
		 utf8_buflen restarts it for this identifier.  */
	      static struct string_buffer buffer;
	      buffer.utf8_buflen = 0;
	      for (;;)
		{
		  string_buffer_append_unicode (&buffer, c);
		  c = phase4_getc ();
		  if (c == '\\')
		    c = do_getc_unicode_escaped (is_identifier_part);
		  if (!is_identifier_part (c))
		    break;
		}
	      phase4_ungetc (c);
	      tp->string = xstrdup (string_buffer_result (&buffer));
	      tp->type = token_type_symbol;
	      return;
	    }
	  else
	    {
	      /* Misc. operator.  */
	      tp->type = token_type_other;
	      return;
	    }
	}
    }
}
1741
1742/* Supports 3 tokens of pushback.  */
1743static void
1744phase6_unget (token_ty *tp)
1745{
1746  if (tp->type != token_type_eof)
1747    {
1748      if (phase6_pushback_length == SIZEOF (phase6_pushback))
1749	abort ();
1750      phase6_pushback[phase6_pushback_length++] = *tp;
1751    }
1752}
1753
1754
1755/* Compile-time optimization of string literal concatenation.
1756   Combine "string1" + ... + "stringN" to the concatenated string if
1757     - the token after this expression is not '.' (because then the last
1758       string could be part of a method call expression).  */
1759
1760static token_ty phase7_pushback[2];
1761static int phase7_pushback_length;
1762
1763static void
1764phase7_get (token_ty *tp)
1765{
1766  if (phase7_pushback_length)
1767    {
1768      *tp = phase7_pushback[--phase7_pushback_length];
1769      return;
1770    }
1771
1772  phase6_get (tp);
1773  if (tp->type == token_type_string_literal)
1774    {
1775      char *sum = tp->string;
1776      size_t sum_len = strlen (sum);
1777
1778      for (;;)
1779	{
1780	  token_ty token2;
1781
1782	  phase6_get (&token2);
1783	  if (token2.type == token_type_plus)
1784	    {
1785	      token_ty token3;
1786
1787	      phase6_get (&token3);
1788	      if (token3.type == token_type_string_literal)
1789		{
1790		  token_ty token_after;
1791
1792		  phase6_get (&token_after);
1793		  if (token_after.type != token_type_dot)
1794		    {
1795		      char *addend = token3.string;
1796		      size_t addend_len = strlen (addend);
1797
1798		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1799		      memcpy (sum + sum_len, addend, addend_len + 1);
1800		      sum_len += addend_len;
1801
1802		      phase6_unget (&token_after);
1803		      free_token (&token3);
1804		      free_token (&token2);
1805		      continue;
1806		    }
1807		  phase6_unget (&token_after);
1808		}
1809	      phase6_unget (&token3);
1810	    }
1811	  phase6_unget (&token2);
1812	  break;
1813	}
1814      tp->string = sum;
1815    }
1816}
1817
1818/* Supports 2 tokens of pushback.  */
1819static void
1820phase7_unget (token_ty *tp)
1821{
1822  if (tp->type != token_type_eof)
1823    {
1824      if (phase7_pushback_length == SIZEOF (phase7_pushback))
1825	abort ();
1826      phase7_pushback[phase7_pushback_length++] = *tp;
1827    }
1828}
1829
1830
1831static void
1832x_csharp_lex (token_ty *tp)
1833{
1834  phase7_get (tp);
1835}
1836
1837/* Supports 2 tokens of pushback.  */
1838static void
1839x_csharp_unlex (token_ty *tp)
1840{
1841  phase7_unget (tp);
1842}
1843
1844
1845/* ========================= Extracting strings.  ========================== */
1846
1847
/* Context lookup table.  extract_parenthesized queries it through
   flag_context_list_table_lookup, using the (possibly dotted) symbol name
   and then its successively shorter dotted suffixes as key.
   NOTE(review): it is not assigned in this part of the file; presumably
   the extraction entry point sets it - confirm.  */
static flag_context_list_table_ty *flag_context_list_table;
1850
1851
1852/* The file is broken into tokens.  Scan the token stream, looking for
1853   a keyword, followed by a left paren, followed by a string.  When we
1854   see this sequence, we have something to remember.  We assume we are
   looking at a valid C# program, and leave the complaints about
1856   the grammar to the compiler.
1857
1858     Normal handling: Look for
1859       keyword ( ... msgid ... )
1860     Plural handling: Look for
1861       keyword ( ... msgid ... msgid_plural ... )
1862
1863   We use recursion because the arguments before msgid or between msgid
1864   and msgid_plural can contain subexpressions of the same form.  */
1865
1866
1867/* Extract messages until the next balanced closing parenthesis or brace,
1868   depending on TERMINATOR.
1869   Extracted messages are added to MLP.
1870   Return true upon eof, false upon closing parenthesis or brace.  */
1871static bool
1872extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1873		       flag_context_ty outer_context,
1874		       flag_context_list_iterator_ty context_iter,
1875		       struct arglist_parser *argparser)
1876{
1877  /* Current argument number.  */
1878  int arg = 1;
1879  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1880  int state;
1881  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1882  const struct callshapes *next_shapes = NULL;
1883  /* Context iterator that will be used if the next token is a '('.  */
1884  flag_context_list_iterator_ty next_context_iter =
1885    passthrough_context_list_iterator;
1886  /* Current context.  */
1887  flag_context_ty inner_context =
1888    inherited_context (outer_context,
1889		       flag_context_list_iterator_advance (&context_iter));
1890
1891  /* Start state is 0.  */
1892  state = 0;
1893
1894  for (;;)
1895    {
1896      token_ty token;
1897
1898      x_csharp_lex (&token);
1899      switch (token.type)
1900	{
1901	case token_type_symbol:
1902	  {
1903	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1904	       we can recognize static function calls like
1905	       GettextResource.gettext.  The information present for
1906	       symbolI.....symbolN has precedence over the information for
1907	       symbolJ.....symbolN with J > I.  */
1908	    char *sum = token.string;
1909	    size_t sum_len = strlen (sum);
1910	    const char *dottedname;
1911	    flag_context_list_ty *context_list;
1912
1913	    for (;;)
1914	      {
1915		token_ty token2;
1916
1917		x_csharp_lex (&token2);
1918		if (token2.type == token_type_dot)
1919		  {
1920		    token_ty token3;
1921
1922		    x_csharp_lex (&token3);
1923		    if (token3.type == token_type_symbol)
1924		      {
1925			char *addend = token3.string;
1926			size_t addend_len = strlen (addend);
1927
1928			sum =
1929			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1930			sum[sum_len] = '.';
1931			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1932			sum_len += 1 + addend_len;
1933
1934			free_token (&token3);
1935			free_token (&token2);
1936			continue;
1937		      }
1938		    x_csharp_unlex (&token3);
1939		  }
1940		x_csharp_unlex (&token2);
1941		break;
1942	      }
1943
1944	    for (dottedname = sum;;)
1945	      {
1946		void *keyword_value;
1947
1948		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1949				     &keyword_value)
1950		    == 0)
1951		  {
1952		    next_shapes = (const struct callshapes *) keyword_value;
1953		    state = 1;
1954		    break;
1955		  }
1956
1957		dottedname = strchr (dottedname, '.');
1958		if (dottedname == NULL)
1959		  {
1960		    state = 0;
1961		    break;
1962		  }
1963		dottedname++;
1964	      }
1965
1966	    for (dottedname = sum;;)
1967	      {
1968		context_list =
1969		  flag_context_list_table_lookup (
1970		    flag_context_list_table,
1971		    dottedname, strlen (dottedname));
1972		if (context_list != NULL)
1973		  break;
1974
1975		dottedname = strchr (dottedname, '.');
1976		if (dottedname == NULL)
1977		  break;
1978		dottedname++;
1979	      }
1980	    next_context_iter = flag_context_list_iterator (context_list);
1981
1982	    free (sum);
1983	    continue;
1984	  }
1985
1986	case token_type_lparen:
1987	  if (extract_parenthesized (mlp, token_type_rparen,
1988				     inner_context, next_context_iter,
1989				     arglist_parser_alloc (mlp,
1990							   state ? next_shapes : NULL)))
1991	    {
1992	      xgettext_current_source_encoding = po_charset_utf8;
1993	      arglist_parser_done (argparser, arg);
1994	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1995	      return true;
1996	    }
1997	  next_context_iter = null_context_list_iterator;
1998	  state = 0;
1999	  continue;
2000
2001	case token_type_rparen:
2002	  if (terminator == token_type_rparen)
2003	    {
2004	      xgettext_current_source_encoding = po_charset_utf8;
2005	      arglist_parser_done (argparser, arg);
2006	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2007	      return false;
2008	    }
2009	  if (terminator == token_type_rbrace)
2010	    {
2011	      error_with_progname = false;
2012	      error (0, 0,
2013		     _("%s:%d: warning: ')' found where '}' was expected"),
2014		     logical_file_name, token.line_number);
2015	      error_with_progname = true;
2016	    }
2017	  next_context_iter = null_context_list_iterator;
2018	  state = 0;
2019	  continue;
2020
2021	case token_type_lbrace:
2022	  if (extract_parenthesized (mlp, token_type_rbrace,
2023				     null_context, null_context_list_iterator,
2024				     arglist_parser_alloc (mlp, NULL)))
2025	    {
2026	      xgettext_current_source_encoding = po_charset_utf8;
2027	      arglist_parser_done (argparser, arg);
2028	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2029	      return true;
2030	    }
2031	  next_context_iter = null_context_list_iterator;
2032	  state = 0;
2033	  continue;
2034
2035	case token_type_rbrace:
2036	  if (terminator == token_type_rbrace)
2037	    {
2038	      xgettext_current_source_encoding = po_charset_utf8;
2039	      arglist_parser_done (argparser, arg);
2040	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2041	      return false;
2042	    }
2043	  if (terminator == token_type_rparen)
2044	    {
2045	      error_with_progname = false;
2046	      error (0, 0,
2047		     _("%s:%d: warning: '}' found where ')' was expected"),
2048		     logical_file_name, token.line_number);
2049	      error_with_progname = true;
2050	    }
2051	  next_context_iter = null_context_list_iterator;
2052	  state = 0;
2053	  continue;
2054
2055	case token_type_comma:
2056	  arg++;
2057	  inner_context =
2058	    inherited_context (outer_context,
2059			       flag_context_list_iterator_advance (
2060				 &context_iter));
2061	  next_context_iter = passthrough_context_list_iterator;
2062	  state = 0;
2063	  continue;
2064
2065	case token_type_string_literal:
2066	  {
2067	    lex_pos_ty pos;
2068	    pos.file_name = logical_file_name;
2069	    pos.line_number = token.line_number;
2070
2071	    xgettext_current_source_encoding = po_charset_utf8;
2072	    if (extract_all)
2073	      remember_a_message (mlp, NULL, token.string, inner_context,
2074				  &pos, token.comment);
2075	    else
2076	      arglist_parser_remember (argparser, arg, token.string,
2077				       inner_context,
2078				       pos.file_name, pos.line_number,
2079				       token.comment);
2080	    xgettext_current_source_encoding = xgettext_global_source_encoding;
2081	  }
2082	  drop_reference (token.comment);
2083	  next_context_iter = null_context_list_iterator;
2084	  state = 0;
2085	  continue;
2086
2087	case token_type_eof:
2088	  xgettext_current_source_encoding = po_charset_utf8;
2089	  arglist_parser_done (argparser, arg);
2090	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2091	  return true;
2092
2093	case token_type_dot:
2094	case token_type_number:
2095	case token_type_plus:
2096	case token_type_other:
2097	  next_context_iter = null_context_list_iterator;
2098	  state = 0;
2099	  continue;
2100
2101	default:
2102	  abort ();
2103	}
2104    }
2105}
2106
2107
2108void
2109extract_csharp (FILE *f,
2110		const char *real_filename, const char *logical_filename,
2111		flag_context_list_table_ty *flag_table,
2112		msgdomain_list_ty *mdlp)
2113{
2114  message_list_ty *mlp = mdlp->item[0]->messages;
2115
2116  fp = f;
2117  real_file_name = real_filename;
2118  logical_file_name = xstrdup (logical_filename);
2119  line_number = 1;
2120
2121  logical_line_number = 1;
2122  last_comment_line = -1;
2123  last_non_comment_line = -1;
2124
2125  flag_context_list_table = flag_table;
2126
2127  init_keywords ();
2128
2129  /* Eat tokens until eof is seen.  When extract_parenthesized returns
2130     due to an unbalanced closing parenthesis, just restart it.  */
2131  while (!extract_parenthesized (mlp, token_type_eof,
2132				 null_context, null_context_list_iterator,
2133				 arglist_parser_alloc (mlp, NULL)))
2134    ;
2135
2136  fp = NULL;
2137  real_file_name = NULL;
2138  logical_file_name = NULL;
2139  line_number = 0;
2140}
2141