1/* xgettext C# backend.
2   Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program; if not, write to the Free Software Foundation,
17   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28
29#include "message.h"
30#include "xgettext.h"
31#include "x-csharp.h"
32#include "c-ctype.h"
33#include "error.h"
34#include "error-progname.h"
35#include "xalloc.h"
36#include "xerror.h"
37#include "xvasprintf.h"
38#include "exit.h"
39#include "hash.h"
40#include "po-charset.h"
41#include "utf8-ucs4.h"
42#include "ucs4-utf8.h"
43#include "gettext.h"
44
45#define _(s) gettext(s)
46
/* Number of elements of the array A.  A must be a real array object, not a
   pointer; the argument is parenthesized so that any array-valued expression
   can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48
49
50/* The C# syntax is defined in ECMA-334, second edition.  */
51
52
53/* ====================== Keyword set customization.  ====================== */
54
/* If true extract all strings.  */
static bool extract_all = false;

/* Keyword table.  It is initialized lazily and filled by x_csharp_keyword()
   through insert_keyword_callshape().  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be installed.
   Cleared by x_csharp_keyword (NULL) and by init_keywords() once the
   defaults have been added.  */
static bool default_keywords = true;
60
61
62void
63x_csharp_extract_all ()
64{
65  extract_all = true;
66}
67
68
69/* Processes a --keyword option.
70   Non-ASCII function names can be used if given in UTF-8 encoding.  */
71void
72x_csharp_keyword (const char *name)
73{
74  if (name == NULL)
75    default_keywords = false;
76  else
77    {
78      const char *end;
79      struct callshape shape;
80      const char *colon;
81
82      if (keywords.table == NULL)
83	hash_init (&keywords, 100);
84
85      split_keywordspec (name, &end, &shape);
86
87      /* The characters between name and end should form a valid C#
88	 identifier sequence with dots.
89	 A colon means an invalid parse in split_keywordspec().  */
90      colon = strchr (name, ':');
91      if (colon == NULL || colon >= end)
92	insert_keyword_callshape (&keywords, name, end - name, &shape);
93    }
94}
95
96/* Finish initializing the keywords hash table.
97   Called after argument processing, before each file is processed.  */
98static void
99init_keywords ()
100{
101  if (default_keywords)
102    {
103      /* When adding new keywords here, also update the documentation in
104	 xgettext.texi!  */
105      x_csharp_keyword ("GetString");	/* Resource{Manager,Set}.GetString */
106      x_csharp_keyword ("GetPluralString:1,2");	/* GettextResource{Manager,Set}.GetPluralString */
107      default_keywords = false;
108    }
109}
110
/* Record the format string flags of the well-known C# functions, in the
   order listed below, through xgettext_record_flag().  */
void
init_flag_table_csharp (void)
{
  static const char *const flag_specs[] =
    {
      "GetString:1:pass-csharp-format",
      "GetPluralString:1:pass-csharp-format",
      "GetPluralString:2:pass-csharp-format",
      "String.Format:1:csharp-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
119
120
121/* ======================== Reading of characters.  ======================== */
122
123/* Real filename, used in error messages about the input file.  */
124static const char *real_file_name;
125
126/* Logical filename and line number, used to label the extracted messages.  */
127static char *logical_file_name;
128static int line_number;
129
130/* The input file stream.  */
131static FILE *fp;
132
133
134/* Phase 1: line_number handling.  */
135
/* Capacity of the phase 1 pushback stack.
   Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of bytes handed back by phase1_ungetc(); phase1_getc() pops from its
   top before reading from the file.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
140
141/* Read the next single byte from the input file.  */
142static int
143phase1_getc ()
144{
145  int c;
146
147  if (phase1_pushback_length)
148    {
149      c = phase1_pushback[--phase1_pushback_length];
150      if (c == '\n')
151	++line_number;
152      return c;
153    }
154
155  c = getc (fp);
156  if (c == EOF)
157    {
158      if (ferror (fp))
159	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
160	       real_file_name);
161      return EOF;
162    }
163
164  if (c == '\n')
165    ++line_number;
166  return c;
167}
168
169/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
170static void
171phase1_ungetc (int c)
172{
173  if (c != EOF)
174    {
175      if (c == '\n')
176	--line_number;
177      if (phase1_pushback_length == SIZEOF (phase1_pushback))
178	abort ();
179      phase1_pushback[phase1_pushback_length++] = c;
180    }
181}
182
183
184/* Phase 2: Conversion to Unicode.
185   This is done early because ECMA-334 section 9.1. says that the source is
186   "an ordered sequence of Unicode characters", and because the recognition
187   of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
188   prior conversion to Unicode.  */
189
190/* End-of-file indicator for functions returning an UCS-4 character.  */
191#define UEOF -1
192
193/* Newline Unicode character.  */
194#define UNL 0x000a
195
/* One-element pushback stack of phase 2: holds the UCS-4 character handed
   back by phase2_ungetc() until phase2_getc() returns it again.  */
static int phase2_pushback[1];
/* Number of characters currently stored in phase2_pushback.  */
static int phase2_pushback_length;
198
199/* Read the next Unicode UCS-4 character from the input file.  */
200static int
201phase2_getc ()
202{
203  if (phase2_pushback_length)
204    return phase2_pushback[--phase2_pushback_length];
205
206  if (xgettext_current_source_encoding == po_charset_ascii)
207    {
208      int c = phase1_getc ();
209      if (c == EOF)
210	return UEOF;
211      if (!c_isascii (c))
212	{
213	  char buffer[21];
214	  sprintf (buffer, ":%ld", (long) line_number);
215	  multiline_error (xstrdup (""),
216			   xasprintf (_("\
217Non-ASCII string at %s%s.\n\
218Please specify the source encoding through --from-code.\n"),
219			   real_file_name, buffer));
220	  exit (EXIT_FAILURE);
221	}
222      return c;
223    }
224  else if (xgettext_current_source_encoding != po_charset_utf8)
225    {
226#if HAVE_ICONV
227      /* Use iconv on an increasing number of bytes.  Read only as many bytes
228	 through phase1_getc as needed.  This is needed to give reasonable
229	 interactive behaviour when fp is connected to an interactive tty.  */
230      unsigned char buf[MAX_PHASE1_PUSHBACK];
231      size_t bufcount;
232      int c = phase1_getc ();
233      if (c == EOF)
234	return UEOF;
235      buf[0] = (unsigned char) c;
236      bufcount = 1;
237
238      for (;;)
239	{
240	  unsigned char scratchbuf[6];
241	  const char *inptr = (const char *) &buf[0];
242	  size_t insize = bufcount;
243	  char *outptr = (char *) &scratchbuf[0];
244	  size_t outsize = sizeof (scratchbuf);
245
246	  size_t res = iconv (xgettext_current_source_iconv,
247			      (ICONV_CONST char **) &inptr, &insize,
248			      &outptr, &outsize);
249	  /* We expect that a character has been produced if and only if
250	     some input bytes have been consumed.  */
251	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
252	    abort ();
253	  if (outsize == sizeof (scratchbuf))
254	    {
255	      /* No character has been produced.  Must be an error.  */
256	      if (res != (size_t)(-1))
257		abort ();
258
259	      if (errno == EILSEQ)
260		{
261		  /* An invalid multibyte sequence was encountered.  */
262		  multiline_error (xstrdup (""),
263				   xasprintf (_("\
264%s:%d: Invalid multibyte sequence.\n\
265Please specify the correct source encoding through --from-code.\n"),
266				   real_file_name, line_number));
267		  exit (EXIT_FAILURE);
268		}
269	      else if (errno == EINVAL)
270		{
271		  /* An incomplete multibyte character.  */
272		  int c;
273
274		  if (bufcount == MAX_PHASE1_PUSHBACK)
275		    {
276		      /* An overlong incomplete multibyte sequence was
277			 encountered.  */
278		      multiline_error (xstrdup (""),
279				       xasprintf (_("\
280%s:%d: Long incomplete multibyte sequence.\n\
281Please specify the correct source encoding through --from-code.\n"),
282				       real_file_name, line_number));
283		      exit (EXIT_FAILURE);
284		    }
285
286		  /* Read one more byte and retry iconv.  */
287		  c = phase1_getc ();
288		  if (c == EOF)
289		    {
290		      multiline_error (xstrdup (""),
291				       xasprintf (_("\
292%s:%d: Incomplete multibyte sequence at end of file.\n\
293Please specify the correct source encoding through --from-code.\n"),
294				       real_file_name, line_number));
295		      exit (EXIT_FAILURE);
296		    }
297		  if (c == '\n')
298		    {
299		      multiline_error (xstrdup (""),
300				       xasprintf (_("\
301%s:%d: Incomplete multibyte sequence at end of line.\n\
302Please specify the correct source encoding through --from-code.\n"),
303				       real_file_name, line_number - 1));
304		      exit (EXIT_FAILURE);
305		    }
306		  buf[bufcount++] = (unsigned char) c;
307		}
308	      else
309		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
310		       real_file_name, line_number);
311	    }
312	  else
313	    {
314	      size_t outbytes = sizeof (scratchbuf) - outsize;
315	      size_t bytes = bufcount - insize;
316	      unsigned int uc;
317
318	      /* We expect that one character has been produced.  */
319	      if (bytes == 0)
320		abort ();
321	      if (outbytes == 0)
322		abort ();
323	      /* Push back the unused bytes.  */
324	      while (insize > 0)
325		phase1_ungetc (buf[--insize]);
326	      /* Convert the character from UTF-8 to UCS-4.  */
327	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
328		{
329		  /* scratchbuf contains an out-of-range Unicode character
330		     (> 0x10ffff).  */
331		  multiline_error (xstrdup (""),
332				   xasprintf (_("\
333%s:%d: Invalid multibyte sequence.\n\
334Please specify the source encoding through --from-code.\n"),
335				   real_file_name, line_number));
336		  exit (EXIT_FAILURE);
337		}
338	      return uc;
339	    }
340	}
341#else
342      /* If we don't have iconv(), the only supported values for
343	 xgettext_global_source_encoding and thus also for
344	 xgettext_current_source_encoding are ASCII and UTF-8.  */
345      abort ();
346#endif
347    }
348  else
349    {
350      /* Read an UTF-8 encoded character.  */
351      unsigned char buf[6];
352      unsigned int count;
353      int c;
354      unsigned int uc;
355
356      c = phase1_getc ();
357      if (c == EOF)
358	return UEOF;
359      buf[0] = c;
360      count = 1;
361
362      if (buf[0] >= 0xc0)
363	{
364	  c = phase1_getc ();
365	  if (c == EOF)
366	    return UEOF;
367	  buf[1] = c;
368	  count = 2;
369	}
370
371      if (buf[0] >= 0xe0
372	  && ((buf[1] ^ 0x80) < 0x40))
373	{
374	  c = phase1_getc ();
375	  if (c == EOF)
376	    return UEOF;
377	  buf[2] = c;
378	  count = 3;
379	}
380
381      if (buf[0] >= 0xf0
382	  && ((buf[1] ^ 0x80) < 0x40)
383	  && ((buf[2] ^ 0x80) < 0x40))
384	{
385	  c = phase1_getc ();
386	  if (c == EOF)
387	    return UEOF;
388	  buf[3] = c;
389	  count = 4;
390	}
391
392      if (buf[0] >= 0xf8
393	  && ((buf[1] ^ 0x80) < 0x40)
394	  && ((buf[2] ^ 0x80) < 0x40)
395	  && ((buf[3] ^ 0x80) < 0x40))
396	{
397	  c = phase1_getc ();
398	  if (c == EOF)
399	    return UEOF;
400	  buf[4] = c;
401	  count = 5;
402	}
403
404      if (buf[0] >= 0xfc
405	  && ((buf[1] ^ 0x80) < 0x40)
406	  && ((buf[2] ^ 0x80) < 0x40)
407	  && ((buf[3] ^ 0x80) < 0x40)
408	  && ((buf[4] ^ 0x80) < 0x40))
409	{
410	  c = phase1_getc ();
411	  if (c == EOF)
412	    return UEOF;
413	  buf[5] = c;
414	  count = 6;
415	}
416
417      u8_mbtouc (&uc, buf, count);
418      return uc;
419    }
420}
421
422/* Supports only one pushback character.  */
423static void
424phase2_ungetc (int c)
425{
426  if (c != UEOF)
427    {
428      if (phase2_pushback_length == SIZEOF (phase2_pushback))
429	abort ();
430      phase2_pushback[phase2_pushback_length++] = c;
431    }
432}
433
434
435/* Phase 3: Convert all line terminators to LF.
436   See ECMA-334 section 9.3.1.  */
437
/* Line number defined in terms of phase3.  It is incremented by
   phase3_getc() for every UNL returned and decremented by phase3_ungetc()
   for every UNL pushed back.  */
static int logical_line_number;

/* Pushback stack of phase 3: characters handed back by phase3_ungetc(),
   returned again by phase3_getc() in last-in-first-out order.  */
static int phase3_pushback[9];
/* Number of characters currently stored in phase3_pushback.  */
static int phase3_pushback_length;
443
444/* Read the next Unicode UCS-4 character from the input file, mapping
445   all line terminators to U+000A, and dropping U+001A at the end of file.  */
446static int
447phase3_getc ()
448{
449  int c;
450
451  if (phase3_pushback_length)
452    {
453      c = phase3_pushback[--phase3_pushback_length];
454      if (c == UNL)
455	++logical_line_number;
456      return c;
457    }
458
459  c = phase2_getc ();
460
461  if (c == 0x000d)
462    {
463      int c1 = phase2_getc ();
464
465      if (c1 != UEOF && c1 != 0x000a)
466	phase2_ungetc (c1);
467
468      /* Seen line terminator CR or CR/LF.  */
469      ++logical_line_number;
470      return UNL;
471    }
472
473  if (c == 0x0085 || c == 0x2028 || c == 0x2029)
474    {
475      /* Seen Unicode word processor newline.  */
476      ++logical_line_number;
477      return UNL;
478    }
479
480  if (c == 0x001a)
481    {
482      int c1 = phase2_getc ();
483
484      if (c1 == UEOF)
485	/* Seen U+001A right before the end of file.  */
486	return UEOF;
487
488      phase2_ungetc (c1);
489    }
490
491  if (c == UNL)
492    ++logical_line_number;
493  return c;
494}
495
496/* Supports 9 characters of pushback.  */
497static void
498phase3_ungetc (int c)
499{
500  if (c != UEOF)
501    {
502      if (c == UNL)
503	--logical_line_number;
504      if (phase3_pushback_length == SIZEOF (phase3_pushback))
505	abort ();
506      phase3_pushback[phase3_pushback_length++] = c;
507    }
508}
509
510
511/* ========================= Accumulating strings.  ======================== */
512
/* A string buffer type that allows appending Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.
     NULL as long as nothing has been allocated.  */
  char *utf8_buffer;
  /* Number of bytes of utf8_buffer that are in use.  */
  size_t utf8_buflen;
  /* Number of bytes allocated for utf8_buffer.  */
  size_t utf8_allocated;
};
523
524/* Initialize a 'struct string_buffer' to empty.  */
525static inline void
526init_string_buffer (struct string_buffer *bp)
527{
528  bp->utf8_buffer = NULL;
529  bp->utf8_buflen = 0;
530  bp->utf8_allocated = 0;
531}
532
533/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
534static inline void
535string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
536{
537  if (bp->utf8_buflen + count > bp->utf8_allocated)
538    {
539      size_t new_allocated = 2 * bp->utf8_allocated + 10;
540      if (new_allocated < bp->utf8_buflen + count)
541	new_allocated = bp->utf8_buflen + count;
542      bp->utf8_allocated = new_allocated;
543      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
544    }
545}
546
547/* Auxiliary function: Append a Unicode character to bp->utf8.
548   uc must be < 0x110000.  */
549static inline void
550string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
551{
552  unsigned char utf8buf[6];
553  int count = u8_uctomb (utf8buf, uc, 6);
554
555  if (count < 0)
556    /* The caller should have ensured that uc is not out-of-range.  */
557    abort ();
558
559  string_buffer_append_unicode_grow (bp, count);
560  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
561  bp->utf8_buflen += count;
562}
563
564/* Return the string buffer's contents.  */
565static char *
566string_buffer_result (struct string_buffer *bp)
567{
568  /* NUL-terminate it.  */
569  string_buffer_append_unicode_grow (bp, 1);
570  bp->utf8_buffer[bp->utf8_buflen] = '\0';
571  /* Return it.  */
572  return bp->utf8_buffer;
573}
574
575/* Free the memory pointed to by a 'struct string_buffer'.  */
576static inline void
577free_string_buffer (struct string_buffer *bp)
578{
579  free (bp->utf8_buffer);
580}
581
582
583/* ======================== Accumulating comments.  ======================== */
584
585
586/* Accumulating a single comment line.  */
587
588static struct string_buffer comment_buffer;
589
590static inline void
591comment_start ()
592{
593  comment_buffer.utf8_buflen = 0;
594}
595
596static inline bool
597comment_at_start ()
598{
599  return (comment_buffer.utf8_buflen == 0);
600}
601
602static inline void
603comment_add (int c)
604{
605  string_buffer_append_unicode (&comment_buffer, c);
606}
607
608static inline void
609comment_line_end (size_t chars_to_remove)
610{
611  char *buffer = string_buffer_result (&comment_buffer);
612  size_t buflen = strlen (buffer);
613
614  buflen -= chars_to_remove;
615  while (buflen >= 1
616	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
617    --buflen;
618  buffer[buflen] = '\0';
619  savable_comment_add (buffer);
620}
621
622
623/* These are for tracking whether comments count as immediately before
624   keyword.  */
625static int last_comment_line;
626static int last_non_comment_line;
627
628
629/* Phase 4: Replace each comment that is not inside a character constant or
630   string literal with a space or newline character.
631   See ECMA-334 section 9.3.2.  */
632
633static int
634phase4_getc ()
635{
636  int c0;
637  int c;
638  bool last_was_star;
639
640  c0 = phase3_getc ();
641  if (c0 != '/')
642    return c0;
643  c = phase3_getc ();
644  switch (c)
645    {
646    default:
647      phase3_ungetc (c);
648      return c0;
649
650    case '*':
651      /* C style comment.  */
652      comment_start ();
653      last_was_star = false;
654      for (;;)
655	{
656	  c = phase3_getc ();
657	  if (c == UEOF)
658	    break;
659	  /* We skip all leading white space, but not EOLs.  */
660	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
661	    comment_add (c);
662	  switch (c)
663	    {
664	    case UNL:
665	      comment_line_end (1);
666	      comment_start ();
667	      last_was_star = false;
668	      continue;
669
670	    case '*':
671	      last_was_star = true;
672	      continue;
673
674	    case '/':
675	      if (last_was_star)
676		{
677		  comment_line_end (2);
678		  break;
679		}
680	      /* FALLTHROUGH */
681
682	    default:
683	      last_was_star = false;
684	      continue;
685	    }
686	  break;
687	}
688      last_comment_line = logical_line_number;
689      return ' ';
690
691    case '/':
692      /* C++ style comment.  */
693      last_comment_line = logical_line_number;
694      comment_start ();
695      for (;;)
696	{
697	  c = phase3_getc ();
698	  if (c == UNL || c == UEOF)
699	    break;
700	  /* We skip all leading white space, but not EOLs.  */
701	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
702	    comment_add (c);
703	}
704      phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
705      comment_line_end (0);
706      phase3_getc (); /* read the newline again */
707      return UNL;
708    }
709}
710
/* Hand the character C back to phase 4.  Supports only one pushback
   character.  The character is simply passed down to phase3_ungetc().  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
717
718
719/* ======================= Character classification.  ====================== */
720
721
/* Return true if a given character is white space.
   See ECMA-334 section 9.3.3.
   The set is the Unicode character class Zs, as of Unicode 4.0
   (grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt).  */
static bool
is_whitespace (int c)
{
  /* The only contiguous run: U+2000..U+200B.  */
  if (c >= 0x2000 && c <= 0x200b)
    return true;

  /* The isolated members of the class.  */
  switch (c)
    {
    case 0x0020:
    case 0x00a0:
    case 0x1680:
    case 0x180e:
    case 0x202f:
    case 0x205f:
    case 0x3000:
      return true;
    default:
      return false;
    }
}
745
746
747/* C# allows identifiers containing many Unicode characters.  We recognize
748   them; to use an identifier with Unicode characters in a --keyword option,
749   it must be specified in UTF-8.  */
750
/* Look up the bit for the Unicode character UC in a three-level bitmap.
   TABLE is read as an array of int:
     - element 0 is the number of level 1 entries;
     - the level 1 entries follow, indexed by uc >> 16;
     - a non-negative level 1 entry is the index, within TABLE, of a level 2
       block, indexed by bits 9..15 of uc;
     - a non-negative level 2 entry is the index, within TABLE, of a level 3
       block of 32-bit words, indexed by bits 5..8 of uc; bits 0..4 of uc
       select the bit within the word.
   Returns that bit (0 or 1), or 0 if UC is outside the table or a negative
   entry is met on the way.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *tab = (const int *) table;
  unsigned int plane = uc >> 16;
  int level2_base;
  int level3_base;
  unsigned int bits;

  if (plane >= tab[0])
    return 0;

  level2_base = tab[1 + plane];
  if (level2_base < 0)
    return 0;

  level3_base = tab[level2_base + ((uc >> 9) & 0x7f)];
  if (level3_base < 0)
    return 0;

  bits = tab[level3_base + ((uc >> 5) & 0xf)];
  return (bits >> (uc & 0x1f)) & 1;
}
773
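/* Characters that may start a C# identifier: Unicode character classes
   Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0, plus the underscore.
   Stored as a three-level bitmap in the layout read by bitmap_lookup().  */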
776static const
777struct
778  {
779    int header[1];
780    int level1[3];
781    int level2[3 << 7];
782    /*unsigned*/ int level3[34 << 4];
783  }
784table_identifier_start =
785{
786  { 3 },
787  {     4,   132,   260 },
788  {
789      388,   404,   420,   436,   452,   468,   484,   500,
790      516,   532,   548,   564,   580,    -1,   596,   612,
791      628,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
792      644,    -1,   660,   660,   660,   660,   660,   660,
793      660,   660,   660,   660,   660,   660,   676,   660,
794      660,   660,   660,   660,   660,   660,   660,   660,
795      660,   660,   660,   660,   660,   660,   660,   660,
796      660,   660,   660,   660,   660,   660,   660,   660,
797      660,   660,   660,   660,   660,   660,   660,   660,
798      660,   660,   660,   660,   660,   660,   660,   692,
799      660,   660,   708,    -1,    -1,    -1,   660,   660,
800      660,   660,   660,   660,   660,   660,   660,   660,
801      660,   660,   660,   660,   660,   660,   660,   660,
802      660,   660,   660,   724,    -1,    -1,    -1,    -1,
803       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
804       -1,    -1,    -1,    -1,   740,   756,   772,   788,
805      804,   820,   836,    -1,   852,    -1,    -1,    -1,
806       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
807       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
808       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
809       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
810       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
811       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
812       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
813       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
814       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
815       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
816       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
817       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
818       -1,    -1,   868,   884,    -1,    -1,    -1,    -1,
819       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
820       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
821      660,   660,   660,   660,   660,   660,   660,   660,
822      660,   660,   660,   660,   660,   660,   660,   660,
823      660,   660,   660,   660,   660,   660,   660,   660,
824      660,   660,   660,   660,   660,   660,   660,   660,
825      660,   660,   660,   660,   660,   660,   660,   660,
826      660,   660,   660,   660,   660,   660,   660,   660,
827      660,   660,   660,   660,   660,   660,   660,   660,
828      660,   660,   660,   660,   660,   660,   660,   660,
829      660,   660,   660,   660,   660,   660,   660,   660,
830      660,   660,   660,   660,   660,   660,   660,   660,
831      660,   660,   660,   900,    -1,    -1,    -1,    -1,
832       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
833       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
834       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
835       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
836       -1,    -1,    -1,    -1,   660,   916,    -1,    -1
837  },
838  {
839    0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
840    0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
841    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
842    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
843    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
844    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
845    0x00000000, 0x00000000, 0x00000000, 0x04000000,
846    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
847    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
848    0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
849    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
850    0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
851    0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
852    0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
853    0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
854    0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
855    0x00000000, 0x00000000, 0x00000000, 0x00000000,
856    0x00000000, 0x00000000, 0x00000000, 0x00000000,
857    0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
858    0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
859    0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
860    0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
861    0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
862    0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
863    0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
864    0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
865    0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
866    0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
867    0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
868    0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
869    0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
870    0x00000F00, 0x00000000, 0x00000000, 0x00000000,
871    0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
872    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
873    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
874    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
875    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
876    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
877    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
878    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
879    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
880    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
881    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
882    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
883    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
884    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
885    0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
886    0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
887    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
888    0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
889    0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
890    0x00000000, 0x00000000, 0x00000000, 0x00000000,
891    0x00000000, 0x00000000, 0x00000000, 0x00000000,
892    0x00000000, 0x00000000, 0x00000000, 0x00000000,
893    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
894    0x00000000, 0x00000000, 0x00000000, 0x00000000,
895    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
896    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
897    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
898    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
899    0x00000000, 0x00000000, 0x00000000, 0x80020000,
900    0x00000000, 0x00000000, 0x00000000, 0x00000000,
901    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
902    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
903    0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
904    0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
905    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
906    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
907    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
908    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
909    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
910    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
911    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
912    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
913    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
914    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
915    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
916    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
917    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
918    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
919    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
920    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
921    0x00000000, 0x00000000, 0x00000000, 0x00000000,
922    0x00000000, 0x00000000, 0x00000000, 0x00000000,
923    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
924    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
925    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
926    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
927    0x00000000, 0x00000000, 0x00000000, 0x00000000,
928    0x00000000, 0x00000000, 0x00000000, 0x00000000,
929    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
930    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
931    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
932    0x00000000, 0x00000000, 0x00000000, 0x00000000,
933    0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
934    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
935    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
936    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
937    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
938    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
939    0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
940    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
941    0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
942    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
943    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
944    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
945    0x00000000, 0x00000000, 0x00000000, 0x00000000,
946    0x00000000, 0x00000000, 0x00000000, 0x00000000,
947    0x00000000, 0x00000000, 0x00000000, 0x00000000,
948    0x00000000, 0x00000000, 0x00000000, 0x00000000,
949    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
950    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
951    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
952    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
953    0x00000000, 0x00000000, 0x00000000, 0x00000000,
954    0x00000000, 0x00000000, 0x00000000, 0x00000000,
955    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
956    0x00000000, 0x00000000, 0x00000000, 0x00000000,
957    0x00000000, 0x00000000, 0x00000000, 0x00000000,
958    0x00000000, 0x00000000, 0x00000000, 0x00000000,
959    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
960    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
961    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
962    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
963    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
964    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
965    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
966    0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
967    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
968    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
969    0x00000000, 0x00000000, 0x00000000, 0x00000000,
970    0x00000000, 0x00000000, 0x00000000, 0x00000000,
971    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
972    0x00000000, 0x00000000, 0x00000000, 0x00000000,
973    0x00000000, 0x00000000, 0x00000000, 0x00000000,
974    0x00000000, 0x00000000, 0x00000000, 0x00000000
975  }
976};
977
/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
   as of Unicode 4.0.

   Layout, as far as it can be read off the initializer:
     - header[0] is 15, which equals the number of level1 entries.
     - The non-negative level1 entries (16, 144, 272, 400) are 128 = 1 << 7
       apart and start at 16 = 1 + 15, so they look like offsets, counted
       in ints from the start of the struct, of the four 128-entry blocks
       of level2.
     - The non-negative level2 entries (528 ... 1088, multiples of 16 apart)
       start at 528 = 1 + 15 + (4 << 7), so they look like offsets of the
       36 blocks of 16 words each in level3.
     - level3 holds 32-bit masks; presumably one bit per character.
     - -1 presumably marks a range that contains no member character.
   The table is only handed to bitmap_lookup (see is_identifier_part), which
   is defined outside this part of the file; confirm the details there.

   NOTE(review): many level3 initializers (e.g. 0xFFFFFFFF) do not fit in a
   32-bit signed int, which is probably why the element type carries the
   "unsigned" remark; check how bitmap_lookup reads the words before
   changing the type.  */
static const
struct
  {
    int header[1];
    int level1[15];
    int level2[4 << 7];
    /*unsigned*/ int level3[36 << 4];
  }
table_identifier_part =
{
  /* header */
  { 15 },
  /* level1 */
  {
       16,   144,   272,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,   400
  },
  /* level2 */
  {
      528,   544,   560,   576,   592,   608,   624,   640,
      656,   672,   688,   704,   720,    -1,   736,   752,
      768,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      784,    -1,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   816,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   832,
      800,   800,   848,    -1,    -1,    -1,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   864,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   880,   896,   912,   928,
      944,   960,   976,    -1,   992,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
     1008,    -1,  1024,  1040,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,   800,   800,   800,   800,   800,
      800,   800,   800,  1056,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   800,  1072,    -1,    -1,
     1088,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1
  },
  /* level3 */
  {
    0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
    0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
    0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
    0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
    0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
    0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
    0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
    0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
    0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
    0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
    0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
    0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
    0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
    0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
    0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
    0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
    0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
    0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
    0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
    0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
    0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
    0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
  }
};
1208
/* Return true if a given character can occur as first character of an
   identifier.  See ECMA-334 section 9.4.2.
   C is the character value to classify.  The answer is whatever
   bitmap_lookup reports for C in table_identifier_start.  */
static bool
is_identifier_start (int c)
{
  return bitmap_lookup (&table_identifier_start, c);
  /* In ASCII only this would be:
     return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
   */
}
1219
/* Return true if a given character can occur as character of an identifier.
   See ECMA-334 section 9.4.2.
   C is the character value to classify.  The answer is whatever
   bitmap_lookup reports for C in table_identifier_part.  */
static bool
is_identifier_part (int c)
{
  return bitmap_lookup (&table_identifier_part, c);
  /* In ASCII only this would be:
     return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
             || (c >= '0' && c <= '9') || c == '_');
   */
}
1231
/* Predicate that accepts every character.
   It has the same shape as is_identifier_start and is_identifier_part, so
   that it can be passed to do_getc_unicode_escaped when no restriction on
   the escaped character is wanted.  */
static bool
is_any_character (int c)
{
  /* The argument is deliberately ignored.  */
  (void) c;

  return true;
}
1237
1238
1239/* ======================= Preprocessor directives.  ======================= */
1240
1241
1242/* Phase 5: Remove preprocessor lines.  See ECMA-334 section 9.5.
1243   As a side effect, this also removes initial whitespace on every line;
1244   this whitespace doesn't matter.  */
1245
1246static int phase5_pushback[10];
1247static int phase5_pushback_length;
1248
1249static int
1250phase5_getc ()
1251{
1252  int c;
1253
1254  if (phase5_pushback_length)
1255    return phase5_pushback[--phase5_pushback_length];
1256
1257  c = phase4_getc ();
1258  if (c != UNL)
1259    return c;
1260
1261  do
1262    c = phase3_getc ();
1263  while (c != UEOF && is_whitespace (c));
1264
1265  if (c == '#')
1266    {
1267      /* Ignore the entire line containing the preprocessor directive
1268	 (including the // comment if it contains one).  */
1269      do
1270	c = phase3_getc ();
1271      while (c != UEOF && c != UNL);
1272      return c;
1273    }
1274  else
1275    {
1276      phase3_ungetc (c);
1277      return UNL;
1278    }
1279}
1280
#ifdef unused
/* Push the character C back into phase 5, so that the next phase5_getc
   call returns it.  UEOF is silently ignored.  Aborts when the pushback
   buffer is already full.  */
static void
phase5_ungetc (int c)
{
  if (c == UEOF)
    return;

  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();
  phase5_pushback[phase5_pushback_length] = c;
  phase5_pushback_length++;
}
#endif
1293
1294
1295/* ========================== Reading of tokens.  ========================== */
1296
/* The kinds of token the tokenizer distinguishes.  The enumerators keep
   their implicit values, 0 to 11, in the order listed.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc", @"abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
} token_type_ty;
1313
1314typedef struct token_ty token_ty;
1315struct token_ty
1316{
1317  token_type_ty type;
1318  char *string;		/* for token_type_string_literal, token_type_symbol */
1319  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
1320  int line_number;
1321  int logical_line_number;
1322};
1323
1324
1325/* Free the memory pointed to by a 'struct token_ty'.  */
1326static inline void
1327free_token (token_ty *tp)
1328{
1329  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
1330    free (tp->string);
1331  if (tp->type == token_type_string_literal)
1332    drop_reference (tp->comment);
1333}
1334
1335
1336/* Read a Unicode escape sequence outside string/character literals.
1337   Reject Unicode escapes that don't fulfill the given predicate.
1338   See ECMA-334 section 9.4.2.  */
1339static int
1340do_getc_unicode_escaped (bool (*predicate) (int))
1341{
1342  int c;
1343
1344  /* Use phase 3, because phase 4 elides comments.  */
1345  c = phase3_getc ();
1346  if (c == UEOF)
1347    return '\\';
1348  if (c == 'u' || c == 'U')
1349    {
1350      unsigned char buf[8];
1351      int expect;
1352      unsigned int n;
1353      int i;
1354
1355      expect = (c == 'U' ? 8 : 4);
1356      n = 0;
1357      for (i = 0; i < expect; i++)
1358	{
1359	  int c1 = phase3_getc ();
1360
1361	  if (c1 >= '0' && c1 <= '9')
1362	    n = (n << 4) + (c1 - '0');
1363	  else if (c1 >= 'A' && c1 <= 'F')
1364	    n = (n << 4) + (c1 - 'A' + 10);
1365	  else if (c1 >= 'a' && c1 <= 'f')
1366	    n = (n << 4) + (c1 - 'a' + 10);
1367	  else
1368	    {
1369	      phase3_ungetc (c1);
1370	      while (--i >= 0)
1371		phase3_ungetc (buf[i]);
1372	      phase3_ungetc (c);
1373	      return '\\';
1374	    }
1375
1376	  buf[i] = c1;
1377	}
1378
1379      if (n >= 0x110000)
1380	{
1381	  error_with_progname = false;
1382	  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1383		 logical_file_name, line_number);
1384	  error_with_progname = true;
1385	}
1386      else if (predicate (n))
1387	return n;
1388
1389      while (--i >= 0)
1390	phase3_ungetc (buf[i]);
1391    }
1392  phase3_ungetc (c);
1393  return '\\';
1394}
1395
1396
1397/* Read an escape sequence inside a string literal or character literal.
1398   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1399static int
1400do_getc_escaped ()
1401{
1402  int c;
1403  int n;
1404  int i;
1405
1406  /* Use phase 3, because phase 4 elides comments.  */
1407  c = phase3_getc ();
1408  if (c == UEOF)
1409    return '\\';
1410  switch (c)
1411    {
1412    case 'a':
1413      return 0x0007;
1414    case 'b':
1415      return 0x0008;
1416    case 't':
1417      return 0x0009;
1418    case 'n':
1419      return 0x000a;
1420    case 'v':
1421      return 0x000b;
1422    case 'f':
1423      return 0x000c;
1424    case 'r':
1425      return 0x000d;
1426    case '"':
1427      return '"';
1428    case '\'':
1429      return '\'';
1430    case '\\':
1431      return '\\';
1432    case '0':
1433      return 0x0000;
1434    case 'x':
1435      c = phase3_getc ();
1436      switch (c)
1437	{
1438	default:
1439	  phase3_ungetc (c);
1440	  phase3_ungetc ('x');
1441	  return '\\';
1442
1443	case '0': case '1': case '2': case '3': case '4':
1444	case '5': case '6': case '7': case '8': case '9':
1445	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1446	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1447	  break;
1448	}
1449      n = 0;
1450      for (i = 0;; i++)
1451	{
1452	  switch (c)
1453	    {
1454	    default:
1455	      phase3_ungetc (c);
1456	      return n;
1457	    case '0': case '1': case '2': case '3': case '4':
1458	    case '5': case '6': case '7': case '8': case '9':
1459	      n = n * 16 + c - '0';
1460	      break;
1461	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1462	      n = n * 16 + 10 + c - 'A';
1463	      break;
1464	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1465	      n = n * 16 + 10 + c - 'a';
1466	      break;
1467	    }
1468	  if (i == 3)
1469	    break;
1470	  c = phase3_getc ();
1471	}
1472      return n;
1473    case 'u': case 'U':
1474      phase3_ungetc (c);
1475      return do_getc_unicode_escaped (is_any_character);
1476    default:
1477      /* Invalid escape sequence.  */
1478      phase3_ungetc (c);
1479      return '\\';
1480    }
1481}
1482
1483/* Read a regular string literal or character literal.
1484   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1485static void
1486accumulate_escaped (struct string_buffer *literal, int delimiter)
1487{
1488  int c;
1489
1490  for (;;)
1491    {
1492      /* Use phase 3, because phase 4 elides comments.  */
1493      c = phase3_getc ();
1494      if (c == UEOF || c == delimiter)
1495	break;
1496      if (c == UNL)
1497	{
1498	  phase3_ungetc (c);
1499	  error_with_progname = false;
1500	  if (delimiter == '\'')
1501	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
1502		   logical_file_name, line_number);
1503	  else
1504	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
1505		   logical_file_name, line_number);
1506	  error_with_progname = true;
1507	  break;
1508	}
1509      if (c == '\\')
1510	c = do_getc_escaped ();
1511      string_buffer_append_unicode (literal, c);
1512    }
1513}
1514
1515
/* Combine characters into tokens.  Discard whitespace.  */

/* Maximum used guaranteed to be < 4.  */
static token_ty phase6_pushback[4];
static int phase6_pushback_length;

/* Read the next token into *TP.
   Tokens handed back through phase6_unget are delivered first, the most
   recently ungotten one first.  Otherwise characters are read from phase 5
   (and, inside numbers and literals, directly from lower phases) until a
   token is complete.
   On return TP->type is set.  TP->string is a copy made with xstrdup for
   token_type_string_literal and token_type_symbol, and NULL for the other
   token types built here.  TP->comment is set, to the result of
   add_reference (savable_comment), for string literals only.
   TP->line_number and TP->logical_line_number are the values the global
   counters had right before the first character of the token was read.  */
static void
phase6_get (token_ty *tp)
{
  int c;

  /* Deliver a pushed-back token unchanged, if there is one.  */
  if (phase6_pushback_length)
    {
      *tp = phase6_pushback[--phase6_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      /* Taken anew on every iteration, so that skipped whitespace and
	 newlines do not count towards the token's position.  */
      tp->line_number = line_number;
      tp->logical_line_number = logical_line_number;
      c = phase5_getc ();

      if (c == UEOF)
	{
	  tp->type = token_type_eof;
	  return;
	}

      switch (c)
	{
	case UNL:
	  /* Drop the saved comment when a non-comment line was seen after
	     the last comment line.  */
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* FALLTHROUGH */
	case ' ':
	case '\t':
	case '\f':
	  /* Ignore whitespace and comments.  */
	  continue;
	}

      /* A token starts here; remember its line for the comparison made
	 above when the next newline is seen.  */
      last_non_comment_line = tp->logical_line_number;

      switch (c)
	{
	case '(':
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  tp->type = token_type_rparen;
	  return;

	case '{':
	  tp->type = token_type_lbrace;
	  return;

	case '}':
	  tp->type = token_type_rbrace;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '.':
	  /* A dot followed by a digit starts a number; any other dot is a
	     token of its own.  */
	  c = phase4_getc ();
	  if (!(c >= '0' && c <= '9'))
	    {
	      phase4_ungetc (c);
	      tp->type = token_type_dot;
	      return;
	    }
	  /* FALLTHROUGH */

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	  {
	    /* Don't need to verify the complicated syntax of integers and
	       floating-point numbers.  We assume a valid C# input.
	       The simplified syntax that we recognize as number is: any
	       sequence of alphanumeric characters, additionally '+' and '-'
	       immediately after 'e' or 'E' except in hexadecimal numbers.  */
	    bool hexadecimal = false;

	    for (;;)
	      {
		c = phase4_getc ();
		if (c >= '0' && c <= '9')
		  continue;
		if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
		  {
		    if (c == 'X' || c == 'x')
		      hexadecimal = true;
		    if ((c == 'E' || c == 'e') && !hexadecimal)
		      {
			/* Swallow the sign of an exponent.  */
			c = phase4_getc ();
			if (!(c == '+' || c == '-'))
			  phase4_ungetc (c);
		      }
		    continue;
		  }
		if (c == '.')
		  continue;
		break;
	      }
	    /* C is the first character that does not belong to the
	       number.  */
	    phase4_ungetc (c);
	    tp->type = token_type_number;
	    return;
	  }

	case '"':
	  /* Regular string literal.  */
	  {
	    struct string_buffer literal;

	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '"');
	    tp->string = xstrdup (string_buffer_result (&literal));
	    free_string_buffer (&literal);
	    tp->comment = add_reference (savable_comment);
	    tp->type = token_type_string_literal;
	    return;
	  }

	case '\'':
	  /* Character literal.  */
	  {
	    struct string_buffer literal;

	    /* The contents are read only in order to find the end of the
	       literal; they are discarded right away.  */
	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '\'');
	    free_string_buffer (&literal);
	    tp->type = token_type_other;
	    return;
	  }

	case '+':
	  /* Only a lone '+' is reported as token_type_plus.  */
	  c = phase4_getc ();
	  if (c == '+')
	    /* Operator ++ */
	    tp->type = token_type_other;
	  else if (c == '=')
	    /* Operator += */
	    tp->type = token_type_other;
	  else
	    {
	      /* Operator + */
	      phase4_ungetc (c);
	      tp->type = token_type_plus;
	    }
	  return;

	case '@':
	  c = phase4_getc ();
	  if (c == '"')
	    {
	      /* Verbatim string literal.  */
	      struct string_buffer literal;

	      init_string_buffer (&literal);
	      for (;;)
		{
		  /* Use phase 2, because phase 4 elides comments and phase 3
		     mixes up the newline characters.  */
		  c = phase2_getc ();
		  if (c == UEOF)
		    break;
		  if (c == '"')
		    {
		      /* Two double quotes in a row stand for one double
			 quote; a single one ends the literal.  */
		      c = phase2_getc ();
		      if (c != '"')
			{
			  phase2_ungetc (c);
			  break;
			}
		    }
		  /* No special treatment of newline and backslash here.  */
		  string_buffer_append_unicode (&literal, c);
		}
	      tp->string = xstrdup (string_buffer_result (&literal));
	      free_string_buffer (&literal);
	      tp->comment = add_reference (savable_comment);
	      tp->type = token_type_string_literal;
	      return;
	    }
	  /* FALLTHROUGH, so that @identifier is recognized.  */
	  /* At this point C is the character that followed the '@'.  */

	default:
	  /* An identifier may begin with a Unicode escape sequence.  */
	  if (c == '\\')
	    c = do_getc_unicode_escaped (is_identifier_start);
	  if (is_identifier_start (c))
	    {
	      /* The buffer is static and reused by every call; setting
		 utf8_buflen to 0 presumably empties it while keeping its
		 storage.  */
	      static struct string_buffer buffer;
	      buffer.utf8_buflen = 0;
	      for (;;)
		{
		  string_buffer_append_unicode (&buffer, c);
		  c = phase4_getc ();
		  if (c == '\\')
		    c = do_getc_unicode_escaped (is_identifier_part);
		  if (!is_identifier_part (c))
		    break;
		}
	      /* C is the first character after the identifier.  */
	      phase4_ungetc (c);
	      tp->string = xstrdup (string_buffer_result (&buffer));
	      tp->type = token_type_symbol;
	      return;
	    }
	  else
	    {
	      /* Misc. operator.  */
	      tp->type = token_type_other;
	      return;
	    }
	}
    }
}
1736
1737/* Supports 3 tokens of pushback.  */
1738static void
1739phase6_unget (token_ty *tp)
1740{
1741  if (tp->type != token_type_eof)
1742    {
1743      if (phase6_pushback_length == SIZEOF (phase6_pushback))
1744	abort ();
1745      phase6_pushback[phase6_pushback_length++] = *tp;
1746    }
1747}
1748
1749
1750/* Compile-time optimization of string literal concatenation.
1751   Combine "string1" + ... + "stringN" to the concatenated string if
1752     - the token after this expression is not '.' (because then the last
1753       string could be part of a method call expression).  */
1754
1755static token_ty phase7_pushback[2];
1756static int phase7_pushback_length;
1757
1758static void
1759phase7_get (token_ty *tp)
1760{
1761  if (phase7_pushback_length)
1762    {
1763      *tp = phase7_pushback[--phase7_pushback_length];
1764      return;
1765    }
1766
1767  phase6_get (tp);
1768  if (tp->type == token_type_string_literal)
1769    {
1770      char *sum = tp->string;
1771      size_t sum_len = strlen (sum);
1772
1773      for (;;)
1774	{
1775	  token_ty token2;
1776
1777	  phase6_get (&token2);
1778	  if (token2.type == token_type_plus)
1779	    {
1780	      token_ty token3;
1781
1782	      phase6_get (&token3);
1783	      if (token3.type == token_type_string_literal)
1784		{
1785		  token_ty token_after;
1786
1787		  phase6_get (&token_after);
1788		  if (token_after.type != token_type_dot)
1789		    {
1790		      char *addend = token3.string;
1791		      size_t addend_len = strlen (addend);
1792
1793		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1794		      memcpy (sum + sum_len, addend, addend_len + 1);
1795		      sum_len += addend_len;
1796
1797		      phase6_unget (&token_after);
1798		      free_token (&token3);
1799		      free_token (&token2);
1800		      continue;
1801		    }
1802		  phase6_unget (&token_after);
1803		}
1804	      phase6_unget (&token3);
1805	    }
1806	  phase6_unget (&token2);
1807	  break;
1808	}
1809      tp->string = sum;
1810    }
1811}
1812
1813/* Supports 2 tokens of pushback.  */
1814static void
1815phase7_unget (token_ty *tp)
1816{
1817  if (tp->type != token_type_eof)
1818    {
1819      if (phase7_pushback_length == SIZEOF (phase7_pushback))
1820	abort ();
1821      phase7_pushback[phase7_pushback_length++] = *tp;
1822    }
1823}
1824
1825
/* Read the next token into *TP.
   This is phase7_get under another name, so a string literal token
   delivered here already has directly following '+ "string"' pairs
   merged into it.  */
static void
x_csharp_lex (token_ty *tp)
{
  phase7_get (tp);
}
1831
/* Hand the token *TP back, so that the next x_csharp_lex call delivers it
   again.  This is phase7_unget under another name.
   Supports 2 tokens of pushback.  */
static void
x_csharp_unlex (token_ty *tp)
{
  phase7_unget (tp);
}
1838
1839
1840/* ========================= Extracting strings.  ========================== */
1841
1842
/* Context lookup table.
   extract_parenthesized consults it through flag_context_list_table_lookup,
   using (possibly dotted) symbol names as keys.  It is not assigned
   anywhere in this part of the file.
   NOTE(review): presumably set by the extraction entry point; confirm.  */
static flag_context_list_table_ty *flag_context_list_table;
1845
1846
1847/* The file is broken into tokens.  Scan the token stream, looking for
1848   a keyword, followed by a left paren, followed by a string.  When we
1849   see this sequence, we have something to remember.  We assume we are
   looking at a valid C# program, and leave the complaints about
1851   the grammar to the compiler.
1852
1853     Normal handling: Look for
1854       keyword ( ... msgid ... )
1855     Plural handling: Look for
1856       keyword ( ... msgid ... msgid_plural ... )
1857
1858   We use recursion because the arguments before msgid or between msgid
1859   and msgid_plural can contain subexpressions of the same form.  */
1860
1861
1862/* Extract messages until the next balanced closing parenthesis or brace,
1863   depending on TERMINATOR.
1864   Extracted messages are added to MLP.
1865   Return true upon eof, false upon closing parenthesis or brace.  */
1866static bool
1867extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1868		       flag_context_ty outer_context,
1869		       flag_context_list_iterator_ty context_iter,
1870		       struct arglist_parser *argparser)
1871{
1872  /* Current argument number.  */
1873  int arg = 1;
1874  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1875  int state;
1876  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1877  const struct callshapes *next_shapes = NULL;
1878  /* Context iterator that will be used if the next token is a '('.  */
1879  flag_context_list_iterator_ty next_context_iter =
1880    passthrough_context_list_iterator;
1881  /* Current context.  */
1882  flag_context_ty inner_context =
1883    inherited_context (outer_context,
1884		       flag_context_list_iterator_advance (&context_iter));
1885
1886  /* Start state is 0.  */
1887  state = 0;
1888
1889  for (;;)
1890    {
1891      token_ty token;
1892
1893      x_csharp_lex (&token);
1894      switch (token.type)
1895	{
1896	case token_type_symbol:
1897	  {
1898	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1899	       we can recognize static function calls like
1900	       GettextResource.gettext.  The information present for
1901	       symbolI.....symbolN has precedence over the information for
1902	       symbolJ.....symbolN with J > I.  */
1903	    char *sum = token.string;
1904	    size_t sum_len = strlen (sum);
1905	    const char *dottedname;
1906	    flag_context_list_ty *context_list;
1907
1908	    for (;;)
1909	      {
1910		token_ty token2;
1911
1912		x_csharp_lex (&token2);
1913		if (token2.type == token_type_dot)
1914		  {
1915		    token_ty token3;
1916
1917		    x_csharp_lex (&token3);
1918		    if (token3.type == token_type_symbol)
1919		      {
1920			char *addend = token3.string;
1921			size_t addend_len = strlen (addend);
1922
1923			sum =
1924			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1925			sum[sum_len] = '.';
1926			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1927			sum_len += 1 + addend_len;
1928
1929			free_token (&token3);
1930			free_token (&token2);
1931			continue;
1932		      }
1933		    x_csharp_unlex (&token3);
1934		  }
1935		x_csharp_unlex (&token2);
1936		break;
1937	      }
1938
1939	    for (dottedname = sum;;)
1940	      {
1941		void *keyword_value;
1942
1943		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1944				     &keyword_value)
1945		    == 0)
1946		  {
1947		    next_shapes = (const struct callshapes *) keyword_value;
1948		    state = 1;
1949		    break;
1950		  }
1951
1952		dottedname = strchr (dottedname, '.');
1953		if (dottedname == NULL)
1954		  {
1955		    state = 0;
1956		    break;
1957		  }
1958		dottedname++;
1959	      }
1960
1961	    for (dottedname = sum;;)
1962	      {
1963		context_list =
1964		  flag_context_list_table_lookup (
1965		    flag_context_list_table,
1966		    dottedname, strlen (dottedname));
1967		if (context_list != NULL)
1968		  break;
1969
1970		dottedname = strchr (dottedname, '.');
1971		if (dottedname == NULL)
1972		  break;
1973		dottedname++;
1974	      }
1975	    next_context_iter = flag_context_list_iterator (context_list);
1976
1977	    free (sum);
1978	    continue;
1979	  }
1980
1981	case token_type_lparen:
1982	  if (extract_parenthesized (mlp, token_type_rparen,
1983				     inner_context, next_context_iter,
1984				     arglist_parser_alloc (mlp,
1985							   state ? next_shapes : NULL)))
1986	    {
1987	      xgettext_current_source_encoding = po_charset_utf8;
1988	      arglist_parser_done (argparser, arg);
1989	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1990	      return true;
1991	    }
1992	  next_context_iter = null_context_list_iterator;
1993	  state = 0;
1994	  continue;
1995
1996	case token_type_rparen:
1997	  if (terminator == token_type_rparen)
1998	    {
1999	      xgettext_current_source_encoding = po_charset_utf8;
2000	      arglist_parser_done (argparser, arg);
2001	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2002	      return false;
2003	    }
2004	  if (terminator == token_type_rbrace)
2005	    {
2006	      error_with_progname = false;
2007	      error (0, 0,
2008		     _("%s:%d: warning: ')' found where '}' was expected"),
2009		     logical_file_name, token.line_number);
2010	      error_with_progname = true;
2011	    }
2012	  next_context_iter = null_context_list_iterator;
2013	  state = 0;
2014	  continue;
2015
2016	case token_type_lbrace:
2017	  if (extract_parenthesized (mlp, token_type_rbrace,
2018				     null_context, null_context_list_iterator,
2019				     arglist_parser_alloc (mlp, NULL)))
2020	    {
2021	      xgettext_current_source_encoding = po_charset_utf8;
2022	      arglist_parser_done (argparser, arg);
2023	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2024	      return true;
2025	    }
2026	  next_context_iter = null_context_list_iterator;
2027	  state = 0;
2028	  continue;
2029
2030	case token_type_rbrace:
2031	  if (terminator == token_type_rbrace)
2032	    {
2033	      xgettext_current_source_encoding = po_charset_utf8;
2034	      arglist_parser_done (argparser, arg);
2035	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2036	      return false;
2037	    }
2038	  if (terminator == token_type_rparen)
2039	    {
2040	      error_with_progname = false;
2041	      error (0, 0,
2042		     _("%s:%d: warning: '}' found where ')' was expected"),
2043		     logical_file_name, token.line_number);
2044	      error_with_progname = true;
2045	    }
2046	  next_context_iter = null_context_list_iterator;
2047	  state = 0;
2048	  continue;
2049
2050	case token_type_comma:
2051	  arg++;
2052	  inner_context =
2053	    inherited_context (outer_context,
2054			       flag_context_list_iterator_advance (
2055				 &context_iter));
2056	  next_context_iter = passthrough_context_list_iterator;
2057	  state = 0;
2058	  continue;
2059
2060	case token_type_string_literal:
2061	  {
2062	    lex_pos_ty pos;
2063	    pos.file_name = logical_file_name;
2064	    pos.line_number = token.line_number;
2065
2066	    xgettext_current_source_encoding = po_charset_utf8;
2067	    if (extract_all)
2068	      remember_a_message (mlp, NULL, token.string, inner_context,
2069				  &pos, token.comment);
2070	    else
2071	      arglist_parser_remember (argparser, arg, token.string,
2072				       inner_context,
2073				       pos.file_name, pos.line_number,
2074				       token.comment);
2075	    xgettext_current_source_encoding = xgettext_global_source_encoding;
2076	  }
2077	  drop_reference (token.comment);
2078	  next_context_iter = null_context_list_iterator;
2079	  state = 0;
2080	  continue;
2081
2082	case token_type_eof:
2083	  xgettext_current_source_encoding = po_charset_utf8;
2084	  arglist_parser_done (argparser, arg);
2085	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2086	  return true;
2087
2088	case token_type_dot:
2089	case token_type_number:
2090	case token_type_plus:
2091	case token_type_other:
2092	  next_context_iter = null_context_list_iterator;
2093	  state = 0;
2094	  continue;
2095
2096	default:
2097	  abort ();
2098	}
2099    }
2100}
2101
2102
2103void
2104extract_csharp (FILE *f,
2105		const char *real_filename, const char *logical_filename,
2106		flag_context_list_table_ty *flag_table,
2107		msgdomain_list_ty *mdlp)
2108{
2109  message_list_ty *mlp = mdlp->item[0]->messages;
2110
2111  fp = f;
2112  real_file_name = real_filename;
2113  logical_file_name = xstrdup (logical_filename);
2114  line_number = 1;
2115
2116  logical_line_number = 1;
2117  last_comment_line = -1;
2118  last_non_comment_line = -1;
2119
2120  flag_context_list_table = flag_table;
2121
2122  init_keywords ();
2123
2124  /* Eat tokens until eof is seen.  When extract_parenthesized returns
2125     due to an unbalanced closing parenthesis, just restart it.  */
2126  while (!extract_parenthesized (mlp, token_type_eof,
2127				 null_context, null_context_list_iterator,
2128				 arglist_parser_alloc (mlp, NULL)))
2129    ;
2130
2131  fp = NULL;
2132  real_file_name = NULL;
2133  logical_file_name = NULL;
2134  line_number = 0;
2135}
2136