1/* xgettext C# backend. 2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23#include <errno.h> 24#include <stdbool.h> 25#include <stdio.h> 26#include <stdlib.h> 27#include <string.h> 28 29#include "message.h" 30#include "xgettext.h" 31#include "x-csharp.h" 32#include "c-ctype.h" 33#include "error.h" 34#include "error-progname.h" 35#include "xalloc.h" 36#include "xerror.h" 37#include "xvasprintf.h" 38#include "exit.h" 39#include "hash.h" 40#include "po-charset.h" 41#include "utf8-ucs4.h" 42#include "ucs4-utf8.h" 43#include "gettext.h" 44 45#define _(s) gettext(s) 46 47#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 48 49 50/* The C# syntax is defined in ECMA-334, second edition. */ 51 52 53/* ====================== Keyword set customization. ====================== */ 54 55/* If true extract all strings. */ 56static bool extract_all = false; 57 58static hash_table keywords; 59static bool default_keywords = true; 60 61 62void 63x_csharp_extract_all () 64{ 65 extract_all = true; 66} 67 68 69/* Processes a --keyword option. 70 Non-ASCII function names can be used if given in UTF-8 encoding. */ 71void 72x_csharp_keyword (const char *name) 73{ 74 if (name == NULL) 75 default_keywords = false; 76 else 77 { 78 const char *end; 79 struct callshape shape; 80 const char *colon; 81 82 if (keywords.table == NULL) 83 hash_init (&keywords, 100); 84 85 split_keywordspec (name, &end, &shape); 86 87 /* The characters between name and end should form a valid C# 88 identifier sequence with dots. 89 A colon means an invalid parse in split_keywordspec(). */ 90 colon = strchr (name, ':'); 91 if (colon == NULL || colon >= end) 92 insert_keyword_callshape (&keywords, name, end - name, &shape); 93 } 94} 95 96/* Finish initializing the keywords hash table. 97 Called after argument processing, before each file is processed. */ 98static void 99init_keywords () 100{ 101 if (default_keywords) 102 { 103 /* When adding new keywords here, also update the documentation in 104 xgettext.texi! */ 105 x_csharp_keyword ("GetString"); /* Resource{Manager,Set}.GetString */ 106 x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */ 107 default_keywords = false; 108 } 109} 110 111void 112init_flag_table_csharp () 113{ 114 xgettext_record_flag ("GetString:1:pass-csharp-format"); 115 xgettext_record_flag ("GetPluralString:1:pass-csharp-format"); 116 xgettext_record_flag ("GetPluralString:2:pass-csharp-format"); 117 xgettext_record_flag ("String.Format:1:csharp-format"); 118} 119 120 121/* ======================== Reading of characters. ======================== */ 122 123/* Real filename, used in error messages about the input file. */ 124static const char *real_file_name; 125 126/* Logical filename and line number, used to label the extracted messages. */ 127static char *logical_file_name; 128static int line_number; 129 130/* The input file stream. */ 131static FILE *fp; 132 133 134/* Phase 1: line_number handling. */ 135 136/* Maximum used, roughly a safer MB_LEN_MAX. */ 137#define MAX_PHASE1_PUSHBACK 16 138static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; 139static int phase1_pushback_length; 140 141/* Read the next single byte from the input file. */ 142static int 143phase1_getc () 144{ 145 int c; 146 147 if (phase1_pushback_length) 148 { 149 c = phase1_pushback[--phase1_pushback_length]; 150 if (c == '\n') 151 ++line_number; 152 return c; 153 } 154 155 c = getc (fp); 156 if (c == EOF) 157 { 158 if (ferror (fp)) 159 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 160 real_file_name); 161 return EOF; 162 } 163 164 if (c == '\n') 165 ++line_number; 166 return c; 167} 168 169/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ 170static void 171phase1_ungetc (int c) 172{ 173 if (c != EOF) 174 { 175 if (c == '\n') 176 --line_number; 177 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 178 abort (); 179 phase1_pushback[phase1_pushback_length++] = c; 180 } 181} 182 183 184/* Phase 2: Conversion to Unicode. 185 This is done early because ECMA-334 section 9.1. says that the source is 186 "an ordered sequence of Unicode characters", and because the recognition 187 of the line terminators (ECMA-334 section 9.3.1) is hardly possible without 188 prior conversion to Unicode. */ 189 190/* End-of-file indicator for functions returning an UCS-4 character. */ 191#define UEOF -1 192 193/* Newline Unicode character. */ 194#define UNL 0x000a 195 196static int phase2_pushback[1]; 197static int phase2_pushback_length; 198 199/* Read the next Unicode UCS-4 character from the input file. */ 200static int 201phase2_getc () 202{ 203 if (phase2_pushback_length) 204 return phase2_pushback[--phase2_pushback_length]; 205 206 if (xgettext_current_source_encoding == po_charset_ascii) 207 { 208 int c = phase1_getc (); 209 if (c == EOF) 210 return UEOF; 211 if (!c_isascii (c)) 212 { 213 char buffer[21]; 214 sprintf (buffer, ":%ld", (long) line_number); 215 multiline_error (xstrdup (""), 216 xasprintf (_("\ 217Non-ASCII string at %s%s.\n\ 218Please specify the source encoding through --from-code.\n"), 219 real_file_name, buffer)); 220 exit (EXIT_FAILURE); 221 } 222 return c; 223 } 224 else if (xgettext_current_source_encoding != po_charset_utf8) 225 { 226#if HAVE_ICONV 227 /* Use iconv on an increasing number of bytes. Read only as many bytes 228 through phase1_getc as needed. This is needed to give reasonable 229 interactive behaviour when fp is connected to an interactive tty. */ 230 unsigned char buf[MAX_PHASE1_PUSHBACK]; 231 size_t bufcount; 232 int c = phase1_getc (); 233 if (c == EOF) 234 return UEOF; 235 buf[0] = (unsigned char) c; 236 bufcount = 1; 237 238 for (;;) 239 { 240 unsigned char scratchbuf[6]; 241 const char *inptr = (const char *) &buf[0]; 242 size_t insize = bufcount; 243 char *outptr = (char *) &scratchbuf[0]; 244 size_t outsize = sizeof (scratchbuf); 245 246 size_t res = iconv (xgettext_current_source_iconv, 247 (ICONV_CONST char **) &inptr, &insize, 248 &outptr, &outsize); 249 /* We expect that a character has been produced if and only if 250 some input bytes have been consumed. */ 251 if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) 252 abort (); 253 if (outsize == sizeof (scratchbuf)) 254 { 255 /* No character has been produced. Must be an error. */ 256 if (res != (size_t)(-1)) 257 abort (); 258 259 if (errno == EILSEQ) 260 { 261 /* An invalid multibyte sequence was encountered. */ 262 multiline_error (xstrdup (""), 263 xasprintf (_("\ 264%s:%d: Invalid multibyte sequence.\n\ 265Please specify the correct source encoding through --from-code.\n"), 266 real_file_name, line_number)); 267 exit (EXIT_FAILURE); 268 } 269 else if (errno == EINVAL) 270 { 271 /* An incomplete multibyte character. */ 272 int c; 273 274 if (bufcount == MAX_PHASE1_PUSHBACK) 275 { 276 /* An overlong incomplete multibyte sequence was 277 encountered. */ 278 multiline_error (xstrdup (""), 279 xasprintf (_("\ 280%s:%d: Long incomplete multibyte sequence.\n\ 281Please specify the correct source encoding through --from-code.\n"), 282 real_file_name, line_number)); 283 exit (EXIT_FAILURE); 284 } 285 286 /* Read one more byte and retry iconv. */ 287 c = phase1_getc (); 288 if (c == EOF) 289 { 290 multiline_error (xstrdup (""), 291 xasprintf (_("\ 292%s:%d: Incomplete multibyte sequence at end of file.\n\ 293Please specify the correct source encoding through --from-code.\n"), 294 real_file_name, line_number)); 295 exit (EXIT_FAILURE); 296 } 297 if (c == '\n') 298 { 299 multiline_error (xstrdup (""), 300 xasprintf (_("\ 301%s:%d: Incomplete multibyte sequence at end of line.\n\ 302Please specify the correct source encoding through --from-code.\n"), 303 real_file_name, line_number - 1)); 304 exit (EXIT_FAILURE); 305 } 306 buf[bufcount++] = (unsigned char) c; 307 } 308 else 309 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), 310 real_file_name, line_number); 311 } 312 else 313 { 314 size_t outbytes = sizeof (scratchbuf) - outsize; 315 size_t bytes = bufcount - insize; 316 unsigned int uc; 317 318 /* We expect that one character has been produced. */ 319 if (bytes == 0) 320 abort (); 321 if (outbytes == 0) 322 abort (); 323 /* Push back the unused bytes. */ 324 while (insize > 0) 325 phase1_ungetc (buf[--insize]); 326 /* Convert the character from UTF-8 to UCS-4. */ 327 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes) 328 { 329 /* scratchbuf contains an out-of-range Unicode character 330 (> 0x10ffff). */ 331 multiline_error (xstrdup (""), 332 xasprintf (_("\ 333%s:%d: Invalid multibyte sequence.\n\ 334Please specify the source encoding through --from-code.\n"), 335 real_file_name, line_number)); 336 exit (EXIT_FAILURE); 337 } 338 return uc; 339 } 340 } 341#else 342 /* If we don't have iconv(), the only supported values for 343 xgettext_global_source_encoding and thus also for 344 xgettext_current_source_encoding are ASCII and UTF-8. */ 345 abort (); 346#endif 347 } 348 else 349 { 350 /* Read an UTF-8 encoded character. */ 351 unsigned char buf[6]; 352 unsigned int count; 353 int c; 354 unsigned int uc; 355 356 c = phase1_getc (); 357 if (c == EOF) 358 return UEOF; 359 buf[0] = c; 360 count = 1; 361 362 if (buf[0] >= 0xc0) 363 { 364 c = phase1_getc (); 365 if (c == EOF) 366 return UEOF; 367 buf[1] = c; 368 count = 2; 369 } 370 371 if (buf[0] >= 0xe0 372 && ((buf[1] ^ 0x80) < 0x40)) 373 { 374 c = phase1_getc (); 375 if (c == EOF) 376 return UEOF; 377 buf[2] = c; 378 count = 3; 379 } 380 381 if (buf[0] >= 0xf0 382 && ((buf[1] ^ 0x80) < 0x40) 383 && ((buf[2] ^ 0x80) < 0x40)) 384 { 385 c = phase1_getc (); 386 if (c == EOF) 387 return UEOF; 388 buf[3] = c; 389 count = 4; 390 } 391 392 if (buf[0] >= 0xf8 393 && ((buf[1] ^ 0x80) < 0x40) 394 && ((buf[2] ^ 0x80) < 0x40) 395 && ((buf[3] ^ 0x80) < 0x40)) 396 { 397 c = phase1_getc (); 398 if (c == EOF) 399 return UEOF; 400 buf[4] = c; 401 count = 5; 402 } 403 404 if (buf[0] >= 0xfc 405 && ((buf[1] ^ 0x80) < 0x40) 406 && ((buf[2] ^ 0x80) < 0x40) 407 && ((buf[3] ^ 0x80) < 0x40) 408 && ((buf[4] ^ 0x80) < 0x40)) 409 { 410 c = phase1_getc (); 411 if (c == EOF) 412 return UEOF; 413 buf[5] = c; 414 count = 6; 415 } 416 417 u8_mbtouc (&uc, buf, count); 418 return uc; 419 } 420} 421 422/* Supports only one pushback character. */ 423static void 424phase2_ungetc (int c) 425{ 426 if (c != UEOF) 427 { 428 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 429 abort (); 430 phase2_pushback[phase2_pushback_length++] = c; 431 } 432} 433 434 435/* Phase 3: Convert all line terminators to LF. 436 See ECMA-334 section 9.3.1. */ 437 438/* Line number defined in terms of phase3. */ 439static int logical_line_number; 440 441static int phase3_pushback[9]; 442static int phase3_pushback_length; 443 444/* Read the next Unicode UCS-4 character from the input file, mapping 445 all line terminators to U+000A, and dropping U+001A at the end of file. */ 446static int 447phase3_getc () 448{ 449 int c; 450 451 if (phase3_pushback_length) 452 { 453 c = phase3_pushback[--phase3_pushback_length]; 454 if (c == UNL) 455 ++logical_line_number; 456 return c; 457 } 458 459 c = phase2_getc (); 460 461 if (c == 0x000d) 462 { 463 int c1 = phase2_getc (); 464 465 if (c1 != UEOF && c1 != 0x000a) 466 phase2_ungetc (c1); 467 468 /* Seen line terminator CR or CR/LF. */ 469 ++logical_line_number; 470 return UNL; 471 } 472 473 if (c == 0x0085 || c == 0x2028 || c == 0x2029) 474 { 475 /* Seen Unicode word processor newline. */ 476 ++logical_line_number; 477 return UNL; 478 } 479 480 if (c == 0x001a) 481 { 482 int c1 = phase2_getc (); 483 484 if (c1 == UEOF) 485 /* Seen U+001A right before the end of file. */ 486 return UEOF; 487 488 phase2_ungetc (c1); 489 } 490 491 if (c == UNL) 492 ++logical_line_number; 493 return c; 494} 495 496/* Supports 9 characters of pushback. */ 497static void 498phase3_ungetc (int c) 499{ 500 if (c != UEOF) 501 { 502 if (c == UNL) 503 --logical_line_number; 504 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 505 abort (); 506 phase3_pushback[phase3_pushback_length++] = c; 507 } 508} 509 510 511/* ========================= Accumulating strings. ======================== */ 512 513/* A string buffer type that allows appending Unicode characters. 514 Returns the entire string in UTF-8 encoding. */ 515 516struct string_buffer 517{ 518 /* The part of the string that has already been converted to UTF-8. */ 519 char *utf8_buffer; 520 size_t utf8_buflen; 521 size_t utf8_allocated; 522}; 523 524/* Initialize a 'struct string_buffer' to empty. */ 525static inline void 526init_string_buffer (struct string_buffer *bp) 527{ 528 bp->utf8_buffer = NULL; 529 bp->utf8_buflen = 0; 530 bp->utf8_allocated = 0; 531} 532 533/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 534static inline void 535string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count) 536{ 537 if (bp->utf8_buflen + count > bp->utf8_allocated) 538 { 539 size_t new_allocated = 2 * bp->utf8_allocated + 10; 540 if (new_allocated < bp->utf8_buflen + count) 541 new_allocated = bp->utf8_buflen + count; 542 bp->utf8_allocated = new_allocated; 543 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 544 } 545} 546 547/* Auxiliary function: Append a Unicode character to bp->utf8. 548 uc must be < 0x110000. */ 549static inline void 550string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc) 551{ 552 unsigned char utf8buf[6]; 553 int count = u8_uctomb (utf8buf, uc, 6); 554 555 if (count < 0) 556 /* The caller should have ensured that uc is not out-of-range. */ 557 abort (); 558 559 string_buffer_append_unicode_grow (bp, count); 560 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 561 bp->utf8_buflen += count; 562} 563 564/* Return the string buffer's contents. */ 565static char * 566string_buffer_result (struct string_buffer *bp) 567{ 568 /* NUL-terminate it. */ 569 string_buffer_append_unicode_grow (bp, 1); 570 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 571 /* Return it. */ 572 return bp->utf8_buffer; 573} 574 575/* Free the memory pointed to by a 'struct string_buffer'. */ 576static inline void 577free_string_buffer (struct string_buffer *bp) 578{ 579 free (bp->utf8_buffer); 580} 581 582 583/* ======================== Accumulating comments. ======================== */ 584 585 586/* Accumulating a single comment line. */ 587 588static struct string_buffer comment_buffer; 589 590static inline void 591comment_start () 592{ 593 comment_buffer.utf8_buflen = 0; 594} 595 596static inline bool 597comment_at_start () 598{ 599 return (comment_buffer.utf8_buflen == 0); 600} 601 602static inline void 603comment_add (int c) 604{ 605 string_buffer_append_unicode (&comment_buffer, c); 606} 607 608static inline void 609comment_line_end (size_t chars_to_remove) 610{ 611 char *buffer = string_buffer_result (&comment_buffer); 612 size_t buflen = strlen (buffer); 613 614 buflen -= chars_to_remove; 615 while (buflen >= 1 616 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 617 --buflen; 618 buffer[buflen] = '\0'; 619 savable_comment_add (buffer); 620} 621 622 623/* These are for tracking whether comments count as immediately before 624 keyword. */ 625static int last_comment_line; 626static int last_non_comment_line; 627 628 629/* Phase 4: Replace each comment that is not inside a character constant or 630 string literal with a space or newline character. 631 See ECMA-334 section 9.3.2. */ 632 633static int 634phase4_getc () 635{ 636 int c0; 637 int c; 638 bool last_was_star; 639 640 c0 = phase3_getc (); 641 if (c0 != '/') 642 return c0; 643 c = phase3_getc (); 644 switch (c) 645 { 646 default: 647 phase3_ungetc (c); 648 return c0; 649 650 case '*': 651 /* C style comment. */ 652 comment_start (); 653 last_was_star = false; 654 for (;;) 655 { 656 c = phase3_getc (); 657 if (c == UEOF) 658 break; 659 /* We skip all leading white space, but not EOLs. */ 660 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 661 comment_add (c); 662 switch (c) 663 { 664 case UNL: 665 comment_line_end (1); 666 comment_start (); 667 last_was_star = false; 668 continue; 669 670 case '*': 671 last_was_star = true; 672 continue; 673 674 case '/': 675 if (last_was_star) 676 { 677 comment_line_end (2); 678 break; 679 } 680 /* FALLTHROUGH */ 681 682 default: 683 last_was_star = false; 684 continue; 685 } 686 break; 687 } 688 last_comment_line = logical_line_number; 689 return ' '; 690 691 case '/': 692 /* C++ style comment. */ 693 last_comment_line = logical_line_number; 694 comment_start (); 695 for (;;) 696 { 697 c = phase3_getc (); 698 if (c == UNL || c == UEOF) 699 break; 700 /* We skip all leading white space, but not EOLs. */ 701 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 702 comment_add (c); 703 } 704 phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */ 705 comment_line_end (0); 706 phase3_getc (); /* read the newline again */ 707 return UNL; 708 } 709} 710 711/* Supports only one pushback character. */ 712static void 713phase4_ungetc (int c) 714{ 715 phase3_ungetc (c); 716} 717 718 719/* ======================= Character classification. ====================== */ 720 721 722/* Return true if a given character is white space. 723 See ECMA-334 section 9.3.3. */ 724static bool 725is_whitespace (int c) 726{ 727 /* Unicode character class Zs, as of Unicode 4.0. */ 728 /* grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt */ 729 switch (c >> 8) 730 { 731 case 0x00: 732 return (c == 0x0020 || c == 0x00a0); 733 case 0x16: 734 return (c == 0x1680); 735 case 0x18: 736 return (c == 0x180e); 737 case 0x20: 738 return ((c >= 0x2000 && c <= 0x200b) || c == 0x202f || c == 0x205f); 739 case 0x30: 740 return (c == 0x3000); 741 default: 742 return false; 743 } 744} 745 746 747/* C# allows identifiers containing many Unicode characters. We recognize 748 them; to use an identifier with Unicode characters in a --keyword option, 749 it must be specified in UTF-8. */ 750 751static inline int 752bitmap_lookup (const void *table, unsigned int uc) 753{ 754 unsigned int index1 = uc >> 16; 755 if (index1 < ((const int *) table)[0]) 756 { 757 int lookup1 = ((const int *) table)[1 + index1]; 758 if (lookup1 >= 0) 759 { 760 unsigned int index2 = (uc >> 9) & 0x7f; 761 int lookup2 = ((const int *) table)[lookup1 + index2]; 762 if (lookup2 >= 0) 763 { 764 unsigned int index3 = (uc >> 5) & 0xf; 765 unsigned int lookup3 = ((const int *) table)[lookup2 + index3]; 766 767 return (lookup3 >> (uc & 0x1f)) & 1; 768 } 769 } 770 } 771 return 0; 772} 773 774/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0, 775 plus the underscore. */ 776static const 777struct 778 { 779 int header[1]; 780 int level1[3]; 781 int level2[3 << 7]; 782 /*unsigned*/ int level3[34 << 4]; 783 } 784table_identifier_start = 785{ 786 { 3 }, 787 { 4, 132, 260 }, 788 { 789 388, 404, 420, 436, 452, 468, 484, 500, 790 516, 532, 548, 564, 580, -1, 596, 612, 791 628, -1, -1, -1, -1, -1, -1, -1, 792 644, -1, 660, 660, 660, 660, 660, 660, 793 660, 660, 660, 660, 660, 660, 676, 660, 794 660, 660, 660, 660, 660, 660, 660, 660, 795 660, 660, 660, 660, 660, 660, 660, 660, 796 660, 660, 660, 660, 660, 660, 660, 660, 797 660, 660, 660, 660, 660, 660, 660, 660, 798 660, 660, 660, 660, 660, 660, 660, 692, 799 660, 660, 708, -1, -1, -1, 660, 660, 800 660, 660, 660, 660, 660, 660, 660, 660, 801 660, 660, 660, 660, 660, 660, 660, 660, 802 660, 660, 660, 724, -1, -1, -1, -1, 803 -1, -1, -1, -1, -1, -1, -1, -1, 804 -1, -1, -1, -1, 740, 756, 772, 788, 805 804, 820, 836, -1, 852, -1, -1, -1, 806 -1, -1, -1, -1, -1, -1, -1, -1, 807 -1, -1, -1, -1, -1, -1, -1, -1, 808 -1, -1, -1, -1, -1, -1, -1, -1, 809 -1, -1, -1, -1, -1, -1, -1, -1, 810 -1, -1, -1, -1, -1, -1, -1, -1, 811 -1, -1, -1, -1, -1, -1, -1, -1, 812 -1, -1, -1, -1, -1, -1, -1, -1, 813 -1, -1, -1, -1, -1, -1, -1, -1, 814 -1, -1, -1, -1, -1, -1, -1, -1, 815 -1, -1, -1, -1, -1, -1, -1, -1, 816 -1, -1, -1, -1, -1, -1, -1, -1, 817 -1, -1, -1, -1, -1, -1, -1, -1, 818 -1, -1, 868, 884, -1, -1, -1, -1, 819 -1, -1, -1, -1, -1, -1, -1, -1, 820 -1, -1, -1, -1, -1, -1, -1, -1, 821 660, 660, 660, 660, 660, 660, 660, 660, 822 660, 660, 660, 660, 660, 660, 660, 660, 823 660, 660, 660, 660, 660, 660, 660, 660, 824 660, 660, 660, 660, 660, 660, 660, 660, 825 660, 660, 660, 660, 660, 660, 660, 660, 826 660, 660, 660, 660, 660, 660, 660, 660, 827 660, 660, 660, 660, 660, 660, 660, 660, 828 660, 660, 660, 660, 660, 660, 660, 660, 829 660, 660, 660, 660, 660, 660, 660, 660, 830 660, 660, 660, 660, 660, 660, 660, 660, 831 660, 660, 660, 900, -1, -1, -1, -1, 832 -1, -1, -1, -1, -1, -1, -1, -1, 833 -1, -1, -1, -1, -1, -1, -1, -1, 834 -1, -1, -1, -1, -1, -1, -1, -1, 835 -1, -1, -1, -1, -1, -1, -1, -1, 836 -1, -1, -1, -1, 660, 916, -1, -1 837 }, 838 { 839 0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE, 840 0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF, 841 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 842 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 843 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF, 844 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F, 845 0x00000000, 0x00000000, 0x00000000, 0x04000000, 846 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF, 847 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 848 0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF, 849 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE, 850 0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF, 851 0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000, 852 0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060, 853 0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000, 854 0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000, 855 0x00000000, 0x00000000, 0x00000000, 0x00000000, 856 0x00000000, 0x00000000, 0x00000000, 0x00000000, 857 0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003, 858 0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003, 859 0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000, 860 0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003, 861 0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003, 862 0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000, 863 0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003, 864 0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003, 865 0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003, 866 0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000, 867 0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000, 868 0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000, 869 0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF, 870 0x00000F00, 0x00000000, 0x00000000, 0x00000000, 871 0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000, 872 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF, 873 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF, 874 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF, 875 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF, 876 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF, 877 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000, 878 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF, 879 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 880 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 881 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 882 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 883 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF, 884 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF, 885 0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF, 886 0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000, 887 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF, 888 0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000, 889 0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF, 890 0x00000000, 0x00000000, 0x00000000, 0x00000000, 891 0x00000000, 0x00000000, 0x00000000, 0x00000000, 892 0x00000000, 0x00000000, 0x00000000, 0x00000000, 893 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF, 894 0x00000000, 0x00000000, 0x00000000, 0x00000000, 895 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 896 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF, 897 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF, 898 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF, 899 0x00000000, 0x00000000, 0x00000000, 0x80020000, 900 0x00000000, 0x00000000, 0x00000000, 0x00000000, 901 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF, 902 0x0000000F, 0x00000000, 0x00000000, 0x00000000, 903 0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF, 904 0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF, 905 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 906 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000, 907 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 908 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 909 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 910 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 911 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 912 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 913 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 914 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000, 915 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 916 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 917 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 918 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000, 919 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 920 0x00001FFF, 0x00000000, 0x00000000, 0x00000000, 921 0x00000000, 0x00000000, 0x00000000, 0x00000000, 922 0x00000000, 0x00000000, 0x00000000, 0x00000000, 923 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 924 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 925 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 926 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000, 927 0x00000000, 0x00000000, 0x00000000, 0x00000000, 928 0x00000000, 0x00000000, 0x00000000, 0x00000000, 929 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 930 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 931 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF, 932 0x00000000, 0x00000000, 0x00000000, 0x00000000, 933 0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF, 934 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF, 935 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 936 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 937 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF, 938 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000, 939 0x00000000, 0x00000000, 0x00000000, 0xFFDF0000, 940 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF, 941 0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0, 942 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000, 943 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000, 944 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF, 945 0x00000000, 0x00000000, 0x00000000, 0x00000000, 946 0x00000000, 0x00000000, 0x00000000, 0x00000000, 947 0x00000000, 0x00000000, 0x00000000, 0x00000000, 948 0x00000000, 0x00000000, 0x00000000, 0x00000000, 949 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000, 950 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 951 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 952 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 953 0x00000000, 0x00000000, 0x00000000, 0x00000000, 954 0x00000000, 0x00000000, 0x00000000, 0x00000000, 955 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000, 956 0x00000000, 0x00000000, 0x00000000, 0x00000000, 957 0x00000000, 0x00000000, 0x00000000, 0x00000000, 958 0x00000000, 0x00000000, 0x00000000, 0x00000000, 959 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF, 960 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF, 961 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF, 962 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 963 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 964 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF, 965 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF, 966 0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000, 967 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 968 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000, 969 0x00000000, 0x00000000, 0x00000000, 0x00000000, 970 0x00000000, 0x00000000, 0x00000000, 0x00000000, 971 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 972 0x00000000, 0x00000000, 0x00000000, 0x00000000, 973 0x00000000, 0x00000000, 0x00000000, 0x00000000, 974 0x00000000, 0x00000000, 0x00000000, 0x00000000 975 } 976}; 977 978/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf, 979 as of Unicode 4.0. */ 980static const 981struct 982 { 983 int header[1]; 984 int level1[15]; 985 int level2[4 << 7]; 986 /*unsigned*/ int level3[36 << 4]; 987 } 988table_identifier_part = 989{ 990 { 15 }, 991 { 992 16, 144, 272, -1, -1, -1, -1, -1, 993 -1, -1, -1, -1, -1, -1, 400 994 }, 995 { 996 528, 544, 560, 576, 592, 608, 624, 640, 997 656, 672, 688, 704, 720, -1, 736, 752, 998 768, -1, -1, -1, -1, -1, -1, -1, 999 784, -1, 800, 800, 800, 800, 800, 800, 1000 800, 800, 800, 800, 800, 800, 816, 800, 1001 800, 800, 800, 800, 800, 800, 800, 800, 1002 800, 800, 800, 800, 800, 800, 800, 800, 1003 800, 800, 800, 800, 800, 800, 800, 800, 1004 800, 800, 800, 800, 800, 800, 800, 800, 1005 800, 800, 800, 800, 800, 800, 800, 832, 1006 800, 800, 848, -1, -1, -1, 800, 800, 1007 800, 800, 800, 800, 800, 800, 800, 800, 1008 800, 800, 800, 800, 800, 800, 800, 800, 1009 800, 800, 800, 864, -1, -1, -1, -1, 1010 -1, -1, -1, -1, -1, -1, -1, -1, 1011 -1, -1, -1, -1, 880, 896, 912, 928, 1012 944, 960, 976, -1, 992, -1, -1, -1, 1013 -1, -1, -1, -1, -1, -1, -1, -1, 1014 -1, -1, -1, -1, -1, -1, -1, -1, 1015 -1, -1, -1, -1, -1, -1, -1, -1, 1016 -1, -1, -1, -1, -1, -1, -1, -1, 1017 -1, -1, -1, -1, -1, -1, -1, -1, 1018 -1, -1, -1, -1, -1, -1, -1, -1, 1019 -1, -1, -1, -1, -1, -1, -1, -1, 1020 -1, -1, -1, -1, -1, -1, -1, -1, 1021 -1, -1, -1, -1, -1, -1, -1, -1, 1022 -1, -1, -1, -1, -1, -1, -1, -1, 1023 -1, -1, -1, -1, -1, -1, -1, -1, 1024 -1, -1, -1, -1, -1, -1, -1, -1, 1025 1008, -1, 1024, 1040, -1, -1, -1, -1, 1026 -1, -1, -1, -1, -1, -1, -1, -1, 1027 -1, -1, -1, -1, -1, -1, -1, -1, 1028 800, 800, 800, 800, 800, 800, 800, 800, 1029 800, 800, 800, 800, 800, 800, 800, 800, 1030 800, 800, 800, 800, 800, 800, 800, 800, 1031 800, 800, 800, 800, 800, 800, 800, 800, 1032 800, 800, 800, 800, 800, 800, 800, 800, 1033 800, 800, 800, 800, 800, 800, 800, 800, 1034 800, 800, 800, 800, 800, 800, 800, 800, 1035 800, 800, 800, 800, 800, 800, 800, 800, 1036 800, 800, 800, 800, 800, 800, 800, 800, 1037 800, 800, 800, 800, 800, 800, 800, 800, 1038 800, 800, 800, 1056, -1, -1, -1, -1, 1039 -1, -1, -1, -1, -1, -1, -1, -1, 1040 -1, -1, -1, -1, -1, -1, -1, -1, 1041 -1, -1, -1, -1, -1, -1, -1, -1, 1042 -1, -1, -1, -1, -1, -1, -1, -1, 1043 -1, -1, -1, -1, 800, 1072, -1, -1, 1044 1088, -1, -1, -1, -1, -1, -1, -1, 1045 -1, -1, -1, -1, -1, -1, -1, -1, 1046 -1, -1, -1, -1, -1, -1, -1, -1, 1047 -1, -1, -1, -1, -1, -1, -1, -1, 1048 -1, -1, -1, -1, -1, -1, -1, -1, 1049 -1, -1, -1, -1, -1, -1, -1, -1, 1050 -1, -1, -1, -1, -1, -1, -1, -1, 1051 -1, -1, -1, -1, -1, -1, -1, -1, 1052 -1, -1, -1, -1, -1, -1, -1, -1, 1053 -1, -1, -1, -1, -1, -1, -1, -1, 1054 -1, -1, -1, -1, -1, -1, -1, -1, 1055 -1, -1, -1, -1, -1, -1, -1, -1, 1056 -1, -1, -1, -1, -1, -1, -1, -1, 1057 -1, -1, -1, -1, -1, -1, -1, -1, 1058 -1, -1, -1, -1, -1, -1, -1, -1, 1059 -1, -1, -1, -1, -1, -1, -1, -1 1060 }, 1061 { 1062 0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 1063 0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF, 1064 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1065 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1066 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF, 1067 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F, 1068 0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF, 1069 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF, 1070 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1071 0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF, 1072 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE, 1073 0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF, 1074 0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF, 1075 0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF, 1076 0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000, 1077 0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000, 1078 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1079 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1080 0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF, 1081 0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF, 1082 0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0, 1083 0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF, 1084 0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3, 1085 0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80, 1086 0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3, 1087 0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3, 1088 0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3, 1089 0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000, 1090 0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000, 1091 0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000, 1092 0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF, 1093 0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000, 1094 0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000, 1095 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF, 1096 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF, 1097 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF, 1098 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF, 1099 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF, 1100 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00, 1101 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF, 1102 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1103 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1104 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1105 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1106 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF, 1107 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF, 1108 0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF, 1109 0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF, 1110 0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF, 1111 0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000, 1112 0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF, 1113 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1114 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1115 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1116 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF, 1117 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1118 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1119 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF, 1120 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF, 1121 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF, 1122 0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F, 1123 0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2, 1124 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF, 1125 0x0000000F, 0x00000000, 0x00000000, 0x00000000, 1126 0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF, 1127 0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 1128 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 1129 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000, 1130 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1131 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1132 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1133 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1134 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1135 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1136 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1137 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000, 1138 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1139 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1140 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1141 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000, 1142 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1143 0x00001FFF, 0x00000000, 0x00000000, 0x00000000, 1144 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1145 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1146 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1147 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1148 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1149 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000, 1150 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1151 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1152 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1153 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1154 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF, 1155 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1156 0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF, 1157 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF, 1158 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1159 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1160 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF, 1161 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000, 1162 0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000, 1163 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF, 1164 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0, 1165 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000, 1166 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000, 1167 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF, 1168 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1169 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1170 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1171 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1172 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000, 1173 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 1174 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1175 0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000, 1176 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1177 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1178 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000, 1179 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1180 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1181 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1182 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1183 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1184 0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0, 1185 0x00000FE7, 0x00003C00, 0x00000000, 0x00000000, 1186 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF, 1187 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF, 1188 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF, 1189 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1190 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1191 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF, 1192 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF, 1193 0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF, 1194 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1195 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000, 1196 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1197 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1198 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 1199 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1200 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1201 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1202 0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1203 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1204 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1205 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF 1206 } 1207}; 1208 1209/* Return true if a given character can occur as first character of an 1210 identifier. See ECMA-334 section 9.4.2. */ 1211static bool 1212is_identifier_start (int c) 1213{ 1214 return bitmap_lookup (&table_identifier_start, c); 1215 /* In ASCII only this would be: 1216 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'); 1217 */ 1218} 1219 1220/* Return true if a given character can occur as character of an identifier. 1221 See ECMA-334 section 9.4.2. */ 1222static bool 1223is_identifier_part (int c) 1224{ 1225 return bitmap_lookup (&table_identifier_part, c); 1226 /* In ASCII only this would be: 1227 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 1228 || (c >= '0' && c <= '9') || c == '_'); 1229 */ 1230} 1231 1232static bool 1233is_any_character (int c) 1234{ 1235 return true; 1236} 1237 1238 1239/* ======================= Preprocessor directives. ======================= */ 1240 1241 1242/* Phase 5: Remove preprocessor lines. See ECMA-334 section 9.5. 1243 As a side effect, this also removes initial whitespace on every line; 1244 this whitespace doesn't matter. */ 1245 1246static int phase5_pushback[10]; 1247static int phase5_pushback_length; 1248 1249static int 1250phase5_getc () 1251{ 1252 int c; 1253 1254 if (phase5_pushback_length) 1255 return phase5_pushback[--phase5_pushback_length]; 1256 1257 c = phase4_getc (); 1258 if (c != UNL) 1259 return c; 1260 1261 do 1262 c = phase3_getc (); 1263 while (c != UEOF && is_whitespace (c)); 1264 1265 if (c == '#') 1266 { 1267 /* Ignore the entire line containing the preprocessor directive 1268 (including the // comment if it contains one). */ 1269 do 1270 c = phase3_getc (); 1271 while (c != UEOF && c != UNL); 1272 return c; 1273 } 1274 else 1275 { 1276 phase3_ungetc (c); 1277 return UNL; 1278 } 1279} 1280 1281#ifdef unused 1282static void 1283phase5_ungetc (int c) 1284{ 1285 if (c != UEOF) 1286 { 1287 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1288 abort (); 1289 phase5_pushback[phase5_pushback_length++] = c; 1290 } 1291} 1292#endif 1293 1294 1295/* ========================== Reading of tokens. ========================== */ 1296 1297enum token_type_ty 1298{ 1299 token_type_eof, 1300 token_type_lparen, /* ( */ 1301 token_type_rparen, /* ) */ 1302 token_type_lbrace, /* { */ 1303 token_type_rbrace, /* } */ 1304 token_type_comma, /* , */ 1305 token_type_dot, /* . */ 1306 token_type_string_literal, /* "abc", @"abc" */ 1307 token_type_number, /* 1.23 */ 1308 token_type_symbol, /* identifier, keyword, null */ 1309 token_type_plus, /* + */ 1310 token_type_other /* character literal, misc. operator */ 1311}; 1312typedef enum token_type_ty token_type_ty; 1313 1314typedef struct token_ty token_ty; 1315struct token_ty 1316{ 1317 token_type_ty type; 1318 char *string; /* for token_type_string_literal, token_type_symbol */ 1319 refcounted_string_list_ty *comment; /* for token_type_string_literal */ 1320 int line_number; 1321 int logical_line_number; 1322}; 1323 1324 1325/* Free the memory pointed to by a 'struct token_ty'. */ 1326static inline void 1327free_token (token_ty *tp) 1328{ 1329 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 1330 free (tp->string); 1331 if (tp->type == token_type_string_literal) 1332 drop_reference (tp->comment); 1333} 1334 1335 1336/* Read a Unicode escape sequence outside string/character literals. 1337 Reject Unicode escapes that don't fulfill the given predicate. 1338 See ECMA-334 section 9.4.2. */ 1339static int 1340do_getc_unicode_escaped (bool (*predicate) (int)) 1341{ 1342 int c; 1343 1344 /* Use phase 3, because phase 4 elides comments. */ 1345 c = phase3_getc (); 1346 if (c == UEOF) 1347 return '\\'; 1348 if (c == 'u' || c == 'U') 1349 { 1350 unsigned char buf[8]; 1351 int expect; 1352 unsigned int n; 1353 int i; 1354 1355 expect = (c == 'U' ? 8 : 4); 1356 n = 0; 1357 for (i = 0; i < expect; i++) 1358 { 1359 int c1 = phase3_getc (); 1360 1361 if (c1 >= '0' && c1 <= '9') 1362 n = (n << 4) + (c1 - '0'); 1363 else if (c1 >= 'A' && c1 <= 'F') 1364 n = (n << 4) + (c1 - 'A' + 10); 1365 else if (c1 >= 'a' && c1 <= 'f') 1366 n = (n << 4) + (c1 - 'a' + 10); 1367 else 1368 { 1369 phase3_ungetc (c1); 1370 while (--i >= 0) 1371 phase3_ungetc (buf[i]); 1372 phase3_ungetc (c); 1373 return '\\'; 1374 } 1375 1376 buf[i] = c1; 1377 } 1378 1379 if (n >= 0x110000) 1380 { 1381 error_with_progname = false; 1382 error (0, 0, _("%s:%d: warning: invalid Unicode character"), 1383 logical_file_name, line_number); 1384 error_with_progname = true; 1385 } 1386 else if (predicate (n)) 1387 return n; 1388 1389 while (--i >= 0) 1390 phase3_ungetc (buf[i]); 1391 } 1392 phase3_ungetc (c); 1393 return '\\'; 1394} 1395 1396 1397/* Read an escape sequence inside a string literal or character literal. 1398 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */ 1399static int 1400do_getc_escaped () 1401{ 1402 int c; 1403 int n; 1404 int i; 1405 1406 /* Use phase 3, because phase 4 elides comments. */ 1407 c = phase3_getc (); 1408 if (c == UEOF) 1409 return '\\'; 1410 switch (c) 1411 { 1412 case 'a': 1413 return 0x0007; 1414 case 'b': 1415 return 0x0008; 1416 case 't': 1417 return 0x0009; 1418 case 'n': 1419 return 0x000a; 1420 case 'v': 1421 return 0x000b; 1422 case 'f': 1423 return 0x000c; 1424 case 'r': 1425 return 0x000d; 1426 case '"': 1427 return '"'; 1428 case '\'': 1429 return '\''; 1430 case '\\': 1431 return '\\'; 1432 case '0': 1433 return 0x0000; 1434 case 'x': 1435 c = phase3_getc (); 1436 switch (c) 1437 { 1438 default: 1439 phase3_ungetc (c); 1440 phase3_ungetc ('x'); 1441 return '\\'; 1442 1443 case '0': case '1': case '2': case '3': case '4': 1444 case '5': case '6': case '7': case '8': case '9': 1445 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1446 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1447 break; 1448 } 1449 n = 0; 1450 for (i = 0;; i++) 1451 { 1452 switch (c) 1453 { 1454 default: 1455 phase3_ungetc (c); 1456 return n; 1457 case '0': case '1': case '2': case '3': case '4': 1458 case '5': case '6': case '7': case '8': case '9': 1459 n = n * 16 + c - '0'; 1460 break; 1461 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1462 n = n * 16 + 10 + c - 'A'; 1463 break; 1464 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1465 n = n * 16 + 10 + c - 'a'; 1466 break; 1467 } 1468 if (i == 3) 1469 break; 1470 c = phase3_getc (); 1471 } 1472 return n; 1473 case 'u': case 'U': 1474 phase3_ungetc (c); 1475 return do_getc_unicode_escaped (is_any_character); 1476 default: 1477 /* Invalid escape sequence. */ 1478 phase3_ungetc (c); 1479 return '\\'; 1480 } 1481} 1482 1483/* Read a regular string literal or character literal. 1484 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */ 1485static void 1486accumulate_escaped (struct string_buffer *literal, int delimiter) 1487{ 1488 int c; 1489 1490 for (;;) 1491 { 1492 /* Use phase 3, because phase 4 elides comments. */ 1493 c = phase3_getc (); 1494 if (c == UEOF || c == delimiter) 1495 break; 1496 if (c == UNL) 1497 { 1498 phase3_ungetc (c); 1499 error_with_progname = false; 1500 if (delimiter == '\'') 1501 error (0, 0, _("%s:%d: warning: unterminated character constant"), 1502 logical_file_name, line_number); 1503 else 1504 error (0, 0, _("%s:%d: warning: unterminated string constant"), 1505 logical_file_name, line_number); 1506 error_with_progname = true; 1507 break; 1508 } 1509 if (c == '\\') 1510 c = do_getc_escaped (); 1511 string_buffer_append_unicode (literal, c); 1512 } 1513} 1514 1515 1516/* Combine characters into tokens. Discard whitespace. */ 1517 1518/* Maximum used guaranteed to be < 4. */ 1519static token_ty phase6_pushback[4]; 1520static int phase6_pushback_length; 1521 1522static void 1523phase6_get (token_ty *tp) 1524{ 1525 int c; 1526 1527 if (phase6_pushback_length) 1528 { 1529 *tp = phase6_pushback[--phase6_pushback_length]; 1530 return; 1531 } 1532 tp->string = NULL; 1533 1534 for (;;) 1535 { 1536 tp->line_number = line_number; 1537 tp->logical_line_number = logical_line_number; 1538 c = phase5_getc (); 1539 1540 if (c == UEOF) 1541 { 1542 tp->type = token_type_eof; 1543 return; 1544 } 1545 1546 switch (c) 1547 { 1548 case UNL: 1549 if (last_non_comment_line > last_comment_line) 1550 savable_comment_reset (); 1551 /* FALLTHROUGH */ 1552 case ' ': 1553 case '\t': 1554 case '\f': 1555 /* Ignore whitespace and comments. */ 1556 continue; 1557 } 1558 1559 last_non_comment_line = tp->logical_line_number; 1560 1561 switch (c) 1562 { 1563 case '(': 1564 tp->type = token_type_lparen; 1565 return; 1566 1567 case ')': 1568 tp->type = token_type_rparen; 1569 return; 1570 1571 case '{': 1572 tp->type = token_type_lbrace; 1573 return; 1574 1575 case '}': 1576 tp->type = token_type_rbrace; 1577 return; 1578 1579 case ',': 1580 tp->type = token_type_comma; 1581 return; 1582 1583 case '.': 1584 c = phase4_getc (); 1585 if (!(c >= '0' && c <= '9')) 1586 { 1587 phase4_ungetc (c); 1588 tp->type = token_type_dot; 1589 return; 1590 } 1591 /* FALLTHROUGH */ 1592 1593 case '0': case '1': case '2': case '3': case '4': 1594 case '5': case '6': case '7': case '8': case '9': 1595 { 1596 /* Don't need to verify the complicated syntax of integers and 1597 floating-point numbers. We assume a valid C# input. 1598 The simplified syntax that we recognize as number is: any 1599 sequence of alphanumeric characters, additionally '+' and '-' 1600 immediately after 'e' or 'E' except in hexadecimal numbers. */ 1601 bool hexadecimal = false; 1602 1603 for (;;) 1604 { 1605 c = phase4_getc (); 1606 if (c >= '0' && c <= '9') 1607 continue; 1608 if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z')) 1609 { 1610 if (c == 'X' || c == 'x') 1611 hexadecimal = true; 1612 if ((c == 'E' || c == 'e') && !hexadecimal) 1613 { 1614 c = phase4_getc (); 1615 if (!(c == '+' || c == '-')) 1616 phase4_ungetc (c); 1617 } 1618 continue; 1619 } 1620 if (c == '.') 1621 continue; 1622 break; 1623 } 1624 phase4_ungetc (c); 1625 tp->type = token_type_number; 1626 return; 1627 } 1628 1629 case '"': 1630 /* Regular string literal. */ 1631 { 1632 struct string_buffer literal; 1633 1634 init_string_buffer (&literal); 1635 accumulate_escaped (&literal, '"'); 1636 tp->string = xstrdup (string_buffer_result (&literal)); 1637 free_string_buffer (&literal); 1638 tp->comment = add_reference (savable_comment); 1639 tp->type = token_type_string_literal; 1640 return; 1641 } 1642 1643 case '\'': 1644 /* Character literal. */ 1645 { 1646 struct string_buffer literal; 1647 1648 init_string_buffer (&literal); 1649 accumulate_escaped (&literal, '\''); 1650 free_string_buffer (&literal); 1651 tp->type = token_type_other; 1652 return; 1653 } 1654 1655 case '+': 1656 c = phase4_getc (); 1657 if (c == '+') 1658 /* Operator ++ */ 1659 tp->type = token_type_other; 1660 else if (c == '=') 1661 /* Operator += */ 1662 tp->type = token_type_other; 1663 else 1664 { 1665 /* Operator + */ 1666 phase4_ungetc (c); 1667 tp->type = token_type_plus; 1668 } 1669 return; 1670 1671 case '@': 1672 c = phase4_getc (); 1673 if (c == '"') 1674 { 1675 /* Verbatim string literal. */ 1676 struct string_buffer literal; 1677 1678 init_string_buffer (&literal); 1679 for (;;) 1680 { 1681 /* Use phase 2, because phase 4 elides comments and phase 3 1682 mixes up the newline characters. */ 1683 c = phase2_getc (); 1684 if (c == UEOF) 1685 break; 1686 if (c == '"') 1687 { 1688 c = phase2_getc (); 1689 if (c != '"') 1690 { 1691 phase2_ungetc (c); 1692 break; 1693 } 1694 } 1695 /* No special treatment of newline and backslash here. */ 1696 string_buffer_append_unicode (&literal, c); 1697 } 1698 tp->string = xstrdup (string_buffer_result (&literal)); 1699 free_string_buffer (&literal); 1700 tp->comment = add_reference (savable_comment); 1701 tp->type = token_type_string_literal; 1702 return; 1703 } 1704 /* FALLTHROUGH, so that @identifier is recognized. */ 1705 1706 default: 1707 if (c == '\\') 1708 c = do_getc_unicode_escaped (is_identifier_start); 1709 if (is_identifier_start (c)) 1710 { 1711 static struct string_buffer buffer; 1712 buffer.utf8_buflen = 0; 1713 for (;;) 1714 { 1715 string_buffer_append_unicode (&buffer, c); 1716 c = phase4_getc (); 1717 if (c == '\\') 1718 c = do_getc_unicode_escaped (is_identifier_part); 1719 if (!is_identifier_part (c)) 1720 break; 1721 } 1722 phase4_ungetc (c); 1723 tp->string = xstrdup (string_buffer_result (&buffer)); 1724 tp->type = token_type_symbol; 1725 return; 1726 } 1727 else 1728 { 1729 /* Misc. operator. */ 1730 tp->type = token_type_other; 1731 return; 1732 } 1733 } 1734 } 1735} 1736 1737/* Supports 3 tokens of pushback. */ 1738static void 1739phase6_unget (token_ty *tp) 1740{ 1741 if (tp->type != token_type_eof) 1742 { 1743 if (phase6_pushback_length == SIZEOF (phase6_pushback)) 1744 abort (); 1745 phase6_pushback[phase6_pushback_length++] = *tp; 1746 } 1747} 1748 1749 1750/* Compile-time optimization of string literal concatenation. 1751 Combine "string1" + ... + "stringN" to the concatenated string if 1752 - the token after this expression is not '.' (because then the last 1753 string could be part of a method call expression). */ 1754 1755static token_ty phase7_pushback[2]; 1756static int phase7_pushback_length; 1757 1758static void 1759phase7_get (token_ty *tp) 1760{ 1761 if (phase7_pushback_length) 1762 { 1763 *tp = phase7_pushback[--phase7_pushback_length]; 1764 return; 1765 } 1766 1767 phase6_get (tp); 1768 if (tp->type == token_type_string_literal) 1769 { 1770 char *sum = tp->string; 1771 size_t sum_len = strlen (sum); 1772 1773 for (;;) 1774 { 1775 token_ty token2; 1776 1777 phase6_get (&token2); 1778 if (token2.type == token_type_plus) 1779 { 1780 token_ty token3; 1781 1782 phase6_get (&token3); 1783 if (token3.type == token_type_string_literal) 1784 { 1785 token_ty token_after; 1786 1787 phase6_get (&token_after); 1788 if (token_after.type != token_type_dot) 1789 { 1790 char *addend = token3.string; 1791 size_t addend_len = strlen (addend); 1792 1793 sum = (char *) xrealloc (sum, sum_len + addend_len + 1); 1794 memcpy (sum + sum_len, addend, addend_len + 1); 1795 sum_len += addend_len; 1796 1797 phase6_unget (&token_after); 1798 free_token (&token3); 1799 free_token (&token2); 1800 continue; 1801 } 1802 phase6_unget (&token_after); 1803 } 1804 phase6_unget (&token3); 1805 } 1806 phase6_unget (&token2); 1807 break; 1808 } 1809 tp->string = sum; 1810 } 1811} 1812 1813/* Supports 2 tokens of pushback. */ 1814static void 1815phase7_unget (token_ty *tp) 1816{ 1817 if (tp->type != token_type_eof) 1818 { 1819 if (phase7_pushback_length == SIZEOF (phase7_pushback)) 1820 abort (); 1821 phase7_pushback[phase7_pushback_length++] = *tp; 1822 } 1823} 1824 1825 1826static void 1827x_csharp_lex (token_ty *tp) 1828{ 1829 phase7_get (tp); 1830} 1831 1832/* Supports 2 tokens of pushback. */ 1833static void 1834x_csharp_unlex (token_ty *tp) 1835{ 1836 phase7_unget (tp); 1837} 1838 1839 1840/* ========================= Extracting strings. ========================== */ 1841 1842 1843/* Context lookup table. */ 1844static flag_context_list_table_ty *flag_context_list_table; 1845 1846 1847/* The file is broken into tokens. Scan the token stream, looking for 1848 a keyword, followed by a left paren, followed by a string. When we 1849 see this sequence, we have something to remember. We assume we are 1850 looking at a valid C or C++ program, and leave the complaints about 1851 the grammar to the compiler. 1852 1853 Normal handling: Look for 1854 keyword ( ... msgid ... ) 1855 Plural handling: Look for 1856 keyword ( ... msgid ... msgid_plural ... ) 1857 1858 We use recursion because the arguments before msgid or between msgid 1859 and msgid_plural can contain subexpressions of the same form. */ 1860 1861 1862/* Extract messages until the next balanced closing parenthesis or brace, 1863 depending on TERMINATOR. 1864 Extracted messages are added to MLP. 1865 Return true upon eof, false upon closing parenthesis or brace. */ 1866static bool 1867extract_parenthesized (message_list_ty *mlp, token_type_ty terminator, 1868 flag_context_ty outer_context, 1869 flag_context_list_iterator_ty context_iter, 1870 struct arglist_parser *argparser) 1871{ 1872 /* Current argument number. */ 1873 int arg = 1; 1874 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1875 int state; 1876 /* Parameters of the keyword just seen. Defined only in state 1. */ 1877 const struct callshapes *next_shapes = NULL; 1878 /* Context iterator that will be used if the next token is a '('. */ 1879 flag_context_list_iterator_ty next_context_iter = 1880 passthrough_context_list_iterator; 1881 /* Current context. */ 1882 flag_context_ty inner_context = 1883 inherited_context (outer_context, 1884 flag_context_list_iterator_advance (&context_iter)); 1885 1886 /* Start state is 0. */ 1887 state = 0; 1888 1889 for (;;) 1890 { 1891 token_ty token; 1892 1893 x_csharp_lex (&token); 1894 switch (token.type) 1895 { 1896 case token_type_symbol: 1897 { 1898 /* Combine symbol1 . ... . symbolN to a single strings, so that 1899 we can recognize static function calls like 1900 GettextResource.gettext. The information present for 1901 symbolI.....symbolN has precedence over the information for 1902 symbolJ.....symbolN with J > I. */ 1903 char *sum = token.string; 1904 size_t sum_len = strlen (sum); 1905 const char *dottedname; 1906 flag_context_list_ty *context_list; 1907 1908 for (;;) 1909 { 1910 token_ty token2; 1911 1912 x_csharp_lex (&token2); 1913 if (token2.type == token_type_dot) 1914 { 1915 token_ty token3; 1916 1917 x_csharp_lex (&token3); 1918 if (token3.type == token_type_symbol) 1919 { 1920 char *addend = token3.string; 1921 size_t addend_len = strlen (addend); 1922 1923 sum = 1924 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1); 1925 sum[sum_len] = '.'; 1926 memcpy (sum + sum_len + 1, addend, addend_len + 1); 1927 sum_len += 1 + addend_len; 1928 1929 free_token (&token3); 1930 free_token (&token2); 1931 continue; 1932 } 1933 x_csharp_unlex (&token3); 1934 } 1935 x_csharp_unlex (&token2); 1936 break; 1937 } 1938 1939 for (dottedname = sum;;) 1940 { 1941 void *keyword_value; 1942 1943 if (hash_find_entry (&keywords, dottedname, strlen (dottedname), 1944 &keyword_value) 1945 == 0) 1946 { 1947 next_shapes = (const struct callshapes *) keyword_value; 1948 state = 1; 1949 break; 1950 } 1951 1952 dottedname = strchr (dottedname, '.'); 1953 if (dottedname == NULL) 1954 { 1955 state = 0; 1956 break; 1957 } 1958 dottedname++; 1959 } 1960 1961 for (dottedname = sum;;) 1962 { 1963 context_list = 1964 flag_context_list_table_lookup ( 1965 flag_context_list_table, 1966 dottedname, strlen (dottedname)); 1967 if (context_list != NULL) 1968 break; 1969 1970 dottedname = strchr (dottedname, '.'); 1971 if (dottedname == NULL) 1972 break; 1973 dottedname++; 1974 } 1975 next_context_iter = flag_context_list_iterator (context_list); 1976 1977 free (sum); 1978 continue; 1979 } 1980 1981 case token_type_lparen: 1982 if (extract_parenthesized (mlp, token_type_rparen, 1983 inner_context, next_context_iter, 1984 arglist_parser_alloc (mlp, 1985 state ? next_shapes : NULL))) 1986 { 1987 xgettext_current_source_encoding = po_charset_utf8; 1988 arglist_parser_done (argparser, arg); 1989 xgettext_current_source_encoding = xgettext_global_source_encoding; 1990 return true; 1991 } 1992 next_context_iter = null_context_list_iterator; 1993 state = 0; 1994 continue; 1995 1996 case token_type_rparen: 1997 if (terminator == token_type_rparen) 1998 { 1999 xgettext_current_source_encoding = po_charset_utf8; 2000 arglist_parser_done (argparser, arg); 2001 xgettext_current_source_encoding = xgettext_global_source_encoding; 2002 return false; 2003 } 2004 if (terminator == token_type_rbrace) 2005 { 2006 error_with_progname = false; 2007 error (0, 0, 2008 _("%s:%d: warning: ')' found where '}' was expected"), 2009 logical_file_name, token.line_number); 2010 error_with_progname = true; 2011 } 2012 next_context_iter = null_context_list_iterator; 2013 state = 0; 2014 continue; 2015 2016 case token_type_lbrace: 2017 if (extract_parenthesized (mlp, token_type_rbrace, 2018 null_context, null_context_list_iterator, 2019 arglist_parser_alloc (mlp, NULL))) 2020 { 2021 xgettext_current_source_encoding = po_charset_utf8; 2022 arglist_parser_done (argparser, arg); 2023 xgettext_current_source_encoding = xgettext_global_source_encoding; 2024 return true; 2025 } 2026 next_context_iter = null_context_list_iterator; 2027 state = 0; 2028 continue; 2029 2030 case token_type_rbrace: 2031 if (terminator == token_type_rbrace) 2032 { 2033 xgettext_current_source_encoding = po_charset_utf8; 2034 arglist_parser_done (argparser, arg); 2035 xgettext_current_source_encoding = xgettext_global_source_encoding; 2036 return false; 2037 } 2038 if (terminator == token_type_rparen) 2039 { 2040 error_with_progname = false; 2041 error (0, 0, 2042 _("%s:%d: warning: '}' found where ')' was expected"), 2043 logical_file_name, token.line_number); 2044 error_with_progname = true; 2045 } 2046 next_context_iter = null_context_list_iterator; 2047 state = 0; 2048 continue; 2049 2050 case token_type_comma: 2051 arg++; 2052 inner_context = 2053 inherited_context (outer_context, 2054 flag_context_list_iterator_advance ( 2055 &context_iter)); 2056 next_context_iter = passthrough_context_list_iterator; 2057 state = 0; 2058 continue; 2059 2060 case token_type_string_literal: 2061 { 2062 lex_pos_ty pos; 2063 pos.file_name = logical_file_name; 2064 pos.line_number = token.line_number; 2065 2066 xgettext_current_source_encoding = po_charset_utf8; 2067 if (extract_all) 2068 remember_a_message (mlp, NULL, token.string, inner_context, 2069 &pos, token.comment); 2070 else 2071 arglist_parser_remember (argparser, arg, token.string, 2072 inner_context, 2073 pos.file_name, pos.line_number, 2074 token.comment); 2075 xgettext_current_source_encoding = xgettext_global_source_encoding; 2076 } 2077 drop_reference (token.comment); 2078 next_context_iter = null_context_list_iterator; 2079 state = 0; 2080 continue; 2081 2082 case token_type_eof: 2083 xgettext_current_source_encoding = po_charset_utf8; 2084 arglist_parser_done (argparser, arg); 2085 xgettext_current_source_encoding = xgettext_global_source_encoding; 2086 return true; 2087 2088 case token_type_dot: 2089 case token_type_number: 2090 case token_type_plus: 2091 case token_type_other: 2092 next_context_iter = null_context_list_iterator; 2093 state = 0; 2094 continue; 2095 2096 default: 2097 abort (); 2098 } 2099 } 2100} 2101 2102 2103void 2104extract_csharp (FILE *f, 2105 const char *real_filename, const char *logical_filename, 2106 flag_context_list_table_ty *flag_table, 2107 msgdomain_list_ty *mdlp) 2108{ 2109 message_list_ty *mlp = mdlp->item[0]->messages; 2110 2111 fp = f; 2112 real_file_name = real_filename; 2113 logical_file_name = xstrdup (logical_filename); 2114 line_number = 1; 2115 2116 logical_line_number = 1; 2117 last_comment_line = -1; 2118 last_non_comment_line = -1; 2119 2120 flag_context_list_table = flag_table; 2121 2122 init_keywords (); 2123 2124 /* Eat tokens until eof is seen. When extract_parenthesized returns 2125 due to an unbalanced closing parenthesis, just restart it. */ 2126 while (!extract_parenthesized (mlp, token_type_eof, 2127 null_context, null_context_list_iterator, 2128 arglist_parser_alloc (mlp, NULL))) 2129 ; 2130 2131 fp = NULL; 2132 real_file_name = NULL; 2133 logical_file_name = NULL; 2134 line_number = 0; 2135} 2136