1/* xgettext C# backend. 2 Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#ifdef HAVE_CONFIG_H 19# include "config.h" 20#endif 21 22/* Specification. */ 23#include "x-csharp.h" 24 25#include <errno.h> 26#include <stdbool.h> 27#include <stdio.h> 28#include <stdlib.h> 29#include <string.h> 30 31#include "message.h" 32#include "xgettext.h" 33#include "x-csharp.h" 34#include "c-ctype.h" 35#include "error.h" 36#include "error-progname.h" 37#include "xalloc.h" 38#include "xerror.h" 39#include "xvasprintf.h" 40#include "hash.h" 41#include "po-charset.h" 42#include "unistr.h" 43#include "gettext.h" 44 45#define _(s) gettext(s) 46 47#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 48 49 50/* The C# syntax is defined in ECMA-334, second edition. */ 51 52 53/* ====================== Keyword set customization. ====================== */ 54 55/* If true extract all strings. */ 56static bool extract_all = false; 57 58static hash_table keywords; 59static bool default_keywords = true; 60 61 62void 63x_csharp_extract_all () 64{ 65 extract_all = true; 66} 67 68 69/* Processes a --keyword option. 70 Non-ASCII function names can be used if given in UTF-8 encoding. */ 71void 72x_csharp_keyword (const char *name) 73{ 74 if (name == NULL) 75 default_keywords = false; 76 else 77 { 78 const char *end; 79 struct callshape shape; 80 const char *colon; 81 82 if (keywords.table == NULL) 83 hash_init (&keywords, 100); 84 85 split_keywordspec (name, &end, &shape); 86 87 /* The characters between name and end should form a valid C# 88 identifier sequence with dots. 89 A colon means an invalid parse in split_keywordspec(). */ 90 colon = strchr (name, ':'); 91 if (colon == NULL || colon >= end) 92 insert_keyword_callshape (&keywords, name, end - name, &shape); 93 } 94} 95 96/* Finish initializing the keywords hash table. 97 Called after argument processing, before each file is processed. */ 98static void 99init_keywords () 100{ 101 if (default_keywords) 102 { 103 /* When adding new keywords here, also update the documentation in 104 xgettext.texi! */ 105 x_csharp_keyword ("GetString"); /* Resource{Manager,Set}.GetString */ 106 x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */ 107 x_csharp_keyword ("GetParticularString:1c,2"); /* Resource{Manager,Set}.GetParticularString */ 108 x_csharp_keyword ("GetParticularPluralString:1c,2,3"); /* Resource{Manager,Set}.GetParticularPluralString */ 109 default_keywords = false; 110 } 111} 112 113void 114init_flag_table_csharp () 115{ 116 xgettext_record_flag ("GetString:1:pass-csharp-format"); 117 xgettext_record_flag ("GetPluralString:1:pass-csharp-format"); 118 xgettext_record_flag ("GetPluralString:2:pass-csharp-format"); 119 xgettext_record_flag ("GetParticularString:2:pass-csharp-format"); 120 xgettext_record_flag ("GetParticularPluralString:2:pass-csharp-format"); 121 xgettext_record_flag ("GetParticularPluralString:3:pass-csharp-format"); 122 xgettext_record_flag ("String.Format:1:csharp-format"); 123} 124 125 126/* ======================== Reading of characters. ======================== */ 127 128/* Real filename, used in error messages about the input file. */ 129static const char *real_file_name; 130 131/* Logical filename and line number, used to label the extracted messages. */ 132static char *logical_file_name; 133static int line_number; 134 135/* The input file stream. */ 136static FILE *fp; 137 138 139/* Phase 1: line_number handling. */ 140 141/* Maximum used, roughly a safer MB_LEN_MAX. */ 142#define MAX_PHASE1_PUSHBACK 16 143static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; 144static int phase1_pushback_length; 145 146/* Read the next single byte from the input file. */ 147static int 148phase1_getc () 149{ 150 int c; 151 152 if (phase1_pushback_length) 153 { 154 c = phase1_pushback[--phase1_pushback_length]; 155 if (c == '\n') 156 ++line_number; 157 return c; 158 } 159 160 c = getc (fp); 161 if (c == EOF) 162 { 163 if (ferror (fp)) 164 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 165 real_file_name); 166 return EOF; 167 } 168 169 if (c == '\n') 170 ++line_number; 171 return c; 172} 173 174/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ 175static void 176phase1_ungetc (int c) 177{ 178 if (c != EOF) 179 { 180 if (c == '\n') 181 --line_number; 182 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 183 abort (); 184 phase1_pushback[phase1_pushback_length++] = c; 185 } 186} 187 188 189/* Phase 2: Conversion to Unicode. 190 This is done early because ECMA-334 section 9.1. says that the source is 191 "an ordered sequence of Unicode characters", and because the recognition 192 of the line terminators (ECMA-334 section 9.3.1) is hardly possible without 193 prior conversion to Unicode. */ 194 195/* End-of-file indicator for functions returning an UCS-4 character. */ 196#define UEOF -1 197 198/* Newline Unicode character. */ 199#define UNL 0x000a 200 201static int phase2_pushback[1]; 202static int phase2_pushback_length; 203 204/* Read the next Unicode UCS-4 character from the input file. */ 205static int 206phase2_getc () 207{ 208 if (phase2_pushback_length) 209 return phase2_pushback[--phase2_pushback_length]; 210 211 if (xgettext_current_source_encoding == po_charset_ascii) 212 { 213 int c = phase1_getc (); 214 if (c == EOF) 215 return UEOF; 216 if (!c_isascii (c)) 217 { 218 char buffer[21]; 219 sprintf (buffer, ":%ld", (long) line_number); 220 multiline_error (xstrdup (""), 221 xasprintf (_("\ 222Non-ASCII string at %s%s.\n\ 223Please specify the source encoding through --from-code.\n"), 224 real_file_name, buffer)); 225 exit (EXIT_FAILURE); 226 } 227 return c; 228 } 229 else if (xgettext_current_source_encoding != po_charset_utf8) 230 { 231#if HAVE_ICONV 232 /* Use iconv on an increasing number of bytes. Read only as many bytes 233 through phase1_getc as needed. This is needed to give reasonable 234 interactive behaviour when fp is connected to an interactive tty. */ 235 unsigned char buf[MAX_PHASE1_PUSHBACK]; 236 size_t bufcount; 237 int c = phase1_getc (); 238 if (c == EOF) 239 return UEOF; 240 buf[0] = (unsigned char) c; 241 bufcount = 1; 242 243 for (;;) 244 { 245 unsigned char scratchbuf[6]; 246 const char *inptr = (const char *) &buf[0]; 247 size_t insize = bufcount; 248 char *outptr = (char *) &scratchbuf[0]; 249 size_t outsize = sizeof (scratchbuf); 250 251 size_t res = iconv (xgettext_current_source_iconv, 252 (ICONV_CONST char **) &inptr, &insize, 253 &outptr, &outsize); 254 /* We expect that a character has been produced if and only if 255 some input bytes have been consumed. */ 256 if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) 257 abort (); 258 if (outsize == sizeof (scratchbuf)) 259 { 260 /* No character has been produced. Must be an error. */ 261 if (res != (size_t)(-1)) 262 abort (); 263 264 if (errno == EILSEQ) 265 { 266 /* An invalid multibyte sequence was encountered. */ 267 multiline_error (xstrdup (""), 268 xasprintf (_("\ 269%s:%d: Invalid multibyte sequence.\n\ 270Please specify the correct source encoding through --from-code.\n"), 271 real_file_name, line_number)); 272 exit (EXIT_FAILURE); 273 } 274 else if (errno == EINVAL) 275 { 276 /* An incomplete multibyte character. */ 277 int c; 278 279 if (bufcount == MAX_PHASE1_PUSHBACK) 280 { 281 /* An overlong incomplete multibyte sequence was 282 encountered. */ 283 multiline_error (xstrdup (""), 284 xasprintf (_("\ 285%s:%d: Long incomplete multibyte sequence.\n\ 286Please specify the correct source encoding through --from-code.\n"), 287 real_file_name, line_number)); 288 exit (EXIT_FAILURE); 289 } 290 291 /* Read one more byte and retry iconv. */ 292 c = phase1_getc (); 293 if (c == EOF) 294 { 295 multiline_error (xstrdup (""), 296 xasprintf (_("\ 297%s:%d: Incomplete multibyte sequence at end of file.\n\ 298Please specify the correct source encoding through --from-code.\n"), 299 real_file_name, line_number)); 300 exit (EXIT_FAILURE); 301 } 302 if (c == '\n') 303 { 304 multiline_error (xstrdup (""), 305 xasprintf (_("\ 306%s:%d: Incomplete multibyte sequence at end of line.\n\ 307Please specify the correct source encoding through --from-code.\n"), 308 real_file_name, line_number - 1)); 309 exit (EXIT_FAILURE); 310 } 311 buf[bufcount++] = (unsigned char) c; 312 } 313 else 314 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), 315 real_file_name, line_number); 316 } 317 else 318 { 319 size_t outbytes = sizeof (scratchbuf) - outsize; 320 size_t bytes = bufcount - insize; 321 unsigned int uc; 322 323 /* We expect that one character has been produced. */ 324 if (bytes == 0) 325 abort (); 326 if (outbytes == 0) 327 abort (); 328 /* Push back the unused bytes. */ 329 while (insize > 0) 330 phase1_ungetc (buf[--insize]); 331 /* Convert the character from UTF-8 to UCS-4. */ 332 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes) 333 { 334 /* scratchbuf contains an out-of-range Unicode character 335 (> 0x10ffff). */ 336 multiline_error (xstrdup (""), 337 xasprintf (_("\ 338%s:%d: Invalid multibyte sequence.\n\ 339Please specify the source encoding through --from-code.\n"), 340 real_file_name, line_number)); 341 exit (EXIT_FAILURE); 342 } 343 return uc; 344 } 345 } 346#else 347 /* If we don't have iconv(), the only supported values for 348 xgettext_global_source_encoding and thus also for 349 xgettext_current_source_encoding are ASCII and UTF-8. */ 350 abort (); 351#endif 352 } 353 else 354 { 355 /* Read an UTF-8 encoded character. */ 356 unsigned char buf[6]; 357 unsigned int count; 358 int c; 359 unsigned int uc; 360 361 c = phase1_getc (); 362 if (c == EOF) 363 return UEOF; 364 buf[0] = c; 365 count = 1; 366 367 if (buf[0] >= 0xc0) 368 { 369 c = phase1_getc (); 370 if (c == EOF) 371 return UEOF; 372 buf[1] = c; 373 count = 2; 374 } 375 376 if (buf[0] >= 0xe0 377 && ((buf[1] ^ 0x80) < 0x40)) 378 { 379 c = phase1_getc (); 380 if (c == EOF) 381 return UEOF; 382 buf[2] = c; 383 count = 3; 384 } 385 386 if (buf[0] >= 0xf0 387 && ((buf[1] ^ 0x80) < 0x40) 388 && ((buf[2] ^ 0x80) < 0x40)) 389 { 390 c = phase1_getc (); 391 if (c == EOF) 392 return UEOF; 393 buf[3] = c; 394 count = 4; 395 } 396 397 if (buf[0] >= 0xf8 398 && ((buf[1] ^ 0x80) < 0x40) 399 && ((buf[2] ^ 0x80) < 0x40) 400 && ((buf[3] ^ 0x80) < 0x40)) 401 { 402 c = phase1_getc (); 403 if (c == EOF) 404 return UEOF; 405 buf[4] = c; 406 count = 5; 407 } 408 409 if (buf[0] >= 0xfc 410 && ((buf[1] ^ 0x80) < 0x40) 411 && ((buf[2] ^ 0x80) < 0x40) 412 && ((buf[3] ^ 0x80) < 0x40) 413 && ((buf[4] ^ 0x80) < 0x40)) 414 { 415 c = phase1_getc (); 416 if (c == EOF) 417 return UEOF; 418 buf[5] = c; 419 count = 6; 420 } 421 422 u8_mbtouc (&uc, buf, count); 423 return uc; 424 } 425} 426 427/* Supports only one pushback character. */ 428static void 429phase2_ungetc (int c) 430{ 431 if (c != UEOF) 432 { 433 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 434 abort (); 435 phase2_pushback[phase2_pushback_length++] = c; 436 } 437} 438 439 440/* Phase 3: Convert all line terminators to LF. 441 See ECMA-334 section 9.3.1. */ 442 443/* Line number defined in terms of phase3. */ 444static int logical_line_number; 445 446static int phase3_pushback[9]; 447static int phase3_pushback_length; 448 449/* Read the next Unicode UCS-4 character from the input file, mapping 450 all line terminators to U+000A, and dropping U+001A at the end of file. */ 451static int 452phase3_getc () 453{ 454 int c; 455 456 if (phase3_pushback_length) 457 { 458 c = phase3_pushback[--phase3_pushback_length]; 459 if (c == UNL) 460 ++logical_line_number; 461 return c; 462 } 463 464 c = phase2_getc (); 465 466 if (c == 0x000d) 467 { 468 int c1 = phase2_getc (); 469 470 if (c1 != UEOF && c1 != 0x000a) 471 phase2_ungetc (c1); 472 473 /* Seen line terminator CR or CR/LF. */ 474 ++logical_line_number; 475 return UNL; 476 } 477 478 if (c == 0x0085 || c == 0x2028 || c == 0x2029) 479 { 480 /* Seen Unicode word processor newline. */ 481 ++logical_line_number; 482 return UNL; 483 } 484 485 if (c == 0x001a) 486 { 487 int c1 = phase2_getc (); 488 489 if (c1 == UEOF) 490 /* Seen U+001A right before the end of file. */ 491 return UEOF; 492 493 phase2_ungetc (c1); 494 } 495 496 if (c == UNL) 497 ++logical_line_number; 498 return c; 499} 500 501/* Supports 9 characters of pushback. */ 502static void 503phase3_ungetc (int c) 504{ 505 if (c != UEOF) 506 { 507 if (c == UNL) 508 --logical_line_number; 509 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 510 abort (); 511 phase3_pushback[phase3_pushback_length++] = c; 512 } 513} 514 515 516/* ========================= Accumulating strings. ======================== */ 517 518/* A string buffer type that allows appending Unicode characters. 519 Returns the entire string in UTF-8 encoding. */ 520 521struct string_buffer 522{ 523 /* The part of the string that has already been converted to UTF-8. */ 524 char *utf8_buffer; 525 size_t utf8_buflen; 526 size_t utf8_allocated; 527}; 528 529/* Initialize a 'struct string_buffer' to empty. */ 530static inline void 531init_string_buffer (struct string_buffer *bp) 532{ 533 bp->utf8_buffer = NULL; 534 bp->utf8_buflen = 0; 535 bp->utf8_allocated = 0; 536} 537 538/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 539static inline void 540string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count) 541{ 542 if (bp->utf8_buflen + count > bp->utf8_allocated) 543 { 544 size_t new_allocated = 2 * bp->utf8_allocated + 10; 545 if (new_allocated < bp->utf8_buflen + count) 546 new_allocated = bp->utf8_buflen + count; 547 bp->utf8_allocated = new_allocated; 548 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 549 } 550} 551 552/* Auxiliary function: Append a Unicode character to bp->utf8. 553 uc must be < 0x110000. */ 554static inline void 555string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc) 556{ 557 unsigned char utf8buf[6]; 558 int count = u8_uctomb (utf8buf, uc, 6); 559 560 if (count < 0) 561 /* The caller should have ensured that uc is not out-of-range. */ 562 abort (); 563 564 string_buffer_append_unicode_grow (bp, count); 565 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 566 bp->utf8_buflen += count; 567} 568 569/* Return the string buffer's contents. */ 570static char * 571string_buffer_result (struct string_buffer *bp) 572{ 573 /* NUL-terminate it. */ 574 string_buffer_append_unicode_grow (bp, 1); 575 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 576 /* Return it. */ 577 return bp->utf8_buffer; 578} 579 580/* Free the memory pointed to by a 'struct string_buffer'. */ 581static inline void 582free_string_buffer (struct string_buffer *bp) 583{ 584 free (bp->utf8_buffer); 585} 586 587 588/* ======================== Accumulating comments. ======================== */ 589 590 591/* Accumulating a single comment line. */ 592 593static struct string_buffer comment_buffer; 594 595static inline void 596comment_start () 597{ 598 comment_buffer.utf8_buflen = 0; 599} 600 601static inline bool 602comment_at_start () 603{ 604 return (comment_buffer.utf8_buflen == 0); 605} 606 607static inline void 608comment_add (int c) 609{ 610 string_buffer_append_unicode (&comment_buffer, c); 611} 612 613static inline void 614comment_line_end (size_t chars_to_remove) 615{ 616 char *buffer = string_buffer_result (&comment_buffer); 617 size_t buflen = strlen (buffer); 618 619 buflen -= chars_to_remove; 620 while (buflen >= 1 621 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 622 --buflen; 623 buffer[buflen] = '\0'; 624 savable_comment_add (buffer); 625} 626 627 628/* These are for tracking whether comments count as immediately before 629 keyword. */ 630static int last_comment_line; 631static int last_non_comment_line; 632 633 634/* Phase 4: Replace each comment that is not inside a character constant or 635 string literal with a space or newline character. 636 See ECMA-334 section 9.3.2. */ 637 638static int 639phase4_getc () 640{ 641 int c0; 642 int c; 643 bool last_was_star; 644 645 c0 = phase3_getc (); 646 if (c0 != '/') 647 return c0; 648 c = phase3_getc (); 649 switch (c) 650 { 651 default: 652 phase3_ungetc (c); 653 return c0; 654 655 case '*': 656 /* C style comment. */ 657 comment_start (); 658 last_was_star = false; 659 for (;;) 660 { 661 c = phase3_getc (); 662 if (c == UEOF) 663 break; 664 /* We skip all leading white space, but not EOLs. */ 665 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 666 comment_add (c); 667 switch (c) 668 { 669 case UNL: 670 comment_line_end (1); 671 comment_start (); 672 last_was_star = false; 673 continue; 674 675 case '*': 676 last_was_star = true; 677 continue; 678 679 case '/': 680 if (last_was_star) 681 { 682 comment_line_end (2); 683 break; 684 } 685 /* FALLTHROUGH */ 686 687 default: 688 last_was_star = false; 689 continue; 690 } 691 break; 692 } 693 last_comment_line = logical_line_number; 694 return ' '; 695 696 case '/': 697 /* C++ style comment. */ 698 last_comment_line = logical_line_number; 699 comment_start (); 700 for (;;) 701 { 702 c = phase3_getc (); 703 if (c == UNL || c == UEOF) 704 break; 705 /* We skip all leading white space, but not EOLs. */ 706 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 707 comment_add (c); 708 } 709 phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */ 710 comment_line_end (0); 711 phase3_getc (); /* read the newline again */ 712 return UNL; 713 } 714} 715 716/* Supports only one pushback character. */ 717static void 718phase4_ungetc (int c) 719{ 720 phase3_ungetc (c); 721} 722 723 724/* ======================= Character classification. ====================== */ 725 726 727/* Return true if a given character is white space. 728 See ECMA-334 section 9.3.3. */ 729static bool 730is_whitespace (int c) 731{ 732 /* Unicode character class Zs, as of Unicode 4.0. */ 733 /* grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt */ 734 switch (c >> 8) 735 { 736 case 0x00: 737 return (c == 0x0020 || c == 0x00a0); 738 case 0x16: 739 return (c == 0x1680); 740 case 0x18: 741 return (c == 0x180e); 742 case 0x20: 743 return ((c >= 0x2000 && c <= 0x200b) || c == 0x202f || c == 0x205f); 744 case 0x30: 745 return (c == 0x3000); 746 default: 747 return false; 748 } 749} 750 751 752/* C# allows identifiers containing many Unicode characters. We recognize 753 them; to use an identifier with Unicode characters in a --keyword option, 754 it must be specified in UTF-8. */ 755 756static inline int 757bitmap_lookup (const void *table, unsigned int uc) 758{ 759 unsigned int index1 = uc >> 16; 760 if (index1 < ((const int *) table)[0]) 761 { 762 int lookup1 = ((const int *) table)[1 + index1]; 763 if (lookup1 >= 0) 764 { 765 unsigned int index2 = (uc >> 9) & 0x7f; 766 int lookup2 = ((const int *) table)[lookup1 + index2]; 767 if (lookup2 >= 0) 768 { 769 unsigned int index3 = (uc >> 5) & 0xf; 770 unsigned int lookup3 = ((const int *) table)[lookup2 + index3]; 771 772 return (lookup3 >> (uc & 0x1f)) & 1; 773 } 774 } 775 } 776 return 0; 777} 778 779/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0, 780 plus the underscore. */ 781static const 782struct 783 { 784 int header[1]; 785 int level1[3]; 786 int level2[3 << 7]; 787 /*unsigned*/ int level3[34 << 4]; 788 } 789table_identifier_start = 790{ 791 { 3 }, 792 { 4, 132, 260 }, 793 { 794 388, 404, 420, 436, 452, 468, 484, 500, 795 516, 532, 548, 564, 580, -1, 596, 612, 796 628, -1, -1, -1, -1, -1, -1, -1, 797 644, -1, 660, 660, 660, 660, 660, 660, 798 660, 660, 660, 660, 660, 660, 676, 660, 799 660, 660, 660, 660, 660, 660, 660, 660, 800 660, 660, 660, 660, 660, 660, 660, 660, 801 660, 660, 660, 660, 660, 660, 660, 660, 802 660, 660, 660, 660, 660, 660, 660, 660, 803 660, 660, 660, 660, 660, 660, 660, 692, 804 660, 660, 708, -1, -1, -1, 660, 660, 805 660, 660, 660, 660, 660, 660, 660, 660, 806 660, 660, 660, 660, 660, 660, 660, 660, 807 660, 660, 660, 724, -1, -1, -1, -1, 808 -1, -1, -1, -1, -1, -1, -1, -1, 809 -1, -1, -1, -1, 740, 756, 772, 788, 810 804, 820, 836, -1, 852, -1, -1, -1, 811 -1, -1, -1, -1, -1, -1, -1, -1, 812 -1, -1, -1, -1, -1, -1, -1, -1, 813 -1, -1, -1, -1, -1, -1, -1, -1, 814 -1, -1, -1, -1, -1, -1, -1, -1, 815 -1, -1, -1, -1, -1, -1, -1, -1, 816 -1, -1, -1, -1, -1, -1, -1, -1, 817 -1, -1, -1, -1, -1, -1, -1, -1, 818 -1, -1, -1, -1, -1, -1, -1, -1, 819 -1, -1, -1, -1, -1, -1, -1, -1, 820 -1, -1, -1, -1, -1, -1, -1, -1, 821 -1, -1, -1, -1, -1, -1, -1, -1, 822 -1, -1, -1, -1, -1, -1, -1, -1, 823 -1, -1, 868, 884, -1, -1, -1, -1, 824 -1, -1, -1, -1, -1, -1, -1, -1, 825 -1, -1, -1, -1, -1, -1, -1, -1, 826 660, 660, 660, 660, 660, 660, 660, 660, 827 660, 660, 660, 660, 660, 660, 660, 660, 828 660, 660, 660, 660, 660, 660, 660, 660, 829 660, 660, 660, 660, 660, 660, 660, 660, 830 660, 660, 660, 660, 660, 660, 660, 660, 831 660, 660, 660, 660, 660, 660, 660, 660, 832 660, 660, 660, 660, 660, 660, 660, 660, 833 660, 660, 660, 660, 660, 660, 660, 660, 834 660, 660, 660, 660, 660, 660, 660, 660, 835 660, 660, 660, 660, 660, 660, 660, 660, 836 660, 660, 660, 900, -1, -1, -1, -1, 837 -1, -1, -1, -1, -1, -1, -1, -1, 838 -1, -1, -1, -1, -1, -1, -1, -1, 839 -1, -1, -1, -1, -1, -1, -1, -1, 840 -1, -1, -1, -1, -1, -1, -1, -1, 841 -1, -1, -1, -1, 660, 916, -1, -1 842 }, 843 { 844 0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE, 845 0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF, 846 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 847 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 848 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF, 849 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F, 850 0x00000000, 0x00000000, 0x00000000, 0x04000000, 851 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF, 852 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 853 0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF, 854 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE, 855 0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF, 856 0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000, 857 0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060, 858 0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000, 859 0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000, 860 0x00000000, 0x00000000, 0x00000000, 0x00000000, 861 0x00000000, 0x00000000, 0x00000000, 0x00000000, 862 0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003, 863 0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003, 864 0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000, 865 0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003, 866 0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003, 867 0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000, 868 0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003, 869 0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003, 870 0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003, 871 0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000, 872 0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000, 873 0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000, 874 0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF, 875 0x00000F00, 0x00000000, 0x00000000, 0x00000000, 876 0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000, 877 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF, 878 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF, 879 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF, 880 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF, 881 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF, 882 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000, 883 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF, 884 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 885 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 886 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 887 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 888 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF, 889 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF, 890 0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF, 891 0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000, 892 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF, 893 0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000, 894 0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF, 895 0x00000000, 0x00000000, 0x00000000, 0x00000000, 896 0x00000000, 0x00000000, 0x00000000, 0x00000000, 897 0x00000000, 0x00000000, 0x00000000, 0x00000000, 898 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF, 899 0x00000000, 0x00000000, 0x00000000, 0x00000000, 900 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 901 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF, 902 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF, 903 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF, 904 0x00000000, 0x00000000, 0x00000000, 0x80020000, 905 0x00000000, 0x00000000, 0x00000000, 0x00000000, 906 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF, 907 0x0000000F, 0x00000000, 0x00000000, 0x00000000, 908 0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF, 909 0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF, 910 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 911 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000, 912 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 913 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 914 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 915 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 916 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 917 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 918 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 919 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000, 920 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 921 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 922 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 923 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000, 924 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 925 0x00001FFF, 0x00000000, 0x00000000, 0x00000000, 926 0x00000000, 0x00000000, 0x00000000, 0x00000000, 927 0x00000000, 0x00000000, 0x00000000, 0x00000000, 928 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 929 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 930 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 931 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000, 932 0x00000000, 0x00000000, 0x00000000, 0x00000000, 933 0x00000000, 0x00000000, 0x00000000, 0x00000000, 934 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 935 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 936 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF, 937 0x00000000, 0x00000000, 0x00000000, 0x00000000, 938 0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF, 939 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF, 940 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 941 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 942 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF, 943 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000, 944 0x00000000, 0x00000000, 0x00000000, 0xFFDF0000, 945 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF, 946 0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0, 947 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000, 948 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000, 949 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF, 950 0x00000000, 0x00000000, 0x00000000, 0x00000000, 951 0x00000000, 0x00000000, 0x00000000, 0x00000000, 952 0x00000000, 0x00000000, 0x00000000, 0x00000000, 953 0x00000000, 0x00000000, 0x00000000, 0x00000000, 954 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000, 955 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 956 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 957 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 958 0x00000000, 0x00000000, 0x00000000, 0x00000000, 959 0x00000000, 0x00000000, 0x00000000, 0x00000000, 960 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000, 961 0x00000000, 0x00000000, 0x00000000, 0x00000000, 962 0x00000000, 0x00000000, 0x00000000, 0x00000000, 963 0x00000000, 0x00000000, 0x00000000, 0x00000000, 964 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF, 965 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF, 966 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF, 967 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 968 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 969 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF, 970 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF, 971 0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000, 972 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 973 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000, 974 0x00000000, 0x00000000, 0x00000000, 0x00000000, 975 0x00000000, 0x00000000, 0x00000000, 0x00000000, 976 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 977 0x00000000, 0x00000000, 0x00000000, 0x00000000, 978 0x00000000, 0x00000000, 0x00000000, 0x00000000, 979 0x00000000, 0x00000000, 0x00000000, 0x00000000 980 } 981}; 982 983/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf, 984 as of Unicode 4.0. */ 985static const 986struct 987 { 988 int header[1]; 989 int level1[15]; 990 int level2[4 << 7]; 991 /*unsigned*/ int level3[36 << 4]; 992 } 993table_identifier_part = 994{ 995 { 15 }, 996 { 997 16, 144, 272, -1, -1, -1, -1, -1, 998 -1, -1, -1, -1, -1, -1, 400 999 }, 1000 { 1001 528, 544, 560, 576, 592, 608, 624, 640, 1002 656, 672, 688, 704, 720, -1, 736, 752, 1003 768, -1, -1, -1, -1, -1, -1, -1, 1004 784, -1, 800, 800, 800, 800, 800, 800, 1005 800, 800, 800, 800, 800, 800, 816, 800, 1006 800, 800, 800, 800, 800, 800, 800, 800, 1007 800, 800, 800, 800, 800, 800, 800, 800, 1008 800, 800, 800, 800, 800, 800, 800, 800, 1009 800, 800, 800, 800, 800, 800, 800, 800, 1010 800, 800, 800, 800, 800, 800, 800, 832, 1011 800, 800, 848, -1, -1, -1, 800, 800, 1012 800, 800, 800, 800, 800, 800, 800, 800, 1013 800, 800, 800, 800, 800, 800, 800, 800, 1014 800, 800, 800, 864, -1, -1, -1, -1, 1015 -1, -1, -1, -1, -1, -1, -1, -1, 1016 -1, -1, -1, -1, 880, 896, 912, 928, 1017 944, 960, 976, -1, 992, -1, -1, -1, 1018 -1, -1, -1, -1, -1, -1, -1, -1, 1019 -1, -1, -1, -1, -1, -1, -1, -1, 1020 -1, -1, -1, -1, -1, -1, -1, -1, 1021 -1, -1, -1, -1, -1, -1, -1, -1, 1022 -1, -1, -1, -1, -1, -1, -1, -1, 1023 -1, -1, -1, -1, -1, -1, -1, -1, 1024 -1, -1, -1, -1, -1, -1, -1, -1, 1025 -1, -1, -1, -1, -1, -1, -1, -1, 1026 -1, -1, -1, -1, -1, -1, -1, -1, 1027 -1, -1, -1, -1, -1, -1, -1, -1, 1028 -1, -1, -1, -1, -1, -1, -1, -1, 1029 -1, -1, -1, -1, -1, -1, -1, -1, 1030 1008, -1, 1024, 1040, -1, -1, -1, -1, 1031 -1, -1, -1, -1, -1, -1, -1, -1, 1032 -1, -1, -1, -1, -1, -1, -1, -1, 1033 800, 800, 800, 800, 800, 800, 800, 800, 1034 800, 800, 800, 800, 800, 800, 800, 800, 1035 800, 800, 800, 800, 800, 800, 800, 800, 1036 800, 800, 800, 800, 800, 800, 800, 800, 1037 800, 800, 800, 800, 800, 800, 800, 800, 1038 800, 800, 800, 800, 800, 800, 800, 800, 1039 800, 800, 800, 800, 800, 800, 800, 800, 1040 800, 800, 800, 800, 800, 800, 800, 800, 1041 800, 800, 800, 800, 800, 800, 800, 800, 1042 800, 800, 800, 800, 800, 800, 800, 800, 1043 800, 800, 800, 1056, -1, -1, -1, -1, 1044 -1, -1, -1, -1, -1, -1, -1, -1, 1045 -1, -1, -1, -1, -1, -1, -1, -1, 1046 -1, -1, -1, -1, -1, -1, -1, -1, 1047 -1, -1, -1, -1, -1, -1, -1, -1, 1048 -1, -1, -1, -1, 800, 1072, -1, -1, 1049 1088, -1, -1, -1, -1, -1, -1, -1, 1050 -1, -1, -1, -1, -1, -1, -1, -1, 1051 -1, -1, -1, -1, -1, -1, -1, -1, 1052 -1, -1, -1, -1, -1, -1, -1, -1, 1053 -1, -1, -1, -1, -1, -1, -1, -1, 1054 -1, -1, -1, -1, -1, -1, -1, -1, 1055 -1, -1, -1, -1, -1, -1, -1, -1, 1056 -1, -1, -1, -1, -1, -1, -1, -1, 1057 -1, -1, -1, -1, -1, -1, -1, -1, 1058 -1, -1, -1, -1, -1, -1, -1, -1, 1059 -1, -1, -1, -1, -1, -1, -1, -1, 1060 -1, -1, -1, -1, -1, -1, -1, -1, 1061 -1, -1, -1, -1, -1, -1, -1, -1, 1062 -1, -1, -1, -1, -1, -1, -1, -1, 1063 -1, -1, -1, -1, -1, -1, -1, -1, 1064 -1, -1, -1, -1, -1, -1, -1, -1 1065 }, 1066 { 1067 0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 1068 0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF, 1069 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1070 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1071 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF, 1072 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F, 1073 0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF, 1074 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF, 1075 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1076 0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF, 1077 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE, 1078 0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF, 1079 0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF, 1080 0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF, 1081 0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000, 1082 0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000, 1083 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1084 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1085 0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF, 1086 0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF, 1087 0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0, 1088 0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF, 1089 0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3, 1090 0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80, 1091 0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3, 1092 0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3, 1093 0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3, 1094 0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000, 1095 0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000, 1096 0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000, 1097 0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF, 1098 0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000, 1099 0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000, 1100 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF, 1101 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF, 1102 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF, 1103 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF, 1104 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF, 1105 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00, 1106 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF, 1107 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1108 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1109 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1110 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1111 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF, 1112 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF, 1113 0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF, 1114 0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF, 1115 0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF, 1116 0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000, 1117 0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF, 1118 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1119 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1120 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1121 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF, 1122 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1123 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1124 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF, 1125 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF, 1126 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF, 1127 0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F, 1128 0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2, 1129 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF, 1130 0x0000000F, 0x00000000, 0x00000000, 0x00000000, 1131 0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF, 1132 0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 1133 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 1134 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000, 1135 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1136 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1137 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1138 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1139 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1140 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1141 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1142 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000, 1143 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1144 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1145 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1146 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000, 1147 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1148 0x00001FFF, 0x00000000, 0x00000000, 0x00000000, 1149 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1150 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1151 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1152 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1153 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1154 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000, 1155 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1156 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1157 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1158 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1159 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF, 1160 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1161 0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF, 1162 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF, 1163 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1164 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1165 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF, 1166 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000, 1167 0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000, 1168 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF, 1169 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0, 1170 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000, 1171 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000, 1172 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF, 1173 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1174 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1175 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1176 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1177 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000, 1178 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 1179 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1180 0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000, 1181 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1182 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1183 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000, 1184 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1185 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1186 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1187 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1188 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1189 0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0, 1190 0x00000FE7, 0x00003C00, 0x00000000, 0x00000000, 1191 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF, 1192 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF, 1193 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF, 1194 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1195 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1196 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF, 1197 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF, 1198 0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF, 1199 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1200 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000, 1201 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1202 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1203 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000, 1204 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1205 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1206 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1207 0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1208 0x00000000, 0x00000000, 0x00000000, 0x00000000, 1209 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 1210 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF 1211 } 1212}; 1213 1214/* Return true if a given character can occur as first character of an 1215 identifier. See ECMA-334 section 9.4.2. */ 1216static bool 1217is_identifier_start (int c) 1218{ 1219 return bitmap_lookup (&table_identifier_start, c); 1220 /* In ASCII only this would be: 1221 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'); 1222 */ 1223} 1224 1225/* Return true if a given character can occur as character of an identifier. 1226 See ECMA-334 section 9.4.2. */ 1227static bool 1228is_identifier_part (int c) 1229{ 1230 return bitmap_lookup (&table_identifier_part, c); 1231 /* In ASCII only this would be: 1232 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 1233 || (c >= '0' && c <= '9') || c == '_'); 1234 */ 1235} 1236 1237static bool 1238is_any_character (int c) 1239{ 1240 return true; 1241} 1242 1243 1244/* ======================= Preprocessor directives. ======================= */ 1245 1246 1247/* Phase 5: Remove preprocessor lines. See ECMA-334 section 9.5. 1248 As a side effect, this also removes initial whitespace on every line; 1249 this whitespace doesn't matter. */ 1250 1251static int phase5_pushback[10]; 1252static int phase5_pushback_length; 1253 1254static int 1255phase5_getc () 1256{ 1257 int c; 1258 1259 if (phase5_pushback_length) 1260 return phase5_pushback[--phase5_pushback_length]; 1261 1262 c = phase4_getc (); 1263 if (c != UNL) 1264 return c; 1265 1266 do 1267 c = phase3_getc (); 1268 while (c != UEOF && is_whitespace (c)); 1269 1270 if (c == '#') 1271 { 1272 /* Ignore the entire line containing the preprocessor directive 1273 (including the // comment if it contains one). */ 1274 do 1275 c = phase3_getc (); 1276 while (c != UEOF && c != UNL); 1277 return c; 1278 } 1279 else 1280 { 1281 phase3_ungetc (c); 1282 return UNL; 1283 } 1284} 1285 1286#ifdef unused 1287static void 1288phase5_ungetc (int c) 1289{ 1290 if (c != UEOF) 1291 { 1292 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1293 abort (); 1294 phase5_pushback[phase5_pushback_length++] = c; 1295 } 1296} 1297#endif 1298 1299 1300/* ========================== Reading of tokens. ========================== */ 1301 1302enum token_type_ty 1303{ 1304 token_type_eof, 1305 token_type_lparen, /* ( */ 1306 token_type_rparen, /* ) */ 1307 token_type_lbrace, /* { */ 1308 token_type_rbrace, /* } */ 1309 token_type_comma, /* , */ 1310 token_type_dot, /* . */ 1311 token_type_string_literal, /* "abc", @"abc" */ 1312 token_type_number, /* 1.23 */ 1313 token_type_symbol, /* identifier, keyword, null */ 1314 token_type_plus, /* + */ 1315 token_type_other /* character literal, misc. operator */ 1316}; 1317typedef enum token_type_ty token_type_ty; 1318 1319typedef struct token_ty token_ty; 1320struct token_ty 1321{ 1322 token_type_ty type; 1323 char *string; /* for token_type_string_literal, token_type_symbol */ 1324 refcounted_string_list_ty *comment; /* for token_type_string_literal */ 1325 int line_number; 1326 int logical_line_number; 1327}; 1328 1329 1330/* Free the memory pointed to by a 'struct token_ty'. */ 1331static inline void 1332free_token (token_ty *tp) 1333{ 1334 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 1335 free (tp->string); 1336 if (tp->type == token_type_string_literal) 1337 drop_reference (tp->comment); 1338} 1339 1340 1341/* Read a Unicode escape sequence outside string/character literals. 1342 Reject Unicode escapes that don't fulfill the given predicate. 1343 See ECMA-334 section 9.4.2. */ 1344static int 1345do_getc_unicode_escaped (bool (*predicate) (int)) 1346{ 1347 int c; 1348 1349 /* Use phase 3, because phase 4 elides comments. */ 1350 c = phase3_getc (); 1351 if (c == UEOF) 1352 return '\\'; 1353 if (c == 'u' || c == 'U') 1354 { 1355 unsigned char buf[8]; 1356 int expect; 1357 unsigned int n; 1358 int i; 1359 1360 expect = (c == 'U' ? 8 : 4); 1361 n = 0; 1362 for (i = 0; i < expect; i++) 1363 { 1364 int c1 = phase3_getc (); 1365 1366 if (c1 >= '0' && c1 <= '9') 1367 n = (n << 4) + (c1 - '0'); 1368 else if (c1 >= 'A' && c1 <= 'F') 1369 n = (n << 4) + (c1 - 'A' + 10); 1370 else if (c1 >= 'a' && c1 <= 'f') 1371 n = (n << 4) + (c1 - 'a' + 10); 1372 else 1373 { 1374 phase3_ungetc (c1); 1375 while (--i >= 0) 1376 phase3_ungetc (buf[i]); 1377 phase3_ungetc (c); 1378 return '\\'; 1379 } 1380 1381 buf[i] = c1; 1382 } 1383 1384 if (n >= 0x110000) 1385 { 1386 error_with_progname = false; 1387 error (0, 0, _("%s:%d: warning: invalid Unicode character"), 1388 logical_file_name, line_number); 1389 error_with_progname = true; 1390 } 1391 else if (predicate (n)) 1392 return n; 1393 1394 while (--i >= 0) 1395 phase3_ungetc (buf[i]); 1396 } 1397 phase3_ungetc (c); 1398 return '\\'; 1399} 1400 1401 1402/* Read an escape sequence inside a string literal or character literal. 1403 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */ 1404static int 1405do_getc_escaped () 1406{ 1407 int c; 1408 int n; 1409 int i; 1410 1411 /* Use phase 3, because phase 4 elides comments. */ 1412 c = phase3_getc (); 1413 if (c == UEOF) 1414 return '\\'; 1415 switch (c) 1416 { 1417 case 'a': 1418 return 0x0007; 1419 case 'b': 1420 return 0x0008; 1421 case 't': 1422 return 0x0009; 1423 case 'n': 1424 return 0x000a; 1425 case 'v': 1426 return 0x000b; 1427 case 'f': 1428 return 0x000c; 1429 case 'r': 1430 return 0x000d; 1431 case '"': 1432 return '"'; 1433 case '\'': 1434 return '\''; 1435 case '\\': 1436 return '\\'; 1437 case '0': 1438 return 0x0000; 1439 case 'x': 1440 c = phase3_getc (); 1441 switch (c) 1442 { 1443 default: 1444 phase3_ungetc (c); 1445 phase3_ungetc ('x'); 1446 return '\\'; 1447 1448 case '0': case '1': case '2': case '3': case '4': 1449 case '5': case '6': case '7': case '8': case '9': 1450 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1451 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1452 break; 1453 } 1454 n = 0; 1455 for (i = 0;; i++) 1456 { 1457 switch (c) 1458 { 1459 default: 1460 phase3_ungetc (c); 1461 return n; 1462 case '0': case '1': case '2': case '3': case '4': 1463 case '5': case '6': case '7': case '8': case '9': 1464 n = n * 16 + c - '0'; 1465 break; 1466 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1467 n = n * 16 + 10 + c - 'A'; 1468 break; 1469 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1470 n = n * 16 + 10 + c - 'a'; 1471 break; 1472 } 1473 if (i == 3) 1474 break; 1475 c = phase3_getc (); 1476 } 1477 return n; 1478 case 'u': case 'U': 1479 phase3_ungetc (c); 1480 return do_getc_unicode_escaped (is_any_character); 1481 default: 1482 /* Invalid escape sequence. */ 1483 phase3_ungetc (c); 1484 return '\\'; 1485 } 1486} 1487 1488/* Read a regular string literal or character literal. 1489 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */ 1490static void 1491accumulate_escaped (struct string_buffer *literal, int delimiter) 1492{ 1493 int c; 1494 1495 for (;;) 1496 { 1497 /* Use phase 3, because phase 4 elides comments. */ 1498 c = phase3_getc (); 1499 if (c == UEOF || c == delimiter) 1500 break; 1501 if (c == UNL) 1502 { 1503 phase3_ungetc (c); 1504 error_with_progname = false; 1505 if (delimiter == '\'') 1506 error (0, 0, _("%s:%d: warning: unterminated character constant"), 1507 logical_file_name, line_number); 1508 else 1509 error (0, 0, _("%s:%d: warning: unterminated string constant"), 1510 logical_file_name, line_number); 1511 error_with_progname = true; 1512 break; 1513 } 1514 if (c == '\\') 1515 c = do_getc_escaped (); 1516 string_buffer_append_unicode (literal, c); 1517 } 1518} 1519 1520 1521/* Combine characters into tokens. Discard whitespace. */ 1522 1523/* Maximum used guaranteed to be < 4. */ 1524static token_ty phase6_pushback[4]; 1525static int phase6_pushback_length; 1526 1527static void 1528phase6_get (token_ty *tp) 1529{ 1530 int c; 1531 1532 if (phase6_pushback_length) 1533 { 1534 *tp = phase6_pushback[--phase6_pushback_length]; 1535 return; 1536 } 1537 tp->string = NULL; 1538 1539 for (;;) 1540 { 1541 tp->line_number = line_number; 1542 tp->logical_line_number = logical_line_number; 1543 c = phase5_getc (); 1544 1545 if (c == UEOF) 1546 { 1547 tp->type = token_type_eof; 1548 return; 1549 } 1550 1551 switch (c) 1552 { 1553 case UNL: 1554 if (last_non_comment_line > last_comment_line) 1555 savable_comment_reset (); 1556 /* FALLTHROUGH */ 1557 case ' ': 1558 case '\t': 1559 case '\f': 1560 /* Ignore whitespace and comments. */ 1561 continue; 1562 } 1563 1564 last_non_comment_line = tp->logical_line_number; 1565 1566 switch (c) 1567 { 1568 case '(': 1569 tp->type = token_type_lparen; 1570 return; 1571 1572 case ')': 1573 tp->type = token_type_rparen; 1574 return; 1575 1576 case '{': 1577 tp->type = token_type_lbrace; 1578 return; 1579 1580 case '}': 1581 tp->type = token_type_rbrace; 1582 return; 1583 1584 case ',': 1585 tp->type = token_type_comma; 1586 return; 1587 1588 case '.': 1589 c = phase4_getc (); 1590 if (!(c >= '0' && c <= '9')) 1591 { 1592 phase4_ungetc (c); 1593 tp->type = token_type_dot; 1594 return; 1595 } 1596 /* FALLTHROUGH */ 1597 1598 case '0': case '1': case '2': case '3': case '4': 1599 case '5': case '6': case '7': case '8': case '9': 1600 { 1601 /* Don't need to verify the complicated syntax of integers and 1602 floating-point numbers. We assume a valid C# input. 1603 The simplified syntax that we recognize as number is: any 1604 sequence of alphanumeric characters, additionally '+' and '-' 1605 immediately after 'e' or 'E' except in hexadecimal numbers. */ 1606 bool hexadecimal = false; 1607 1608 for (;;) 1609 { 1610 c = phase4_getc (); 1611 if (c >= '0' && c <= '9') 1612 continue; 1613 if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z')) 1614 { 1615 if (c == 'X' || c == 'x') 1616 hexadecimal = true; 1617 if ((c == 'E' || c == 'e') && !hexadecimal) 1618 { 1619 c = phase4_getc (); 1620 if (!(c == '+' || c == '-')) 1621 phase4_ungetc (c); 1622 } 1623 continue; 1624 } 1625 if (c == '.') 1626 continue; 1627 break; 1628 } 1629 phase4_ungetc (c); 1630 tp->type = token_type_number; 1631 return; 1632 } 1633 1634 case '"': 1635 /* Regular string literal. */ 1636 { 1637 struct string_buffer literal; 1638 1639 init_string_buffer (&literal); 1640 accumulate_escaped (&literal, '"'); 1641 tp->string = xstrdup (string_buffer_result (&literal)); 1642 free_string_buffer (&literal); 1643 tp->comment = add_reference (savable_comment); 1644 tp->type = token_type_string_literal; 1645 return; 1646 } 1647 1648 case '\'': 1649 /* Character literal. */ 1650 { 1651 struct string_buffer literal; 1652 1653 init_string_buffer (&literal); 1654 accumulate_escaped (&literal, '\''); 1655 free_string_buffer (&literal); 1656 tp->type = token_type_other; 1657 return; 1658 } 1659 1660 case '+': 1661 c = phase4_getc (); 1662 if (c == '+') 1663 /* Operator ++ */ 1664 tp->type = token_type_other; 1665 else if (c == '=') 1666 /* Operator += */ 1667 tp->type = token_type_other; 1668 else 1669 { 1670 /* Operator + */ 1671 phase4_ungetc (c); 1672 tp->type = token_type_plus; 1673 } 1674 return; 1675 1676 case '@': 1677 c = phase4_getc (); 1678 if (c == '"') 1679 { 1680 /* Verbatim string literal. */ 1681 struct string_buffer literal; 1682 1683 init_string_buffer (&literal); 1684 for (;;) 1685 { 1686 /* Use phase 2, because phase 4 elides comments and phase 3 1687 mixes up the newline characters. */ 1688 c = phase2_getc (); 1689 if (c == UEOF) 1690 break; 1691 if (c == '"') 1692 { 1693 c = phase2_getc (); 1694 if (c != '"') 1695 { 1696 phase2_ungetc (c); 1697 break; 1698 } 1699 } 1700 /* No special treatment of newline and backslash here. */ 1701 string_buffer_append_unicode (&literal, c); 1702 } 1703 tp->string = xstrdup (string_buffer_result (&literal)); 1704 free_string_buffer (&literal); 1705 tp->comment = add_reference (savable_comment); 1706 tp->type = token_type_string_literal; 1707 return; 1708 } 1709 /* FALLTHROUGH, so that @identifier is recognized. */ 1710 1711 default: 1712 if (c == '\\') 1713 c = do_getc_unicode_escaped (is_identifier_start); 1714 if (is_identifier_start (c)) 1715 { 1716 static struct string_buffer buffer; 1717 buffer.utf8_buflen = 0; 1718 for (;;) 1719 { 1720 string_buffer_append_unicode (&buffer, c); 1721 c = phase4_getc (); 1722 if (c == '\\') 1723 c = do_getc_unicode_escaped (is_identifier_part); 1724 if (!is_identifier_part (c)) 1725 break; 1726 } 1727 phase4_ungetc (c); 1728 tp->string = xstrdup (string_buffer_result (&buffer)); 1729 tp->type = token_type_symbol; 1730 return; 1731 } 1732 else 1733 { 1734 /* Misc. operator. */ 1735 tp->type = token_type_other; 1736 return; 1737 } 1738 } 1739 } 1740} 1741 1742/* Supports 3 tokens of pushback. */ 1743static void 1744phase6_unget (token_ty *tp) 1745{ 1746 if (tp->type != token_type_eof) 1747 { 1748 if (phase6_pushback_length == SIZEOF (phase6_pushback)) 1749 abort (); 1750 phase6_pushback[phase6_pushback_length++] = *tp; 1751 } 1752} 1753 1754 1755/* Compile-time optimization of string literal concatenation. 1756 Combine "string1" + ... + "stringN" to the concatenated string if 1757 - the token after this expression is not '.' (because then the last 1758 string could be part of a method call expression). */ 1759 1760static token_ty phase7_pushback[2]; 1761static int phase7_pushback_length; 1762 1763static void 1764phase7_get (token_ty *tp) 1765{ 1766 if (phase7_pushback_length) 1767 { 1768 *tp = phase7_pushback[--phase7_pushback_length]; 1769 return; 1770 } 1771 1772 phase6_get (tp); 1773 if (tp->type == token_type_string_literal) 1774 { 1775 char *sum = tp->string; 1776 size_t sum_len = strlen (sum); 1777 1778 for (;;) 1779 { 1780 token_ty token2; 1781 1782 phase6_get (&token2); 1783 if (token2.type == token_type_plus) 1784 { 1785 token_ty token3; 1786 1787 phase6_get (&token3); 1788 if (token3.type == token_type_string_literal) 1789 { 1790 token_ty token_after; 1791 1792 phase6_get (&token_after); 1793 if (token_after.type != token_type_dot) 1794 { 1795 char *addend = token3.string; 1796 size_t addend_len = strlen (addend); 1797 1798 sum = (char *) xrealloc (sum, sum_len + addend_len + 1); 1799 memcpy (sum + sum_len, addend, addend_len + 1); 1800 sum_len += addend_len; 1801 1802 phase6_unget (&token_after); 1803 free_token (&token3); 1804 free_token (&token2); 1805 continue; 1806 } 1807 phase6_unget (&token_after); 1808 } 1809 phase6_unget (&token3); 1810 } 1811 phase6_unget (&token2); 1812 break; 1813 } 1814 tp->string = sum; 1815 } 1816} 1817 1818/* Supports 2 tokens of pushback. */ 1819static void 1820phase7_unget (token_ty *tp) 1821{ 1822 if (tp->type != token_type_eof) 1823 { 1824 if (phase7_pushback_length == SIZEOF (phase7_pushback)) 1825 abort (); 1826 phase7_pushback[phase7_pushback_length++] = *tp; 1827 } 1828} 1829 1830 1831static void 1832x_csharp_lex (token_ty *tp) 1833{ 1834 phase7_get (tp); 1835} 1836 1837/* Supports 2 tokens of pushback. */ 1838static void 1839x_csharp_unlex (token_ty *tp) 1840{ 1841 phase7_unget (tp); 1842} 1843 1844 1845/* ========================= Extracting strings. ========================== */ 1846 1847 1848/* Context lookup table. */ 1849static flag_context_list_table_ty *flag_context_list_table; 1850 1851 1852/* The file is broken into tokens. Scan the token stream, looking for 1853 a keyword, followed by a left paren, followed by a string. When we 1854 see this sequence, we have something to remember. We assume we are 1855 looking at a valid C or C++ program, and leave the complaints about 1856 the grammar to the compiler. 1857 1858 Normal handling: Look for 1859 keyword ( ... msgid ... ) 1860 Plural handling: Look for 1861 keyword ( ... msgid ... msgid_plural ... ) 1862 1863 We use recursion because the arguments before msgid or between msgid 1864 and msgid_plural can contain subexpressions of the same form. */ 1865 1866 1867/* Extract messages until the next balanced closing parenthesis or brace, 1868 depending on TERMINATOR. 1869 Extracted messages are added to MLP. 1870 Return true upon eof, false upon closing parenthesis or brace. */ 1871static bool 1872extract_parenthesized (message_list_ty *mlp, token_type_ty terminator, 1873 flag_context_ty outer_context, 1874 flag_context_list_iterator_ty context_iter, 1875 struct arglist_parser *argparser) 1876{ 1877 /* Current argument number. */ 1878 int arg = 1; 1879 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1880 int state; 1881 /* Parameters of the keyword just seen. Defined only in state 1. */ 1882 const struct callshapes *next_shapes = NULL; 1883 /* Context iterator that will be used if the next token is a '('. */ 1884 flag_context_list_iterator_ty next_context_iter = 1885 passthrough_context_list_iterator; 1886 /* Current context. */ 1887 flag_context_ty inner_context = 1888 inherited_context (outer_context, 1889 flag_context_list_iterator_advance (&context_iter)); 1890 1891 /* Start state is 0. */ 1892 state = 0; 1893 1894 for (;;) 1895 { 1896 token_ty token; 1897 1898 x_csharp_lex (&token); 1899 switch (token.type) 1900 { 1901 case token_type_symbol: 1902 { 1903 /* Combine symbol1 . ... . symbolN to a single strings, so that 1904 we can recognize static function calls like 1905 GettextResource.gettext. The information present for 1906 symbolI.....symbolN has precedence over the information for 1907 symbolJ.....symbolN with J > I. */ 1908 char *sum = token.string; 1909 size_t sum_len = strlen (sum); 1910 const char *dottedname; 1911 flag_context_list_ty *context_list; 1912 1913 for (;;) 1914 { 1915 token_ty token2; 1916 1917 x_csharp_lex (&token2); 1918 if (token2.type == token_type_dot) 1919 { 1920 token_ty token3; 1921 1922 x_csharp_lex (&token3); 1923 if (token3.type == token_type_symbol) 1924 { 1925 char *addend = token3.string; 1926 size_t addend_len = strlen (addend); 1927 1928 sum = 1929 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1); 1930 sum[sum_len] = '.'; 1931 memcpy (sum + sum_len + 1, addend, addend_len + 1); 1932 sum_len += 1 + addend_len; 1933 1934 free_token (&token3); 1935 free_token (&token2); 1936 continue; 1937 } 1938 x_csharp_unlex (&token3); 1939 } 1940 x_csharp_unlex (&token2); 1941 break; 1942 } 1943 1944 for (dottedname = sum;;) 1945 { 1946 void *keyword_value; 1947 1948 if (hash_find_entry (&keywords, dottedname, strlen (dottedname), 1949 &keyword_value) 1950 == 0) 1951 { 1952 next_shapes = (const struct callshapes *) keyword_value; 1953 state = 1; 1954 break; 1955 } 1956 1957 dottedname = strchr (dottedname, '.'); 1958 if (dottedname == NULL) 1959 { 1960 state = 0; 1961 break; 1962 } 1963 dottedname++; 1964 } 1965 1966 for (dottedname = sum;;) 1967 { 1968 context_list = 1969 flag_context_list_table_lookup ( 1970 flag_context_list_table, 1971 dottedname, strlen (dottedname)); 1972 if (context_list != NULL) 1973 break; 1974 1975 dottedname = strchr (dottedname, '.'); 1976 if (dottedname == NULL) 1977 break; 1978 dottedname++; 1979 } 1980 next_context_iter = flag_context_list_iterator (context_list); 1981 1982 free (sum); 1983 continue; 1984 } 1985 1986 case token_type_lparen: 1987 if (extract_parenthesized (mlp, token_type_rparen, 1988 inner_context, next_context_iter, 1989 arglist_parser_alloc (mlp, 1990 state ? next_shapes : NULL))) 1991 { 1992 xgettext_current_source_encoding = po_charset_utf8; 1993 arglist_parser_done (argparser, arg); 1994 xgettext_current_source_encoding = xgettext_global_source_encoding; 1995 return true; 1996 } 1997 next_context_iter = null_context_list_iterator; 1998 state = 0; 1999 continue; 2000 2001 case token_type_rparen: 2002 if (terminator == token_type_rparen) 2003 { 2004 xgettext_current_source_encoding = po_charset_utf8; 2005 arglist_parser_done (argparser, arg); 2006 xgettext_current_source_encoding = xgettext_global_source_encoding; 2007 return false; 2008 } 2009 if (terminator == token_type_rbrace) 2010 { 2011 error_with_progname = false; 2012 error (0, 0, 2013 _("%s:%d: warning: ')' found where '}' was expected"), 2014 logical_file_name, token.line_number); 2015 error_with_progname = true; 2016 } 2017 next_context_iter = null_context_list_iterator; 2018 state = 0; 2019 continue; 2020 2021 case token_type_lbrace: 2022 if (extract_parenthesized (mlp, token_type_rbrace, 2023 null_context, null_context_list_iterator, 2024 arglist_parser_alloc (mlp, NULL))) 2025 { 2026 xgettext_current_source_encoding = po_charset_utf8; 2027 arglist_parser_done (argparser, arg); 2028 xgettext_current_source_encoding = xgettext_global_source_encoding; 2029 return true; 2030 } 2031 next_context_iter = null_context_list_iterator; 2032 state = 0; 2033 continue; 2034 2035 case token_type_rbrace: 2036 if (terminator == token_type_rbrace) 2037 { 2038 xgettext_current_source_encoding = po_charset_utf8; 2039 arglist_parser_done (argparser, arg); 2040 xgettext_current_source_encoding = xgettext_global_source_encoding; 2041 return false; 2042 } 2043 if (terminator == token_type_rparen) 2044 { 2045 error_with_progname = false; 2046 error (0, 0, 2047 _("%s:%d: warning: '}' found where ')' was expected"), 2048 logical_file_name, token.line_number); 2049 error_with_progname = true; 2050 } 2051 next_context_iter = null_context_list_iterator; 2052 state = 0; 2053 continue; 2054 2055 case token_type_comma: 2056 arg++; 2057 inner_context = 2058 inherited_context (outer_context, 2059 flag_context_list_iterator_advance ( 2060 &context_iter)); 2061 next_context_iter = passthrough_context_list_iterator; 2062 state = 0; 2063 continue; 2064 2065 case token_type_string_literal: 2066 { 2067 lex_pos_ty pos; 2068 pos.file_name = logical_file_name; 2069 pos.line_number = token.line_number; 2070 2071 xgettext_current_source_encoding = po_charset_utf8; 2072 if (extract_all) 2073 remember_a_message (mlp, NULL, token.string, inner_context, 2074 &pos, token.comment); 2075 else 2076 arglist_parser_remember (argparser, arg, token.string, 2077 inner_context, 2078 pos.file_name, pos.line_number, 2079 token.comment); 2080 xgettext_current_source_encoding = xgettext_global_source_encoding; 2081 } 2082 drop_reference (token.comment); 2083 next_context_iter = null_context_list_iterator; 2084 state = 0; 2085 continue; 2086 2087 case token_type_eof: 2088 xgettext_current_source_encoding = po_charset_utf8; 2089 arglist_parser_done (argparser, arg); 2090 xgettext_current_source_encoding = xgettext_global_source_encoding; 2091 return true; 2092 2093 case token_type_dot: 2094 case token_type_number: 2095 case token_type_plus: 2096 case token_type_other: 2097 next_context_iter = null_context_list_iterator; 2098 state = 0; 2099 continue; 2100 2101 default: 2102 abort (); 2103 } 2104 } 2105} 2106 2107 2108void 2109extract_csharp (FILE *f, 2110 const char *real_filename, const char *logical_filename, 2111 flag_context_list_table_ty *flag_table, 2112 msgdomain_list_ty *mdlp) 2113{ 2114 message_list_ty *mlp = mdlp->item[0]->messages; 2115 2116 fp = f; 2117 real_file_name = real_filename; 2118 logical_file_name = xstrdup (logical_filename); 2119 line_number = 1; 2120 2121 logical_line_number = 1; 2122 last_comment_line = -1; 2123 last_non_comment_line = -1; 2124 2125 flag_context_list_table = flag_table; 2126 2127 init_keywords (); 2128 2129 /* Eat tokens until eof is seen. When extract_parenthesized returns 2130 due to an unbalanced closing parenthesis, just restart it. */ 2131 while (!extract_parenthesized (mlp, token_type_eof, 2132 null_context, null_context_list_iterator, 2133 arglist_parser_alloc (mlp, NULL))) 2134 ; 2135 2136 fp = NULL; 2137 real_file_name = NULL; 2138 logical_file_name = NULL; 2139 line_number = 0; 2140} 2141