1/* xgettext Lisp backend. 2 Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20#ifdef HAVE_CONFIG_H 21# include "config.h" 22#endif 23 24#include <errno.h> 25#include <stdbool.h> 26#include <stdio.h> 27#include <stdlib.h> 28#include <string.h> 29 30#include "message.h" 31#include "xgettext.h" 32#include "x-lisp.h" 33#include "error.h" 34#include "xalloc.h" 35#include "exit.h" 36#include "hash.h" 37#include "gettext.h" 38 39#define _(s) gettext(s) 40 41 42/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2. 43 Since we are interested only in strings and in forms similar to 44 (gettext msgid ...) 45 or (ngettext msgid msgid_plural ...) 46 we make the following simplifications: 47 48 - Assume the keywords and strings are in an ASCII compatible encoding. 49 This means we can read the input file one byte at a time, instead of 50 one character at a time. No need to worry about multibyte characters: 51 If they occur as part of identifiers, they most probably act as 52 constituent characters, and the byte based approach will do the same. 53 54 - Assume the read table is the standard Common Lisp read table. 55 Non-standard read tables are mostly used to read data, not programs. 56 57 - Assume the read table case is :UPCASE, and *READ-BASE* is 10. 58 59 - Don't interpret #n= and #n#, they usually don't appear in programs. 60 61 - Don't interpret #+, #-, they are unlikely to appear in a gettext form. 62 63 The remaining syntax rules are: 64 65 - The syntax code assigned to each character, and how tokens are built 66 up from characters (single escape, multiple escape etc.). 67 68 - Comment syntax: ';' and '#| ... |#'. 69 70 - String syntax: "..." with single escapes. 71 72 - Read macros and dispatch macro character '#'. Needed to be able to 73 tell which is the n-th argument of a function call. 74 75 */ 76 77 78/* ========================= Lexer customization. ========================= */ 79 80/* 'readtable_case' is the case conversion that is applied to non-escaped 81 parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */ 82 83enum rtcase 84{ 85 case_upcase, 86 case_downcase, 87 case_preserve, 88 case_invert 89}; 90 91static enum rtcase readtable_case = case_upcase; 92 93/* 'read_base' is the assumed radix of integers and rational numbers. 94 In Common Lisp: *read-base*. */ 95static int read_base = 10; 96 97/* 'read_preserve_whitespace' specifies whether a whitespace character 98 that terminates a token must be pushed back on the input stream. 99 We set it to true, because the special newline side effect in read_object() 100 requires that read_object() sees every newline not inside a token. */ 101static bool read_preserve_whitespace = true; 102 103 104/* ====================== Keyword set customization. ====================== */ 105 106/* If true extract all strings. */ 107static bool extract_all = false; 108 109static hash_table keywords; 110static bool default_keywords = true; 111 112 113void 114x_lisp_extract_all () 115{ 116 extract_all = true; 117} 118 119 120void 121x_lisp_keyword (const char *name) 122{ 123 if (name == NULL) 124 default_keywords = false; 125 else 126 { 127 const char *end; 128 struct callshape shape; 129 const char *colon; 130 size_t len; 131 char *symname; 132 size_t i; 133 134 if (keywords.table == NULL) 135 hash_init (&keywords, 100); 136 137 split_keywordspec (name, &end, &shape); 138 139 /* The characters between name and end should form a valid Lisp symbol. 140 Extract the symbol name part. */ 141 colon = strchr (name, ':'); 142 if (colon != NULL && colon < end) 143 { 144 name = colon + 1; 145 if (name < end && *name == ':') 146 name++; 147 colon = strchr (name, ':'); 148 if (colon != NULL && colon < end) 149 return; 150 } 151 152 /* Uppercase it. */ 153 len = end - name; 154 symname = (char *) xmalloc (len); 155 for (i = 0; i < len; i++) 156 symname[i] = 157 (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]); 158 159 insert_keyword_callshape (&keywords, symname, len, &shape); 160 } 161} 162 163/* Finish initializing the keywords hash table. 164 Called after argument processing, before each file is processed. */ 165static void 166init_keywords () 167{ 168 if (default_keywords) 169 { 170 /* When adding new keywords here, also update the documentation in 171 xgettext.texi! */ 172 x_lisp_keyword ("gettext"); /* I18N:GETTEXT */ 173 x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */ 174 x_lisp_keyword ("gettext-noop"); 175 default_keywords = false; 176 } 177} 178 179void 180init_flag_table_lisp () 181{ 182 xgettext_record_flag ("gettext:1:pass-lisp-format"); 183 xgettext_record_flag ("ngettext:1:pass-lisp-format"); 184 xgettext_record_flag ("ngettext:2:pass-lisp-format"); 185 xgettext_record_flag ("gettext-noop:1:pass-lisp-format"); 186 xgettext_record_flag ("format:2:lisp-format"); 187} 188 189 190/* ======================== Reading of characters. ======================== */ 191 192/* Real filename, used in error messages about the input file. */ 193static const char *real_file_name; 194 195/* Logical filename and line number, used to label the extracted messages. */ 196static char *logical_file_name; 197static int line_number; 198 199/* The input file stream. */ 200static FILE *fp; 201 202 203/* Fetch the next character from the input file. */ 204static int 205do_getc () 206{ 207 int c = getc (fp); 208 209 if (c == EOF) 210 { 211 if (ferror (fp)) 212 error (EXIT_FAILURE, errno, _("\ 213error while reading \"%s\""), real_file_name); 214 } 215 else if (c == '\n') 216 line_number++; 217 218 return c; 219} 220 221/* Put back the last fetched character, not EOF. */ 222static void 223do_ungetc (int c) 224{ 225 if (c == '\n') 226 line_number--; 227 ungetc (c, fp); 228} 229 230 231/* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */ 232 233 234/* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */ 235 236enum syntax_code 237{ 238 syntax_illegal, /* non-printable, except whitespace */ 239 syntax_single_esc, /* '\' (single escape) */ 240 syntax_multi_esc, /* '|' (multiple escape) */ 241 syntax_constituent, /* everything else (constituent) */ 242 syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */ 243 syntax_eof, /* EOF */ 244 syntax_t_macro, /* '()'"' (terminating macro) */ 245 syntax_nt_macro /* '#' (non-terminating macro) */ 246}; 247 248/* Returns the syntax code of a character. */ 249static enum syntax_code 250syntax_code_of (unsigned char c) 251{ 252 switch (c) 253 { 254 case '\\': 255 return syntax_single_esc; 256 case '|': 257 return syntax_multi_esc; 258 case '\t': case '\n': case '\f': case '\r': case ' ': 259 return syntax_whitespace; 260 case '(': case ')': case '\'': case '"': case ',': case ';': case '`': 261 return syntax_t_macro; 262 case '#': 263 return syntax_nt_macro; 264 default: 265 if (c < ' ' && c != '\b') 266 return syntax_illegal; 267 else 268 return syntax_constituent; 269 } 270} 271 272struct char_syntax 273{ 274 int ch; /* character */ 275 enum syntax_code scode; /* syntax code */ 276}; 277 278/* Returns the next character and its syntax code. */ 279static void 280read_char_syntax (struct char_syntax *p) 281{ 282 int c = do_getc (); 283 284 p->ch = c; 285 p->scode = (c == EOF ? syntax_eof : syntax_code_of (c)); 286} 287 288/* Every character in a token has an attribute assigned. The attributes 289 help during interpretation of the token. See 290 CLHS 2.3 "Interpretation of Tokens" for the possible interpretations, 291 and CLHS 2.1.4.2 "Constituent Traits". */ 292 293enum attribute 294{ 295 a_illg, /* invalid constituent */ 296 a_pack_m, /* ':' package marker */ 297 a_alpha, /* normal alphabetic */ 298 a_escaped, /* alphabetic but not subject to case conversion */ 299 a_ratio, /* '/' */ 300 a_dot, /* '.' */ 301 a_sign, /* '+-' */ 302 a_extens, /* '_^' extension characters */ 303 a_digit, /* '0123456789' */ 304 a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */ 305 a_expodigit, /* 'esfdlESFDL' below base */ 306 a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */ 307 a_expo /* 'esfdlESFDL' */ 308}; 309 310#define is_letter_attribute(a) ((a) >= a_letter) 311#define is_number_attribute(a) ((a) >= a_ratio) 312 313/* Returns the attribute of a character, assuming base 10. */ 314static enum attribute 315attribute_of (unsigned char c) 316{ 317 switch (c) 318 { 319 case ':': 320 return a_pack_m; 321 case '/': 322 return a_ratio; 323 case '.': 324 return a_dot; 325 case '+': case '-': 326 return a_sign; 327 case '_': case '^': 328 return a_extens; 329 case '0': case '1': case '2': case '3': case '4': 330 case '5': case '6': case '7': case '8': case '9': 331 return a_digit; 332 case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j': 333 case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 334 case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': 335 case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J': 336 case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 337 case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': 338 return a_letter; 339 case 'e': case 's': case 'd': case 'f': case 'l': 340 case 'E': case 'S': case 'D': case 'F': case 'L': 341 return a_expo; 342 default: 343 /* Treat everything as valid. Never return a_illg. */ 344 return a_alpha; 345 } 346} 347 348struct token_char 349{ 350 unsigned char ch; /* character */ 351 unsigned char attribute; /* attribute */ 352}; 353 354/* A token consists of a sequence of characters with associated attribute. */ 355struct token 356{ 357 int allocated; /* number of allocated 'token_char's */ 358 int charcount; /* number of used 'token_char's */ 359 struct token_char *chars; /* the token's constituents */ 360 bool with_escape; /* whether single-escape or multiple escape occurs */ 361}; 362 363/* Initialize a 'struct token'. */ 364static inline void 365init_token (struct token *tp) 366{ 367 tp->allocated = 10; 368 tp->chars = 369 (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char)); 370 tp->charcount = 0; 371} 372 373/* Free the memory pointed to by a 'struct token'. */ 374static inline void 375free_token (struct token *tp) 376{ 377 free (tp->chars); 378} 379 380/* Ensure there is enough room in the token for one more character. */ 381static inline void 382grow_token (struct token *tp) 383{ 384 if (tp->charcount == tp->allocated) 385 { 386 tp->allocated *= 2; 387 tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char)); 388 } 389} 390 391/* Read the next token. If 'first' is given, it points to the first 392 character, which has already been read. 393 The algorithm follows CLHS 2.2 "Reader Algorithm". */ 394static void 395read_token (struct token *tp, const struct char_syntax *first) 396{ 397 bool multiple_escape_flag; 398 struct char_syntax curr; 399 400 init_token (tp); 401 tp->with_escape = false; 402 403 multiple_escape_flag = false; 404 if (first) 405 curr = *first; 406 else 407 read_char_syntax (&curr); 408 409 for (;; read_char_syntax (&curr)) 410 { 411 switch (curr.scode) 412 { 413 case syntax_illegal: 414 /* Invalid input. Be tolerant, no error message. */ 415 do_ungetc (curr.ch); 416 return; 417 418 case syntax_single_esc: 419 tp->with_escape = true; 420 read_char_syntax (&curr); 421 if (curr.scode == syntax_eof) 422 /* Invalid input. Be tolerant, no error message. */ 423 return; 424 grow_token (tp); 425 tp->chars[tp->charcount].ch = curr.ch; 426 tp->chars[tp->charcount].attribute = a_escaped; 427 tp->charcount++; 428 break; 429 430 case syntax_multi_esc: 431 multiple_escape_flag = !multiple_escape_flag; 432 tp->with_escape = true; 433 break; 434 435 case syntax_constituent: 436 case syntax_nt_macro: 437 grow_token (tp); 438 if (multiple_escape_flag) 439 { 440 tp->chars[tp->charcount].ch = curr.ch; 441 tp->chars[tp->charcount].attribute = a_escaped; 442 tp->charcount++; 443 } 444 else 445 { 446 tp->chars[tp->charcount].ch = curr.ch; 447 tp->chars[tp->charcount].attribute = attribute_of (curr.ch); 448 tp->charcount++; 449 } 450 break; 451 452 case syntax_whitespace: 453 case syntax_t_macro: 454 if (multiple_escape_flag) 455 { 456 grow_token (tp); 457 tp->chars[tp->charcount].ch = curr.ch; 458 tp->chars[tp->charcount].attribute = a_escaped; 459 tp->charcount++; 460 } 461 else 462 { 463 if (curr.scode != syntax_whitespace || read_preserve_whitespace) 464 do_ungetc (curr.ch); 465 return; 466 } 467 break; 468 469 case syntax_eof: 470 if (multiple_escape_flag) 471 /* Invalid input. Be tolerant, no error message. */ 472 ; 473 return; 474 } 475 } 476} 477 478/* A potential number is a token which 479 1. consists only of digits, '+','-','/','^','_','.' and number markers. 480 The base for digits is context dependent, but always 10 if a dot '.' 481 occurs. A number marker is a non-digit letter which is not adjacent 482 to a non-digit letter. 483 2. has at least one digit. 484 3. starts with a digit, '+','-','.','^' or '_'. 485 4. does not end with '+' or '-'. 486 See CLHS 2.3.1.1 "Potential Numbers as Tokens". 487 */ 488 489static inline bool 490has_a_dot (const struct token *tp) 491{ 492 int n = tp->charcount; 493 int i; 494 495 for (i = 0; i < n; i++) 496 if (tp->chars[i].attribute == a_dot) 497 return true; 498 return false; 499} 500 501static inline bool 502all_a_number (const struct token *tp) 503{ 504 int n = tp->charcount; 505 int i; 506 507 for (i = 0; i < n; i++) 508 if (!is_number_attribute (tp->chars[i].attribute)) 509 return false; 510 return true; 511} 512 513static inline void 514a_letter_to_digit (const struct token *tp, int base) 515{ 516 int n = tp->charcount; 517 int i; 518 519 for (i = 0; i < n; i++) 520 if (is_letter_attribute (tp->chars[i].attribute)) 521 { 522 int c = tp->chars[i].ch; 523 524 if (c >= 'a') 525 c -= 'a' - 'A'; 526 if (c - 'A' + 10 < base) 527 tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit, 528 a_expo -> a_expodigit */ 529 } 530} 531 532static inline bool 533has_a_digit (const struct token *tp) 534{ 535 int n = tp->charcount; 536 int i; 537 538 for (i = 0; i < n; i++) 539 if (tp->chars[i].attribute == a_digit 540 || tp->chars[i].attribute == a_letterdigit 541 || tp->chars[i].attribute == a_expodigit) 542 return true; 543 return false; 544} 545 546static inline bool 547has_adjacent_letters (const struct token *tp) 548{ 549 int n = tp->charcount; 550 int i; 551 552 for (i = 1; i < n; i++) 553 if (is_letter_attribute (tp->chars[i-1].attribute) 554 && is_letter_attribute (tp->chars[i].attribute)) 555 return true; 556 return false; 557} 558 559static bool 560is_potential_number (const struct token *tp, int *basep) 561{ 562 /* CLHS 2.3.1.1.1: 563 "A potential number cannot contain any escape characters." */ 564 if (tp->with_escape) 565 return false; 566 567 if (has_a_dot (tp)) 568 *basep = 10; 569 570 if (!all_a_number (tp)) 571 return false; 572 573 a_letter_to_digit (tp, *basep); 574 575 if (!has_a_digit (tp)) 576 return false; 577 578 if (has_adjacent_letters (tp)) 579 return false; 580 581 if (!(tp->chars[0].attribute >= a_dot 582 && tp->chars[0].attribute <= a_expodigit)) 583 return false; 584 585 if (tp->chars[tp->charcount - 1].attribute == a_sign) 586 return false; 587 588 return true; 589} 590 591/* A number is one of integer, ratio, float. Each has a particular syntax. 592 See CLHS 2.3.1 "Numbers as Tokens". 593 But note a mistake: The exponent rule should read: 594 exponent ::= exponent-marker [sign] {decimal-digit}+ 595 (see 22.1.3.1.3 "Printing Floats"). */ 596 597enum number_type 598{ 599 n_none, 600 n_integer, 601 n_ratio, 602 n_float 603}; 604 605static enum number_type 606is_number (const struct token *tp, int *basep) 607{ 608 struct token_char *ptr_limit; 609 struct token_char *ptr1; 610 611 if (!is_potential_number (tp, basep)) 612 return n_none; 613 614 /* is_potential_number guarantees 615 - all attributes are >= a_ratio, 616 - there is at least one a_digit or a_letterdigit or a_expodigit, and 617 - if there is an a_dot, then *basep = 10. */ 618 619 ptr1 = &tp->chars[0]; 620 ptr_limit = &tp->chars[tp->charcount]; 621 622 if (ptr1->attribute == a_sign) 623 ptr1++; 624 625 /* Test for syntax 626 * { a_sign | } 627 * { a_digit < base }+ { a_ratio { a_digit < base }+ | } 628 */ 629 { 630 bool seen_a_ratio = false; 631 bool seen_a_digit = false; /* seen a digit in last digit block? */ 632 struct token_char *ptr; 633 634 for (ptr = ptr1;; ptr++) 635 { 636 if (ptr >= ptr_limit) 637 { 638 if (!seen_a_digit) 639 break; 640 if (seen_a_ratio) 641 return n_ratio; 642 else 643 return n_integer; 644 } 645 if (ptr->attribute == a_digit 646 || ptr->attribute == a_letterdigit 647 || ptr->attribute == a_expodigit) 648 { 649 int c = ptr->ch; 650 651 c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10); 652 if (c >= *basep) 653 break; 654 seen_a_digit = true; 655 } 656 else if (ptr->attribute == a_ratio) 657 { 658 if (seen_a_ratio || !seen_a_digit) 659 break; 660 seen_a_ratio = true; 661 seen_a_digit = false; 662 } 663 else 664 break; 665 } 666 } 667 668 /* Test for syntax 669 * { a_sign | } 670 * { a_digit }* { a_dot { a_digit }* | } 671 * { a_expo { a_sign | } { a_digit }+ | } 672 * 673 * If there is an exponent part, there must be digits before the dot or 674 * after the dot. The result is a float. 675 * If there is no exponen: 676 * If there is no dot, it would an integer in base 10, but is has already 677 * been verified to not be an integer in the current base. 678 * If there is a dot: 679 * If there are digits after the dot, it's a float. 680 * Otherwise, if there are digits before the dot, it's an integer. 681 */ 682 *basep = 10; 683 { 684 bool seen_a_dot = false; 685 bool seen_a_dot_with_leading_digits = false; 686 bool seen_a_digit = false; /* seen a digit in last digit block? */ 687 struct token_char *ptr; 688 689 for (ptr = ptr1;; ptr++) 690 { 691 if (ptr >= ptr_limit) 692 { 693 /* no exponent */ 694 if (!seen_a_dot) 695 return n_none; 696 if (seen_a_digit) 697 return n_float; 698 if (seen_a_dot_with_leading_digits) 699 return n_integer; 700 else 701 return n_none; 702 } 703 if (ptr->attribute == a_digit) 704 { 705 seen_a_digit = true; 706 } 707 else if (ptr->attribute == a_dot) 708 { 709 if (seen_a_dot) 710 return n_none; 711 seen_a_dot = true; 712 if (seen_a_digit) 713 seen_a_dot_with_leading_digits = true; 714 seen_a_digit = false; 715 } 716 else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit) 717 break; 718 else 719 return n_none; 720 } 721 ptr++; 722 if (!seen_a_dot_with_leading_digits || !seen_a_digit) 723 return n_none; 724 if (ptr >= ptr_limit) 725 return n_none; 726 if (ptr->attribute == a_sign) 727 ptr++; 728 seen_a_digit = false; 729 for (;; ptr++) 730 { 731 if (ptr >= ptr_limit) 732 break; 733 if (ptr->attribute != a_digit) 734 return n_none; 735 seen_a_digit = true; 736 } 737 if (!seen_a_digit) 738 return n_none; 739 return n_float; 740 } 741} 742 743/* A token representing a symbol must be case converted. 744 For portability, we convert only ASCII characters here. */ 745 746static void 747upcase_token (struct token *tp) 748{ 749 int n = tp->charcount; 750 int i; 751 752 for (i = 0; i < n; i++) 753 if (tp->chars[i].attribute != a_escaped) 754 { 755 unsigned char c = tp->chars[i].ch; 756 if (c >= 'a' && c <= 'z') 757 tp->chars[i].ch = c - 'a' + 'A'; 758 } 759} 760 761static void 762downcase_token (struct token *tp) 763{ 764 int n = tp->charcount; 765 int i; 766 767 for (i = 0; i < n; i++) 768 if (tp->chars[i].attribute != a_escaped) 769 { 770 unsigned char c = tp->chars[i].ch; 771 if (c >= 'A' && c <= 'Z') 772 tp->chars[i].ch = c - 'A' + 'a'; 773 } 774} 775 776static void 777case_convert_token (struct token *tp) 778{ 779 int n = tp->charcount; 780 int i; 781 782 switch (readtable_case) 783 { 784 case case_upcase: 785 upcase_token (tp); 786 break; 787 788 case case_downcase: 789 downcase_token (tp); 790 break; 791 792 case case_preserve: 793 break; 794 795 case case_invert: 796 { 797 bool seen_uppercase = false; 798 bool seen_lowercase = false; 799 for (i = 0; i < n; i++) 800 if (tp->chars[i].attribute != a_escaped) 801 { 802 unsigned char c = tp->chars[i].ch; 803 if (c >= 'a' && c <= 'z') 804 seen_lowercase = true; 805 if (c >= 'A' && c <= 'Z') 806 seen_uppercase = true; 807 } 808 if (seen_uppercase) 809 { 810 if (!seen_lowercase) 811 downcase_token (tp); 812 } 813 else 814 { 815 if (seen_lowercase) 816 upcase_token (tp); 817 } 818 } 819 break; 820 } 821} 822 823 824/* ========================= Accumulating comments ========================= */ 825 826 827static char *buffer; 828static size_t bufmax; 829static size_t buflen; 830 831static inline void 832comment_start () 833{ 834 buflen = 0; 835} 836 837static inline void 838comment_add (int c) 839{ 840 if (buflen >= bufmax) 841 { 842 bufmax = 2 * bufmax + 10; 843 buffer = xrealloc (buffer, bufmax); 844 } 845 buffer[buflen++] = c; 846} 847 848static inline void 849comment_line_end (size_t chars_to_remove) 850{ 851 buflen -= chars_to_remove; 852 while (buflen >= 1 853 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 854 --buflen; 855 if (chars_to_remove == 0 && buflen >= bufmax) 856 { 857 bufmax = 2 * bufmax + 10; 858 buffer = xrealloc (buffer, bufmax); 859 } 860 buffer[buflen] = '\0'; 861 savable_comment_add (buffer); 862} 863 864 865/* These are for tracking whether comments count as immediately before 866 keyword. */ 867static int last_comment_line; 868static int last_non_comment_line; 869 870 871/* ========================= Accumulating messages ========================= */ 872 873 874static message_list_ty *mlp; 875 876 877/* ============== Reading of objects. See CLHS 2 "Syntax". ============== */ 878 879 880/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings. 881 Other objects need not to be represented precisely. */ 882enum object_type 883{ 884 t_symbol, /* symbol */ 885 t_string, /* string */ 886 t_other, /* other kind of real object */ 887 t_dot, /* '.' pseudo object */ 888 t_close, /* ')' pseudo object */ 889 t_eof /* EOF marker */ 890}; 891 892struct object 893{ 894 enum object_type type; 895 struct token *token; /* for t_symbol and t_string */ 896 int line_number_at_start; /* for t_string */ 897}; 898 899/* Free the memory pointed to by a 'struct object'. */ 900static inline void 901free_object (struct object *op) 902{ 903 if (op->type == t_symbol || op->type == t_string) 904 { 905 free_token (op->token); 906 free (op->token); 907 } 908} 909 910/* Convert a t_symbol/t_string token to a char*. */ 911static char * 912string_of_object (const struct object *op) 913{ 914 char *str; 915 const struct token_char *p; 916 char *q; 917 int n; 918 919 if (!(op->type == t_symbol || op->type == t_string)) 920 abort (); 921 n = op->token->charcount; 922 str = (char *) xmalloc (n + 1); 923 q = str; 924 for (p = op->token->chars; n > 0; p++, n--) 925 *q++ = p->ch; 926 *q = '\0'; 927 return str; 928} 929 930/* Context lookup table. */ 931static flag_context_list_table_ty *flag_context_list_table; 932 933/* Read the next object. */ 934static void 935read_object (struct object *op, flag_context_ty outer_context) 936{ 937 for (;;) 938 { 939 struct char_syntax curr; 940 941 read_char_syntax (&curr); 942 943 switch (curr.scode) 944 { 945 case syntax_eof: 946 op->type = t_eof; 947 return; 948 949 case syntax_whitespace: 950 if (curr.ch == '\n') 951 /* Comments assumed to be grouped with a message must immediately 952 precede it, with no non-whitespace token on a line between 953 both. */ 954 if (last_non_comment_line > last_comment_line) 955 savable_comment_reset (); 956 continue; 957 958 case syntax_illegal: 959 op->type = t_other; 960 return; 961 962 case syntax_single_esc: 963 case syntax_multi_esc: 964 case syntax_constituent: 965 /* Start reading a token. */ 966 op->token = (struct token *) xmalloc (sizeof (struct token)); 967 read_token (op->token, &curr); 968 last_non_comment_line = line_number; 969 970 /* Interpret the token. */ 971 972 /* Dots. */ 973 if (!op->token->with_escape 974 && op->token->charcount == 1 975 && op->token->chars[0].attribute == a_dot) 976 { 977 free_token (op->token); 978 free (op->token); 979 op->type = t_dot; 980 return; 981 } 982 /* Tokens consisting entirely of dots are illegal, but be tolerant 983 here. */ 984 985 /* Number. */ 986 { 987 int base = read_base; 988 989 if (is_number (op->token, &base) != n_none) 990 { 991 free_token (op->token); 992 free (op->token); 993 op->type = t_other; 994 return; 995 } 996 } 997 998 /* We interpret all other tokens as symbols (including 'reserved 999 tokens', i.e. potential numbers which are not numbers). */ 1000 case_convert_token (op->token); 1001 op->type = t_symbol; 1002 return; 1003 1004 case syntax_t_macro: 1005 case syntax_nt_macro: 1006 /* Read a macro. */ 1007 switch (curr.ch) 1008 { 1009 case '(': 1010 { 1011 int arg = 0; /* Current argument number. */ 1012 flag_context_list_iterator_ty context_iter; 1013 const struct callshapes *shapes = NULL; 1014 struct arglist_parser *argparser = NULL; 1015 1016 for (;; arg++) 1017 { 1018 struct object inner; 1019 flag_context_ty inner_context; 1020 1021 if (arg == 0) 1022 inner_context = null_context; 1023 else 1024 inner_context = 1025 inherited_context (outer_context, 1026 flag_context_list_iterator_advance ( 1027 &context_iter)); 1028 1029 read_object (&inner, inner_context); 1030 1031 /* Recognize end of list. */ 1032 if (inner.type == t_close) 1033 { 1034 op->type = t_other; 1035 /* Don't bother converting "()" to "NIL". */ 1036 last_non_comment_line = line_number; 1037 if (argparser != NULL) 1038 arglist_parser_done (argparser, arg); 1039 return; 1040 } 1041 1042 /* Dots are not allowed in every position. 1043 But be tolerant. */ 1044 1045 /* EOF inside list is illegal. 1046 But be tolerant. */ 1047 if (inner.type == t_eof) 1048 break; 1049 1050 if (arg == 0) 1051 { 1052 /* This is the function position. */ 1053 if (inner.type == t_symbol) 1054 { 1055 char *symbol_name = string_of_object (&inner); 1056 int i; 1057 int prefix_len; 1058 void *keyword_value; 1059 1060 /* Omit any package name. */ 1061 i = inner.token->charcount; 1062 while (i > 0 1063 && inner.token->chars[i-1].attribute != a_pack_m) 1064 i--; 1065 prefix_len = i; 1066 1067 if (hash_find_entry (&keywords, 1068 symbol_name + prefix_len, 1069 strlen (symbol_name + prefix_len), 1070 &keyword_value) 1071 == 0) 1072 shapes = (const struct callshapes *) keyword_value; 1073 1074 argparser = arglist_parser_alloc (mlp, shapes); 1075 1076 context_iter = 1077 flag_context_list_iterator ( 1078 flag_context_list_table_lookup ( 1079 flag_context_list_table, 1080 symbol_name, strlen (symbol_name))); 1081 1082 free (symbol_name); 1083 } 1084 else 1085 context_iter = null_context_list_iterator; 1086 } 1087 else 1088 { 1089 /* These are the argument positions. */ 1090 if (argparser != NULL && inner.type == t_string) 1091 arglist_parser_remember (argparser, arg, 1092 string_of_object (&inner), 1093 inner_context, 1094 logical_file_name, 1095 inner.line_number_at_start, 1096 savable_comment); 1097 } 1098 1099 free_object (&inner); 1100 } 1101 1102 if (argparser != NULL) 1103 arglist_parser_done (argparser, arg); 1104 } 1105 op->type = t_other; 1106 last_non_comment_line = line_number; 1107 return; 1108 1109 case ')': 1110 /* Tell the caller about the end of list. 1111 Unmatched closing parenthesis is illegal. 1112 But be tolerant. */ 1113 op->type = t_close; 1114 last_non_comment_line = line_number; 1115 return; 1116 1117 case ',': 1118 { 1119 int c = do_getc (); 1120 /* The ,@ handling inside lists is wrong anyway, because 1121 ,@form expands to an unknown number of elements. */ 1122 if (c != EOF && c != '@' && c != '.') 1123 do_ungetc (c); 1124 } 1125 /*FALLTHROUGH*/ 1126 case '\'': 1127 case '`': 1128 { 1129 struct object inner; 1130 1131 read_object (&inner, null_context); 1132 1133 /* Dots and EOF are not allowed here. But be tolerant. */ 1134 1135 free_object (&inner); 1136 1137 op->type = t_other; 1138 last_non_comment_line = line_number; 1139 return; 1140 } 1141 1142 case ';': 1143 { 1144 bool all_semicolons = true; 1145 1146 last_comment_line = line_number; 1147 comment_start (); 1148 for (;;) 1149 { 1150 int c = do_getc (); 1151 if (c == EOF || c == '\n') 1152 break; 1153 if (c != ';') 1154 all_semicolons = false; 1155 if (!all_semicolons) 1156 { 1157 /* We skip all leading white space, but not EOLs. */ 1158 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 1159 comment_add (c); 1160 } 1161 } 1162 comment_line_end (0); 1163 continue; 1164 } 1165 1166 case '"': 1167 { 1168 op->token = (struct token *) xmalloc (sizeof (struct token)); 1169 init_token (op->token); 1170 op->line_number_at_start = line_number; 1171 for (;;) 1172 { 1173 int c = do_getc (); 1174 if (c == EOF) 1175 /* Invalid input. Be tolerant, no error message. */ 1176 break; 1177 if (c == '"') 1178 break; 1179 if (c == '\\') /* syntax_single_esc */ 1180 { 1181 c = do_getc (); 1182 if (c == EOF) 1183 /* Invalid input. Be tolerant, no error message. */ 1184 break; 1185 } 1186 grow_token (op->token); 1187 op->token->chars[op->token->charcount++].ch = c; 1188 } 1189 op->type = t_string; 1190 1191 if (extract_all) 1192 { 1193 lex_pos_ty pos; 1194 1195 pos.file_name = logical_file_name; 1196 pos.line_number = op->line_number_at_start; 1197 remember_a_message (mlp, NULL, string_of_object (op), 1198 null_context, &pos, savable_comment); 1199 } 1200 last_non_comment_line = line_number; 1201 return; 1202 } 1203 1204 case '#': 1205 /* Dispatch macro handling. */ 1206 { 1207 int c; 1208 1209 for (;;) 1210 { 1211 c = do_getc (); 1212 if (c == EOF) 1213 /* Invalid input. Be tolerant, no error message. */ 1214 { 1215 op->type = t_other; 1216 return; 1217 } 1218 if (!(c >= '0' && c <= '9')) 1219 break; 1220 } 1221 1222 switch (c) 1223 { 1224 case '(': 1225 case '"': 1226 do_ungetc (c); 1227 /*FALLTHROUGH*/ 1228 case '\'': 1229 case ':': 1230 case '.': 1231 case ',': 1232 case 'A': case 'a': 1233 case 'C': case 'c': 1234 case 'P': case 'p': 1235 case 'S': case 's': 1236 { 1237 struct object inner; 1238 read_object (&inner, null_context); 1239 /* Dots and EOF are not allowed here. 1240 But be tolerant. */ 1241 free_object (&inner); 1242 op->type = t_other; 1243 last_non_comment_line = line_number; 1244 return; 1245 } 1246 1247 case '|': 1248 { 1249 int depth = 0; 1250 int c; 1251 1252 comment_start (); 1253 c = do_getc (); 1254 for (;;) 1255 { 1256 if (c == EOF) 1257 break; 1258 if (c == '|') 1259 { 1260 c = do_getc (); 1261 if (c == EOF) 1262 break; 1263 if (c == '#') 1264 { 1265 if (depth == 0) 1266 { 1267 comment_line_end (0); 1268 break; 1269 } 1270 depth--; 1271 comment_add ('|'); 1272 comment_add ('#'); 1273 c = do_getc (); 1274 } 1275 else 1276 comment_add ('|'); 1277 } 1278 else if (c == '#') 1279 { 1280 c = do_getc (); 1281 if (c == EOF) 1282 break; 1283 comment_add ('#'); 1284 if (c == '|') 1285 { 1286 depth++; 1287 comment_add ('|'); 1288 c = do_getc (); 1289 } 1290 } 1291 else 1292 { 1293 /* We skip all leading white space. */ 1294 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 1295 comment_add (c); 1296 if (c == '\n') 1297 { 1298 comment_line_end (1); 1299 comment_start (); 1300 } 1301 c = do_getc (); 1302 } 1303 } 1304 if (c == EOF) 1305 { 1306 /* EOF not allowed here. But be tolerant. */ 1307 op->type = t_eof; 1308 return; 1309 } 1310 last_comment_line = line_number; 1311 continue; 1312 } 1313 1314 case '\\': 1315 { 1316 struct token token; 1317 struct char_syntax first; 1318 first.ch = '\\'; 1319 first.scode = syntax_single_esc; 1320 read_token (&token, &first); 1321 free_token (&token); 1322 op->type = t_other; 1323 last_non_comment_line = line_number; 1324 return; 1325 } 1326 1327 case 'B': case 'b': 1328 case 'O': case 'o': 1329 case 'X': case 'x': 1330 case 'R': case 'r': 1331 case '*': 1332 { 1333 struct token token; 1334 read_token (&token, NULL); 1335 free_token (&token); 1336 op->type = t_other; 1337 last_non_comment_line = line_number; 1338 return; 1339 } 1340 1341 case '=': 1342 /* Ignore read labels. */ 1343 continue; 1344 1345 case '#': 1346 /* Don't bother looking up the corresponding object. */ 1347 op->type = t_other; 1348 last_non_comment_line = line_number; 1349 return; 1350 1351 case '+': 1352 case '-': 1353 /* Simply assume every feature expression is true. */ 1354 { 1355 struct object inner; 1356 read_object (&inner, null_context); 1357 /* Dots and EOF are not allowed here. 1358 But be tolerant. */ 1359 free_object (&inner); 1360 continue; 1361 } 1362 1363 default: 1364 op->type = t_other; 1365 last_non_comment_line = line_number; 1366 return; 1367 } 1368 /*NOTREACHED*/ 1369 abort (); 1370 } 1371 1372 default: 1373 /*NOTREACHED*/ 1374 abort (); 1375 } 1376 1377 default: 1378 /*NOTREACHED*/ 1379 abort (); 1380 } 1381 } 1382} 1383 1384 1385void 1386extract_lisp (FILE *f, 1387 const char *real_filename, const char *logical_filename, 1388 flag_context_list_table_ty *flag_table, 1389 msgdomain_list_ty *mdlp) 1390{ 1391 mlp = mdlp->item[0]->messages; 1392 1393 fp = f; 1394 real_file_name = real_filename; 1395 logical_file_name = xstrdup (logical_filename); 1396 line_number = 1; 1397 1398 last_comment_line = -1; 1399 last_non_comment_line = -1; 1400 1401 flag_context_list_table = flag_table; 1402 1403 init_keywords (); 1404 1405 /* Eat tokens until eof is seen. When read_object returns 1406 due to an unbalanced closing parenthesis, just restart it. */ 1407 do 1408 { 1409 struct object toplevel_object; 1410 1411 read_object (&toplevel_object, null_context); 1412 1413 if (toplevel_object.type == t_eof) 1414 break; 1415 1416 free_object (&toplevel_object); 1417 } 1418 while (!feof (fp)); 1419 1420 /* Close scanner. */ 1421 fp = NULL; 1422 real_file_name = NULL; 1423 logical_file_name = NULL; 1424 line_number = 0; 1425} 1426