1/* xgettext sh backend. 2 Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#ifdef HAVE_CONFIG_H 19# include "config.h" 20#endif 21 22/* Specification. */ 23#include "x-sh.h" 24 25#include <errno.h> 26#include <limits.h> 27#include <stdbool.h> 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31 32#include "message.h" 33#include "xgettext.h" 34#include "x-sh.h" 35#include "error.h" 36#include "xalloc.h" 37#include "hash.h" 38#include "gettext.h" 39 40#define _(s) gettext(s) 41 42#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 43 44 45/* The sh syntax is defined in POSIX:2001, see 46 http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html 47 Summary of sh syntax: 48 - Input is broken into words, which are then subject to 49 - tilde expansion ~... 50 - command substitution `...` 51 - variable substitution $var 52 - arithmetic substitution $((...)) 53 - field splitting at whitespace (IFS) 54 - wildcard pattern expansion *? 55 - quote removal 56 - Strings are enclosed in "..."; command substitution, variable 57 substitution and arithmetic substitution are performed here as well. 58 - '...' is a string without substitutions. 59 - The list of resulting words is split into commands by semicolon and 60 newline. 61 - '#' at the beginning of a word introduces a comment until end of line. 62 The parser is implemented in bash-2.05b/parse.y. */ 63 64 65/* ====================== Keyword set customization. ====================== */ 66 67/* If true extract all strings. */ 68static bool extract_all = false; 69 70static hash_table keywords; 71static bool default_keywords = true; 72 73 74void 75x_sh_extract_all () 76{ 77 extract_all = true; 78} 79 80 81void 82x_sh_keyword (const char *name) 83{ 84 if (name == NULL) 85 default_keywords = false; 86 else 87 { 88 const char *end; 89 struct callshape shape; 90 const char *colon; 91 92 if (keywords.table == NULL) 93 hash_init (&keywords, 100); 94 95 split_keywordspec (name, &end, &shape); 96 97 /* The characters between name and end should form a valid C identifier. 98 A colon means an invalid parse in split_keywordspec(). */ 99 colon = strchr (name, ':'); 100 if (colon == NULL || colon >= end) 101 insert_keyword_callshape (&keywords, name, end - name, &shape); 102 } 103} 104 105/* Finish initializing the keywords hash table. 106 Called after argument processing, before each file is processed. */ 107static void 108init_keywords () 109{ 110 if (default_keywords) 111 { 112 /* When adding new keywords here, also update the documentation in 113 xgettext.texi! */ 114 x_sh_keyword ("gettext"); 115 x_sh_keyword ("ngettext:1,2"); 116 x_sh_keyword ("eval_gettext"); 117 x_sh_keyword ("eval_ngettext:1,2"); 118 default_keywords = false; 119 } 120} 121 122void 123init_flag_table_sh () 124{ 125 xgettext_record_flag ("gettext:1:pass-sh-format"); 126 xgettext_record_flag ("ngettext:1:pass-sh-format"); 127 xgettext_record_flag ("ngettext:2:pass-sh-format"); 128 xgettext_record_flag ("eval_gettext:1:sh-format"); 129 xgettext_record_flag ("eval_ngettext:1:sh-format"); 130 xgettext_record_flag ("eval_ngettext:2:sh-format"); 131} 132 133 134/* ======================== Reading of characters. ======================== */ 135 136/* Real filename, used in error messages about the input file. */ 137static const char *real_file_name; 138 139/* Logical filename and line number, used to label the extracted messages. */ 140static char *logical_file_name; 141static int line_number; 142 143/* The input file stream. */ 144static FILE *fp; 145 146 147/* Fetch the next character from the input file. */ 148static int 149do_getc () 150{ 151 int c = getc (fp); 152 153 if (c == EOF) 154 { 155 if (ferror (fp)) 156 error (EXIT_FAILURE, errno, _("\ 157error while reading \"%s\""), real_file_name); 158 } 159 else if (c == '\n') 160 line_number++; 161 162 return c; 163} 164 165/* Put back the last fetched character, not EOF. */ 166static void 167do_ungetc (int c) 168{ 169 if (c == '\n') 170 line_number--; 171 ungetc (c, fp); 172} 173 174 175/* Remove backslash followed by newline from the input stream. */ 176 177static int phase1_pushback[1]; 178static int phase1_pushback_length; 179 180static int 181phase1_getc () 182{ 183 int c; 184 185 if (phase1_pushback_length) 186 { 187 c = phase1_pushback[--phase1_pushback_length]; 188 if (c == '\n') 189 ++line_number; 190 return c; 191 } 192 for (;;) 193 { 194 c = do_getc (); 195 if (c != '\\') 196 return c; 197 c = do_getc (); 198 if (c != '\n') 199 { 200 if (c != EOF) 201 do_ungetc (c); 202 return '\\'; 203 } 204 } 205} 206 207/* Supports only one pushback character. */ 208static void 209phase1_ungetc (int c) 210{ 211 switch (c) 212 { 213 case EOF: 214 break; 215 216 case '\n': 217 --line_number; 218 /* FALLTHROUGH */ 219 220 default: 221 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 222 abort (); 223 phase1_pushback[phase1_pushback_length++] = c; 224 break; 225 } 226} 227 228 229/* ========================== Reading of tokens. ========================== */ 230 231 232/* A token consists of a sequence of characters. */ 233struct token 234{ 235 int allocated; /* number of allocated 'token_char's */ 236 int charcount; /* number of used 'token_char's */ 237 char *chars; /* the token's constituents */ 238}; 239 240/* Initialize a 'struct token'. */ 241static inline void 242init_token (struct token *tp) 243{ 244 tp->allocated = 10; 245 tp->chars = XNMALLOC (tp->allocated, char); 246 tp->charcount = 0; 247} 248 249/* Free the memory pointed to by a 'struct token'. */ 250static inline void 251free_token (struct token *tp) 252{ 253 free (tp->chars); 254} 255 256/* Ensure there is enough room in the token for one more character. */ 257static inline void 258grow_token (struct token *tp) 259{ 260 if (tp->charcount == tp->allocated) 261 { 262 tp->allocated *= 2; 263 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); 264 } 265} 266 267/* Convert a struct token * to a char*. */ 268static char * 269string_of_token (const struct token *tp) 270{ 271 char *str; 272 int n; 273 274 n = tp->charcount; 275 str = XNMALLOC (n + 1, char); 276 memcpy (str, tp->chars, n); 277 str[n] = '\0'; 278 return str; 279} 280 281 282/* ========================= Accumulating messages ========================= */ 283 284 285static message_list_ty *mlp; 286 287 288/* ========================= Accumulating comments ========================= */ 289 290 291static char *buffer; 292static size_t bufmax; 293static size_t buflen; 294 295static inline void 296comment_start () 297{ 298 buflen = 0; 299} 300 301static inline void 302comment_add (int c) 303{ 304 if (buflen >= bufmax) 305 { 306 bufmax = 2 * bufmax + 10; 307 buffer = xrealloc (buffer, bufmax); 308 } 309 buffer[buflen++] = c; 310} 311 312static inline void 313comment_line_end () 314{ 315 while (buflen >= 1 316 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 317 --buflen; 318 if (buflen >= bufmax) 319 { 320 bufmax = 2 * bufmax + 10; 321 buffer = xrealloc (buffer, bufmax); 322 } 323 buffer[buflen] = '\0'; 324 savable_comment_add (buffer); 325} 326 327 328/* These are for tracking whether comments count as immediately before 329 keyword. */ 330static int last_comment_line; 331static int last_non_comment_line; 332 333 334/* ========================= Debackslashification ========================== */ 335 336/* This state tracks the effect of backquotes, double-quotes and single-quotes 337 on the parsing of backslashes. We make a single pass through the input 338 file, keeping the state up to date. This is much faster than accumulating 339 strings and processing them with explicit debackslashification, like the 340 shell does it. */ 341 342/* The number of nested `...` or "`...`" constructs. Assumed to be <= 32. */ 343static unsigned int nested_backquotes; 344 345/* A bit mask indicating which of the currently open `...` or "`...`" 346 constructs is with double-quotes: "`...`". 347 A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`. 348 Bit position 0 designates the outermost backquotes nesting, 349 bit position 1 the second-outermost backquotes nesting, 350 ... 351 bit position (nested_backquotes-1) the innermost backquotes nesting. */ 352static unsigned int open_doublequotes_mask; 353 354/* A bit indicating whether a double-quote is currently open inside the 355 innermost backquotes nesting. */ 356static bool open_doublequote; 357 358/* A bit indicating whether a single-quote is currently open inside the 359 innermost backquotes nesting. */ 360static bool open_singlequote; 361 362/* The expected terminator of the currently open single-quote. 363 Usually '\'', but can be '"' for i18n-quotes. */ 364static char open_singlequote_terminator; 365 366 367/* Functions to update the state. */ 368 369static inline void 370saw_opening_backquote () 371{ 372 if (open_singlequote) 373 abort (); 374 if (open_doublequote) 375 open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes; 376 nested_backquotes++; 377 open_doublequote = false; 378} 379 380static inline void 381saw_closing_backquote () 382{ 383 nested_backquotes--; 384 open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1; 385 open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1; 386 open_singlequote = false; /* just for safety */ 387} 388 389static inline void 390saw_opening_doublequote () 391{ 392 if (open_singlequote || open_doublequote) 393 abort (); 394 open_doublequote = true; 395} 396 397static inline void 398saw_closing_doublequote () 399{ 400 if (open_singlequote || !open_doublequote) 401 abort (); 402 open_doublequote = false; 403} 404 405static inline void 406saw_opening_singlequote () 407{ 408 if (open_doublequote || open_singlequote) 409 abort (); 410 open_singlequote = true; 411 open_singlequote_terminator = '\''; 412} 413 414static inline void 415saw_closing_singlequote () 416{ 417 if (open_doublequote || !open_singlequote) 418 abort (); 419 open_singlequote = false; 420} 421 422 423/* ========================== Reading of commands ========================== */ 424 425/* We are only interested in constant strings. Other words need not to be 426 represented precisely. */ 427enum word_type 428{ 429 t_string, /* constant string */ 430 t_other, /* other string */ 431 t_separator, /* command separator: semicolon or newline */ 432 t_redirect, /* redirection: one of < > >| << <<- >> <> <& >& */ 433 t_backquote, /* closing '`' pseudo word */ 434 t_paren, /* closing ')' pseudo word */ 435 t_eof /* EOF marker */ 436}; 437 438struct word 439{ 440 enum word_type type; 441 struct token *token; /* for t_string */ 442 int line_number_at_start; /* for t_string */ 443}; 444 445/* Free the memory pointed to by a 'struct word'. */ 446static inline void 447free_word (struct word *wp) 448{ 449 if (wp->type == t_string) 450 { 451 free_token (wp->token); 452 free (wp->token); 453 } 454} 455 456/* Convert a t_string token to a char*. */ 457static char * 458string_of_word (const struct word *wp) 459{ 460 char *str; 461 int n; 462 463 if (!(wp->type == t_string)) 464 abort (); 465 n = wp->token->charcount; 466 str = XNMALLOC (n + 1, char); 467 memcpy (str, wp->token->chars, n); 468 str[n] = '\0'; 469 return str; 470} 471 472 473/* Whitespace recognition. */ 474 475static inline bool 476is_whitespace (int c) 477{ 478 return (c == ' ' || c == '\t' || c == '\n'); 479} 480 481/* Operator character recognition. */ 482 483static inline bool 484is_operator_start (int c) 485{ 486 return (c == '|' || c == '&' || c == ';' || c == '<' || c == '>' 487 || c == '(' || c == ')'); 488} 489 490 491/* Denotation of a quoted character. 492 The distinction between quoted and unquoted character is important only for 493 the special, whitespace and operator characters; it is irrelevant for 494 alphanumeric characters, '\\' and many others. */ 495#define QUOTED(c) (UCHAR_MAX + 1 + (c)) 496/* Values in the 'unsigned char' range are implicitly unquoted. Among these, 497 the following are important: 498 '"' opening or closing double quote 499 '\'' opening or closing single quote 500 '$' the unknown result of a dollar expansion 501 '`' does not occur - replaced with OPENING_BACKQUOTE or 502 CLOSING_BACKQUOTE 503 */ 504#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`') 505#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`') 506 507/* 2 characters of pushback are supported. 508 2 characters of pushback occur only when the first is an 'x'; in all 509 other cases only one character of pushback is needed. */ 510static int phase2_pushback[2]; 511static int phase2_pushback_length; 512 513/* Return the next character, with backslashes removed. 514 The result is QUOTED(c) for some unsigned char c, if the next character 515 is escaped sufficiently often to make it a regular constituent character, 516 or simply an 'unsigned char' if it has its special meaning (of special, 517 whitespace or operator charcter), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE, 518 EOF. 519 It's the caller's responsibility to update the state. */ 520static int 521phase2_getc () 522{ 523 int c; 524 525 if (phase2_pushback_length) 526 { 527 c = phase2_pushback[--phase2_pushback_length]; 528 if (c == '\n') 529 ++line_number; 530 return c; 531 } 532 533 c = phase1_getc (); 534 if (c == EOF) 535 return c; 536 if (c == '\'') 537 return ((open_doublequote 538 || (open_singlequote && open_singlequote_terminator != c)) 539 ? QUOTED (c) 540 : c); 541 if (open_singlequote) 542 { 543 if (c == open_singlequote_terminator) 544 return c; 545 } 546 else 547 { 548 if (c == '"' || c == '$') 549 return c; 550 if (c == '`') 551 return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE); 552 } 553 if (c == '\\') 554 { 555 /* Number of debackslahificication passes that are active at the 556 current point. */ 557 unsigned int debackslahify = 558 nested_backquotes + (open_singlequote ? 0 : 1); 559 /* Normal number of backslashes that yield a single backslash in the 560 final output. */ 561 unsigned int expected_count = 562 (unsigned int) 1 << debackslahify; 563 /* Number of backslashes found. */ 564 unsigned int count; 565 566 for (count = 1; count < expected_count; count++) 567 { 568 c = phase1_getc (); 569 if (c != '\\') 570 break; 571 } 572 if (count == expected_count) 573 return '\\'; 574 575 /* The count of backslashes is > 0 and < expected_count, therefore the 576 result depends on c, the first character after the backslashes. 577 Note: The formulas below don't necessarily have a logic; they were 578 empirically determined such that 1. the xgettext-30 test succeeds, 579 2. the behaviour for count == 0 would correspond to the one without 580 any baskslash. */ 581 if (c == '\'') 582 { 583 if (!open_singlequote && count > (expected_count >> 1)) 584 { 585 phase1_ungetc (c); 586 return '\\'; 587 } 588 else 589 return ((open_doublequote 590 || (open_singlequote && open_singlequote_terminator != c)) 591 ? QUOTED (c) 592 : c); 593 } 594 else if (c == '"') 595 { 596 /* Each debackslahificication pass converts \\ to \ and \" to "; 597 passes corresponding to `...` drop a lone " whereas passes 598 corresponding to "`...`" leave it alone. Therefore, the 599 minimum number of backslashes needed to get one double-quote 600 in the end is open_doublequotes_mask + 1. */ 601 if (open_singlequote) 602 { 603 if (count > open_doublequotes_mask) 604 { 605 phase1_ungetc (c); 606 return '\\'; 607 } 608 else 609 return (open_singlequote_terminator != c ? QUOTED (c) : c); 610 } 611 else 612 { 613 if (count > open_doublequotes_mask) 614 return QUOTED (c); 615 else 616 /* Some of the count values <= open_doublequotes_mask are 617 actually invalid here, but we assume a syntactically 618 correct input file anyway. */ 619 return c; 620 } 621 } 622 else if (c == '`') 623 { 624 /* FIXME: This code looks fishy. */ 625 if (count == expected_count - 1) 626 return c; 627 else 628 /* Some of the count values < expected_count - 1 are 629 actually invalid here, but we assume a syntactically 630 correct input file anyway. */ 631 if (nested_backquotes > 0 && !open_singlequote 632 && count >= (expected_count >> 2)) 633 return OPENING_BACKQUOTE; 634 else 635 return CLOSING_BACKQUOTE; 636 } 637 else if (c == '$') 638 { 639 if (open_singlequote) 640 return QUOTED (c); 641 if (count >= (expected_count >> 1)) 642 return QUOTED (c); 643 else 644 return c; 645 } 646 else 647 { 648 /* When not followed by a quoting character or backslash or dollar, 649 a backslash survives a debackslahificication pass unmodified. 650 Therefore each debackslahificication pass performs a 651 count := (count + 1) >> 1 652 operation. Therefore the minimum number of backslashes needed 653 to get one backslash in the end is (expected_count >> 1) + 1. */ 654 if (open_doublequote || open_singlequote) 655 { 656 if (count > 0) 657 { 658 phase1_ungetc (c); 659 return '\\'; 660 } 661 else 662 return QUOTED (c); 663 } 664 else 665 { 666 if (count > (expected_count >> 1)) 667 { 668 phase1_ungetc (c); 669 return '\\'; 670 } 671 else if (count > 0) 672 return QUOTED (c); 673 else 674 return c; 675 } 676 } 677 } 678 679 return (open_singlequote || open_doublequote ? QUOTED (c) : c); 680} 681 682/* Supports 2 characters of pushback. */ 683static void 684phase2_ungetc (int c) 685{ 686 switch (c) 687 { 688 case EOF: 689 break; 690 691 case '\n': 692 --line_number; 693 /* FALLTHROUGH */ 694 695 default: 696 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 697 abort (); 698 phase2_pushback[phase2_pushback_length++] = c; 699 break; 700 } 701} 702 703 704/* Context lookup table. */ 705static flag_context_list_table_ty *flag_context_list_table; 706 707 708/* Forward declaration of local functions. */ 709static enum word_type read_command_list (int looking_for, 710 flag_context_ty outer_context); 711 712 713 714/* Read the next word. 715 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 716 or '\0'. */ 717static void 718read_word (struct word *wp, int looking_for, flag_context_ty context) 719{ 720 int c; 721 bool all_unquoted_digits; 722 723 do 724 { 725 c = phase2_getc (); 726 if (c == '#') 727 { 728 /* Skip a comment up to end of line. */ 729 last_comment_line = line_number; 730 comment_start (); 731 for (;;) 732 { 733 c = phase1_getc (); 734 if (c == EOF || c == '\n') 735 break; 736 /* We skip all leading white space, but not EOLs. */ 737 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 738 comment_add (c); 739 } 740 comment_line_end (); 741 } 742 if (c == '\n') 743 { 744 /* Comments assumed to be grouped with a message must immediately 745 precede it, with no non-whitespace token on a line between 746 both. */ 747 if (last_non_comment_line > last_comment_line) 748 savable_comment_reset (); 749 wp->type = t_separator; 750 return; 751 } 752 } 753 while (is_whitespace (c)); 754 755 if (c == EOF) 756 { 757 wp->type = t_eof; 758 return; 759 } 760 761 if (c == '<' || c == '>') 762 { 763 /* Recognize the redirection operators < > >| << <<- >> <> <& >& 764 But <( and >) are handled below, not here. */ 765 int c2 = phase2_getc (); 766 if (c2 != '(') 767 { 768 if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') 769 { 770 if (c == '<' && c2 == '<') 771 { 772 int c3 = phase2_getc (); 773 if (c3 != '-') 774 phase2_ungetc (c3); 775 } 776 } 777 else 778 phase2_ungetc (c2); 779 wp->type = t_redirect; 780 return; 781 } 782 else 783 phase2_ungetc (c2); 784 } 785 786 if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE) 787 { 788 saw_closing_backquote (); 789 wp->type = t_backquote; 790 last_non_comment_line = line_number; 791 return; 792 } 793 794 if (looking_for == ')' && c == ')') 795 { 796 wp->type = t_paren; 797 last_non_comment_line = line_number; 798 return; 799 } 800 801 if (is_operator_start (c)) 802 { 803 wp->type = (c == ';' ? t_separator : t_other); 804 return; 805 } 806 807 wp->type = t_string; 808 wp->token = XMALLOC (struct token); 809 init_token (wp->token); 810 wp->line_number_at_start = line_number; 811 all_unquoted_digits = true; 812 813 for (;; c = phase2_getc ()) 814 { 815 if (c == EOF) 816 break; 817 818 if (all_unquoted_digits && (c == '<' || c == '>')) 819 { 820 /* Recognize the redirection operators < > >| << <<- >> <> <& >& 821 prefixed with a nonempty sequence of unquoted digits. */ 822 int c2 = phase2_getc (); 823 if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') 824 { 825 if (c == '<' && c2 == '<') 826 { 827 int c3 = phase2_getc (); 828 if (c3 != '-') 829 phase2_ungetc (c3); 830 } 831 } 832 else 833 phase2_ungetc (c2); 834 835 wp->type = t_redirect; 836 free_token (wp->token); 837 free (wp->token); 838 839 last_non_comment_line = line_number; 840 841 return; 842 } 843 844 all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9'); 845 846 if (c == '$') 847 { 848 int c2; 849 850 /* An unquoted dollar indicates we are not inside '...'. */ 851 if (open_singlequote) 852 abort (); 853 /* After reading a dollar, we know that there is no pushed back 854 character from an earlier lookahead. */ 855 if (phase2_pushback_length > 0) 856 abort (); 857 /* Therefore we can use phase1 without interfering with phase2. 858 We need to recognize $( outside and inside double-quotes. 859 It would be incorrect to do 860 c2 = phase2_getc (); 861 if (c2 == '(' || c2 == QUOTED ('(')) 862 because that would also trigger for $\(. */ 863 c2 = phase1_getc (); 864 if (c2 == '(') 865 { 866 bool saved_open_doublequote; 867 int c3; 868 869 phase1_ungetc (c2); 870 871 /* The entire inner command or arithmetic expression is read 872 ignoring possible surrounding double-quotes. */ 873 saved_open_doublequote = open_doublequote; 874 open_doublequote = false; 875 876 c2 = phase2_getc (); 877 if (c2 != '(') 878 abort (); 879 880 c3 = phase2_getc (); 881 if (c3 == '(') 882 { 883 /* Arithmetic expression (Bash syntax). Skip until the 884 matching closing parenthesis. */ 885 unsigned int depth = 2; 886 887 do 888 { 889 c = phase2_getc (); 890 if (c == '(') 891 depth++; 892 else if (c == ')') 893 if (--depth == 0) 894 break; 895 } 896 while (c != EOF); 897 } 898 else 899 { 900 /* Command substitution (Bash syntax). */ 901 phase2_ungetc (c3); 902 read_command_list (')', context); 903 } 904 905 open_doublequote = saved_open_doublequote; 906 } 907 else 908 { 909 phase1_ungetc (c2); 910 c2 = phase2_getc (); 911 912 if (c2 == '\'' && !open_singlequote) 913 { 914 /* Bash builtin for string with ANSI-C escape sequences. */ 915 saw_opening_singlequote (); 916 for (;;) 917 { 918 c = phase2_getc (); 919 if (c == EOF) 920 break; 921 if (c == '\'') 922 { 923 saw_closing_singlequote (); 924 break; 925 } 926 if (c == '\\') 927 { 928 c = phase2_getc (); 929 switch (c) 930 { 931 default: 932 phase2_ungetc (c); 933 c = '\\'; 934 break; 935 936 case '\\': 937 break; 938 case '\'': 939 /* Don't call saw_closing_singlequote () 940 here. */ 941 break; 942 943 case 'a': 944 c = '\a'; 945 break; 946 case 'b': 947 c = '\b'; 948 break; 949 case 'e': 950 c = 0x1b; /* ESC */ 951 break; 952 case 'f': 953 c = '\f'; 954 break; 955 case 'n': 956 c = '\n'; 957 break; 958 case 'r': 959 c = '\r'; 960 break; 961 case 't': 962 c = '\t'; 963 break; 964 case 'v': 965 c = '\v'; 966 break; 967 968 case 'x': 969 c = phase2_getc (); 970 if ((c >= '0' && c <= '9') 971 || (c >= 'A' && c <= 'F') 972 || (c >= 'a' && c <= 'f')) 973 { 974 int n; 975 976 if (c >= '0' && c <= '9') 977 n = c - '0'; 978 else if (c >= 'A' && c <= 'F') 979 n = 10 + c - 'A'; 980 else if (c >= 'a' && c <= 'f') 981 n = 10 + c - 'a'; 982 else 983 abort (); 984 985 c = phase2_getc (); 986 if ((c >= '0' && c <= '9') 987 || (c >= 'A' && c <= 'F') 988 || (c >= 'a' && c <= 'f')) 989 { 990 if (c >= '0' && c <= '9') 991 n = n * 16 + c - '0'; 992 else if (c >= 'A' && c <= 'F') 993 n = n * 16 + 10 + c - 'A'; 994 else if (c >= 'a' && c <= 'f') 995 n = n * 16 + 10 + c - 'a'; 996 else 997 abort (); 998 } 999 else 1000 phase2_ungetc (c); 1001 1002 c = n; 1003 } 1004 else 1005 { 1006 phase2_ungetc (c); 1007 phase2_ungetc ('x'); 1008 c = '\\'; 1009 } 1010 break; 1011 1012 case '0': case '1': case '2': case '3': 1013 case '4': case '5': case '6': case '7': 1014 { 1015 int n = c - '0'; 1016 1017 c = phase2_getc (); 1018 if (c >= '0' && c <= '7') 1019 { 1020 n = n * 8 + c - '0'; 1021 1022 c = phase2_getc (); 1023 if (c >= '0' && c <= '7') 1024 n = n * 8 + c - '0'; 1025 else 1026 phase2_ungetc (c); 1027 } 1028 else 1029 phase2_ungetc (c); 1030 1031 c = n; 1032 } 1033 break; 1034 } 1035 } 1036 if (wp->type == t_string) 1037 { 1038 grow_token (wp->token); 1039 wp->token->chars[wp->token->charcount++] = 1040 (unsigned char) c; 1041 } 1042 } 1043 /* The result is a literal string. Don't change wp->type. */ 1044 continue; 1045 } 1046 else if (c2 == '"' && !open_doublequote) 1047 { 1048 /* Bash builtin for internationalized string. */ 1049 lex_pos_ty pos; 1050 struct token string; 1051 1052 saw_opening_singlequote (); 1053 open_singlequote_terminator = '"'; 1054 pos.file_name = logical_file_name; 1055 pos.line_number = line_number; 1056 init_token (&string); 1057 for (;;) 1058 { 1059 c = phase2_getc (); 1060 if (c == EOF) 1061 break; 1062 if (c == '"') 1063 { 1064 saw_closing_singlequote (); 1065 break; 1066 } 1067 grow_token (&string); 1068 string.chars[string.charcount++] = (unsigned char) c; 1069 } 1070 remember_a_message (mlp, NULL, string_of_token (&string), 1071 context, &pos, savable_comment); 1072 free_token (&string); 1073 1074 error_with_progname = false; 1075 error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"), 1076 pos.file_name, (unsigned long) pos.line_number); 1077 error_with_progname = true; 1078 1079 /* The result at runtime is not constant. Therefore we 1080 change wp->type. */ 1081 } 1082 else 1083 phase2_ungetc (c2); 1084 } 1085 wp->type = t_other; 1086 continue; 1087 } 1088 1089 if (c == '\'') 1090 { 1091 if (!open_singlequote) 1092 { 1093 /* Handle an opening single quote. */ 1094 saw_opening_singlequote (); 1095 } 1096 else 1097 { 1098 /* Handle a closing single quote. */ 1099 saw_closing_singlequote (); 1100 } 1101 continue; 1102 } 1103 1104 if (c == '"') 1105 { 1106 if (open_singlequote && open_singlequote_terminator == '"') 1107 { 1108 /* Handle a closing i18n quote. */ 1109 saw_closing_singlequote (); 1110 } 1111 else if (!open_doublequote) 1112 { 1113 /* Handle an opening double quote. */ 1114 saw_opening_doublequote (); 1115 } 1116 else 1117 { 1118 /* Handle a closing double quote. */ 1119 saw_closing_doublequote (); 1120 } 1121 continue; 1122 } 1123 1124 if (c == OPENING_BACKQUOTE) 1125 { 1126 /* Handle an opening backquote. */ 1127 saw_opening_backquote (); 1128 1129 read_command_list (CLOSING_BACKQUOTE, context); 1130 1131 wp->type = t_other; 1132 continue; 1133 } 1134 if (c == CLOSING_BACKQUOTE) 1135 break; 1136 1137 if (c == '<' || c == '>') 1138 { 1139 int c2; 1140 1141 /* An unquoted c indicates we are not inside '...' nor "...". */ 1142 if (open_singlequote || open_doublequote) 1143 abort (); 1144 1145 c2 = phase2_getc (); 1146 if (c2 == '(') 1147 { 1148 /* Process substitution (Bash syntax). */ 1149 read_command_list (')', context); 1150 1151 wp->type = t_other; 1152 continue; 1153 } 1154 else 1155 phase2_ungetc (c2); 1156 } 1157 1158 if (!open_singlequote && !open_doublequote 1159 && (is_whitespace (c) || is_operator_start (c))) 1160 break; 1161 1162 if (wp->type == t_string) 1163 { 1164 grow_token (wp->token); 1165 wp->token->chars[wp->token->charcount++] = (unsigned char) c; 1166 } 1167 } 1168 1169 phase2_ungetc (c); 1170 1171 if (wp->type != t_string) 1172 { 1173 free_token (wp->token); 1174 free (wp->token); 1175 } 1176 last_non_comment_line = line_number; 1177} 1178 1179 1180/* Read the next command. 1181 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 1182 or '\0'. 1183 Returns the type of the word that terminated the command. */ 1184static enum word_type 1185read_command (int looking_for, flag_context_ty outer_context) 1186{ 1187 /* Read the words that make up the command. 1188 Here we completely ignore field splitting at whitespace and wildcard 1189 expansions; i.e. we assume that the source is written in such a way that 1190 every word in the program determines exactly one word in the resulting 1191 command. 1192 But we do not require that the 'gettext'/'ngettext' command is the 1193 first in the command; this is because 1. we want to allow for prefixes 1194 like "$verbose" that may expand to nothing, and 2. it's a big effort 1195 to know where a command starts in a $(for ...) or $(case ...) compound 1196 command. */ 1197 int arg = 0; /* Current argument number. */ 1198 bool arg_of_redirect = false; /* True right after a redirection operator. */ 1199 flag_context_list_iterator_ty context_iter; 1200 const struct callshapes *shapes = NULL; 1201 struct arglist_parser *argparser = NULL; 1202 1203 for (;;) 1204 { 1205 struct word inner; 1206 flag_context_ty inner_context; 1207 1208 if (arg == 0) 1209 inner_context = null_context; 1210 else 1211 inner_context = 1212 inherited_context (outer_context, 1213 flag_context_list_iterator_advance ( 1214 &context_iter)); 1215 1216 read_word (&inner, looking_for, inner_context); 1217 1218 /* Recognize end of command. */ 1219 if (inner.type == t_separator 1220 || inner.type == t_backquote || inner.type == t_paren 1221 || inner.type == t_eof) 1222 { 1223 if (argparser != NULL) 1224 arglist_parser_done (argparser, arg); 1225 return inner.type; 1226 } 1227 1228 if (extract_all) 1229 { 1230 if (inner.type == t_string) 1231 { 1232 lex_pos_ty pos; 1233 1234 pos.file_name = logical_file_name; 1235 pos.line_number = inner.line_number_at_start; 1236 remember_a_message (mlp, NULL, string_of_word (&inner), 1237 inner_context, &pos, savable_comment); 1238 } 1239 } 1240 1241 if (arg_of_redirect) 1242 { 1243 /* Ignore arguments of redirection operators. */ 1244 arg_of_redirect = false; 1245 } 1246 else if (inner.type == t_redirect) 1247 { 1248 /* Ignore this word and the following one. */ 1249 arg_of_redirect = true; 1250 } 1251 else 1252 { 1253 if (argparser == NULL) 1254 { 1255 /* This is the function position. */ 1256 arg = 0; 1257 if (inner.type == t_string) 1258 { 1259 char *function_name = string_of_word (&inner); 1260 void *keyword_value; 1261 1262 if (hash_find_entry (&keywords, 1263 function_name, strlen (function_name), 1264 &keyword_value) 1265 == 0) 1266 shapes = (const struct callshapes *) keyword_value; 1267 1268 argparser = arglist_parser_alloc (mlp, shapes); 1269 1270 context_iter = 1271 flag_context_list_iterator ( 1272 flag_context_list_table_lookup ( 1273 flag_context_list_table, 1274 function_name, strlen (function_name))); 1275 1276 free (function_name); 1277 } 1278 else 1279 context_iter = null_context_list_iterator; 1280 } 1281 else 1282 { 1283 /* These are the argument positions. */ 1284 if (inner.type == t_string) 1285 arglist_parser_remember (argparser, arg, 1286 string_of_word (&inner), 1287 inner_context, 1288 logical_file_name, 1289 inner.line_number_at_start, 1290 savable_comment); 1291 1292 if (arglist_parser_decidedp (argparser, arg)) 1293 { 1294 /* Stop looking for arguments of the last function_name. */ 1295 /* FIXME: What about context_iter? */ 1296 arglist_parser_done (argparser, arg); 1297 shapes = NULL; 1298 argparser = NULL; 1299 } 1300 } 1301 1302 arg++; 1303 } 1304 1305 free_word (&inner); 1306 } 1307} 1308 1309 1310/* Read a list of commands. 1311 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 1312 or '\0'. 1313 Returns the type of the word that terminated the command list. */ 1314static enum word_type 1315read_command_list (int looking_for, flag_context_ty outer_context) 1316{ 1317 for (;;) 1318 { 1319 enum word_type terminator; 1320 1321 terminator = read_command (looking_for, outer_context); 1322 if (terminator != t_separator) 1323 return terminator; 1324 } 1325} 1326 1327 1328void 1329extract_sh (FILE *f, 1330 const char *real_filename, const char *logical_filename, 1331 flag_context_list_table_ty *flag_table, 1332 msgdomain_list_ty *mdlp) 1333{ 1334 mlp = mdlp->item[0]->messages; 1335 1336 fp = f; 1337 real_file_name = real_filename; 1338 logical_file_name = xstrdup (logical_filename); 1339 line_number = 1; 1340 1341 last_comment_line = -1; 1342 last_non_comment_line = -1; 1343 1344 nested_backquotes = 0; 1345 open_doublequotes_mask = 0; 1346 open_doublequote = false; 1347 open_singlequote = false; 1348 1349 flag_context_list_table = flag_table; 1350 1351 init_keywords (); 1352 1353 /* Eat tokens until eof is seen. */ 1354 read_command_list ('\0', null_context); 1355 1356 fp = NULL; 1357 real_file_name = NULL; 1358 logical_file_name = NULL; 1359 line_number = 0; 1360} 1361