1/* xgettext Perl backend. 2 Copyright (C) 2002-2007 Free Software Foundation, Inc. 3 4 This file was written by Guido Flohr <guido@imperia.net>, 2002-2003. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23/* Specification. */ 24#include "x-perl.h" 25 26#include <errno.h> 27#include <stdbool.h> 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31 32#include "message.h" 33#include "xgettext.h" 34#include "x-perl.h" 35#include "error.h" 36#include "error-progname.h" 37#include "xalloc.h" 38#include "po-charset.h" 39#include "unistr.h" 40#include "uniname.h" 41#include "gettext.h" 42 43#define _(s) gettext(s) 44 45/* The Perl syntax is defined in perlsyn.pod. Try the command 46 "man perlsyn" or "perldoc perlsyn". 47 Also, the syntax after the 'sub' keyword is specified in perlsub.pod. 48 Try the command "man perlsub" or "perldoc perlsub". */ 49 50#define DEBUG_PERL 0 51 52 53/* ====================== Keyword set customization. ====================== */ 54 55/* If true extract all strings. */ 56static bool extract_all = false; 57 58static hash_table keywords; 59static bool default_keywords = true; 60 61 62void 63x_perl_extract_all () 64{ 65 extract_all = true; 66} 67 68 69void 70x_perl_keyword (const char *name) 71{ 72 if (name == NULL) 73 default_keywords = false; 74 else 75 { 76 const char *end; 77 struct callshape shape; 78 const char *colon; 79 80 if (keywords.table == NULL) 81 hash_init (&keywords, 100); 82 83 split_keywordspec (name, &end, &shape); 84 85 /* The characters between name and end should form a valid C identifier. 86 A colon means an invalid parse in split_keywordspec(). */ 87 colon = strchr (name, ':'); 88 if (colon == NULL || colon >= end) 89 insert_keyword_callshape (&keywords, name, end - name, &shape); 90 } 91} 92 93/* Finish initializing the keywords hash table. 94 Called after argument processing, before each file is processed. */ 95static void 96init_keywords () 97{ 98 if (default_keywords) 99 { 100 /* When adding new keywords here, also update the documentation in 101 xgettext.texi! */ 102 x_perl_keyword ("gettext"); 103 x_perl_keyword ("%gettext"); 104 x_perl_keyword ("$gettext"); 105 x_perl_keyword ("dgettext:2"); 106 x_perl_keyword ("dcgettext:2"); 107 x_perl_keyword ("ngettext:1,2"); 108 x_perl_keyword ("dngettext:2,3"); 109 x_perl_keyword ("dcngettext:2,3"); 110 x_perl_keyword ("gettext_noop"); 111#if 0 112 x_perl_keyword ("__"); 113 x_perl_keyword ("$__"); 114 x_perl_keyword ("%__"); 115 x_perl_keyword ("__x"); 116 x_perl_keyword ("__n:1,2"); 117 x_perl_keyword ("__nx:1,2"); 118 x_perl_keyword ("__xn:1,2"); 119 x_perl_keyword ("N__"); 120#endif 121 default_keywords = false; 122 } 123} 124 125void 126init_flag_table_perl () 127{ 128 xgettext_record_flag ("gettext:1:pass-perl-format"); 129 xgettext_record_flag ("gettext:1:pass-perl-brace-format"); 130 xgettext_record_flag ("%gettext:1:pass-perl-format"); 131 xgettext_record_flag ("%gettext:1:pass-perl-brace-format"); 132 xgettext_record_flag ("$gettext:1:pass-perl-format"); 133 xgettext_record_flag ("$gettext:1:pass-perl-brace-format"); 134 xgettext_record_flag ("dgettext:2:pass-perl-format"); 135 xgettext_record_flag ("dgettext:2:pass-perl-brace-format"); 136 xgettext_record_flag ("dcgettext:2:pass-perl-format"); 137 xgettext_record_flag ("dcgettext:2:pass-perl-brace-format"); 138 xgettext_record_flag ("ngettext:1:pass-perl-format"); 139 xgettext_record_flag ("ngettext:2:pass-perl-format"); 140 xgettext_record_flag ("ngettext:1:pass-perl-brace-format"); 141 xgettext_record_flag ("ngettext:2:pass-perl-brace-format"); 142 xgettext_record_flag ("dngettext:2:pass-perl-format"); 143 xgettext_record_flag ("dngettext:3:pass-perl-format"); 144 xgettext_record_flag ("dngettext:2:pass-perl-brace-format"); 145 xgettext_record_flag ("dngettext:3:pass-perl-brace-format"); 146 xgettext_record_flag ("dcngettext:2:pass-perl-format"); 147 xgettext_record_flag ("dcngettext:3:pass-perl-format"); 148 xgettext_record_flag ("dcngettext:2:pass-perl-brace-format"); 149 xgettext_record_flag ("dcngettext:3:pass-perl-brace-format"); 150 xgettext_record_flag ("gettext_noop:1:pass-perl-format"); 151 xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format"); 152 xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */ 153 xgettext_record_flag ("sprintf:1:perl-format"); 154#if 0 155 xgettext_record_flag ("__:1:pass-perl-format"); 156 xgettext_record_flag ("__:1:pass-perl-brace-format"); 157 xgettext_record_flag ("%__:1:pass-perl-format"); 158 xgettext_record_flag ("%__:1:pass-perl-brace-format"); 159 xgettext_record_flag ("$__:1:pass-perl-format"); 160 xgettext_record_flag ("$__:1:pass-perl-brace-format"); 161 xgettext_record_flag ("__x:1:perl-brace-format"); 162 xgettext_record_flag ("__n:1:pass-perl-format"); 163 xgettext_record_flag ("__n:2:pass-perl-format"); 164 xgettext_record_flag ("__n:1:pass-perl-brace-format"); 165 xgettext_record_flag ("__n:2:pass-perl-brace-format"); 166 xgettext_record_flag ("__nx:1:perl-brace-format"); 167 xgettext_record_flag ("__nx:2:perl-brace-format"); 168 xgettext_record_flag ("__xn:1:perl-brace-format"); 169 xgettext_record_flag ("__xn:2:perl-brace-format"); 170 xgettext_record_flag ("N__:1:pass-perl-format"); 171 xgettext_record_flag ("N__:1:pass-perl-brace-format"); 172#endif 173} 174 175 176/* ======================== Reading of characters. ======================== */ 177 178/* Real filename, used in error messages about the input file. */ 179static const char *real_file_name; 180 181/* Logical filename and line number, used to label the extracted messages. */ 182static char *logical_file_name; 183static int line_number; 184 185/* The input file stream. */ 186static FILE *fp; 187 188/* The current line buffer. */ 189static char *linebuf; 190 191/* The size of the current line. */ 192static int linesize; 193 194/* The position in the current line. */ 195static int linepos; 196 197/* The size of the input buffer. */ 198static size_t linebuf_size; 199 200/* Number of lines eaten for here documents. */ 201static int here_eaten; 202 203/* Paranoia: EOF marker for __END__ or __DATA__. */ 204static bool end_of_file; 205 206 207/* 1. line_number handling. */ 208 209/* Returns the next character from the input stream or EOF. */ 210static int 211phase1_getc () 212{ 213 line_number += here_eaten; 214 here_eaten = 0; 215 216 if (end_of_file) 217 return EOF; 218 219 if (linepos >= linesize) 220 { 221 linesize = getline (&linebuf, &linebuf_size, fp); 222 223 if (linesize < 0) 224 { 225 if (ferror (fp)) 226 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 227 real_file_name); 228 end_of_file = true; 229 return EOF; 230 } 231 232 linepos = 0; 233 ++line_number; 234 235 /* Undosify. This is important for catching the end of <<EOF and 236 <<'EOF'. We could rely on stdio doing this for us but you 237 it is not uncommon to to come across Perl scripts with CRLF 238 newline conventions on systems that do not follow this 239 convention. */ 240 if (linesize >= 2 && linebuf[linesize - 1] == '\n' 241 && linebuf[linesize - 2] == '\r') 242 { 243 linebuf[linesize - 2] = '\n'; 244 linebuf[linesize - 1] = '\0'; 245 --linesize; 246 } 247 } 248 249 return linebuf[linepos++]; 250} 251 252/* Supports only one pushback character. */ 253static void 254phase1_ungetc (int c) 255{ 256 if (c != EOF) 257 { 258 if (linepos == 0) 259 /* Attempt to ungetc across line boundary. Shouldn't happen. 260 No two phase1_ungetc calls are permitted in a row. */ 261 abort (); 262 263 --linepos; 264 } 265} 266 267/* Read a here document and return its contents. 268 The delimiter is an UTF-8 encoded string; the resulting string is UTF-8 269 encoded as well. */ 270 271static char * 272get_here_document (const char *delimiter) 273{ 274 /* Accumulator for the entire here document, including a NUL byte 275 at the end. */ 276 static char *buffer; 277 static size_t bufmax = 0; 278 size_t bufpos = 0; 279 /* Current line being appended. */ 280 static char *my_linebuf = NULL; 281 static size_t my_linebuf_size = 0; 282 283 /* Allocate the initial buffer. Later on, bufmax > 0. */ 284 if (bufmax == 0) 285 { 286 buffer = XNMALLOC (1, char); 287 buffer[0] = '\0'; 288 bufmax = 1; 289 } 290 291 for (;;) 292 { 293 int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp); 294 char *my_line_utf8; 295 bool chomp; 296 297 if (read_bytes < 0) 298 { 299 if (ferror (fp)) 300 { 301 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 302 real_file_name); 303 } 304 else 305 { 306 error_with_progname = false; 307 error (EXIT_SUCCESS, 0, _("\ 308%s:%d: can't find string terminator \"%s\" anywhere before EOF"), 309 real_file_name, line_number, delimiter); 310 error_with_progname = true; 311 312 break; 313 } 314 } 315 316 ++here_eaten; 317 318 /* Convert to UTF-8. */ 319 my_line_utf8 = 320 from_current_source_encoding (my_linebuf, logical_file_name, 321 line_number + here_eaten); 322 if (my_line_utf8 != my_linebuf) 323 { 324 if (strlen (my_line_utf8) >= my_linebuf_size) 325 { 326 my_linebuf_size = strlen (my_line_utf8) + 1; 327 my_linebuf = xrealloc (my_linebuf, my_linebuf_size); 328 } 329 strcpy (my_linebuf, my_line_utf8); 330 free (my_line_utf8); 331 } 332 333 /* Undosify. This is important for catching the end of <<EOF and 334 <<'EOF'. We could rely on stdio doing this for us but you 335 it is not uncommon to to come across Perl scripts with CRLF 336 newline conventions on systems that do not follow this 337 convention. */ 338 if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n' 339 && my_linebuf[read_bytes - 2] == '\r') 340 { 341 my_linebuf[read_bytes - 2] = '\n'; 342 my_linebuf[read_bytes - 1] = '\0'; 343 --read_bytes; 344 } 345 346 /* Temporarily remove the trailing newline from my_linebuf. */ 347 chomp = false; 348 if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n') 349 { 350 chomp = true; 351 my_linebuf[read_bytes - 1] = '\0'; 352 } 353 354 /* See whether this line terminates the here document. */ 355 if (strcmp (my_linebuf, delimiter) == 0) 356 break; 357 358 /* Add back the trailing newline to my_linebuf. */ 359 if (chomp) 360 my_linebuf[read_bytes - 1] = '\n'; 361 362 /* Ensure room for read_bytes + 1 bytes. */ 363 if (bufpos + read_bytes >= bufmax) 364 { 365 do 366 bufmax = 2 * bufmax + 10; 367 while (bufpos + read_bytes >= bufmax); 368 buffer = xrealloc (buffer, bufmax); 369 } 370 /* Append this line to the accumulator. */ 371 strcpy (buffer + bufpos, my_linebuf); 372 bufpos += read_bytes; 373 } 374 375 /* Done accumulating the here document. */ 376 return xstrdup (buffer); 377} 378 379/* Skips pod sections. */ 380static void 381skip_pod () 382{ 383 line_number += here_eaten; 384 here_eaten = 0; 385 linepos = 0; 386 387 for (;;) 388 { 389 linesize = getline (&linebuf, &linebuf_size, fp); 390 391 if (linesize < 0) 392 { 393 if (ferror (fp)) 394 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 395 real_file_name); 396 return; 397 } 398 399 ++line_number; 400 401 if (strncmp ("=cut", linebuf, 4) == 0) 402 { 403 /* Force reading of a new line on next call to phase1_getc(). */ 404 linepos = linesize; 405 return; 406 } 407 } 408} 409 410 411/* These are for tracking whether comments count as immediately before 412 keyword. */ 413static int last_comment_line; 414static int last_non_comment_line; 415 416 417/* 2. Replace each comment that is not inside a string literal or regular 418 expression with a newline character. We need to remember the comment 419 for later, because it may be attached to a keyword string. */ 420 421static int 422phase2_getc () 423{ 424 static char *buffer; 425 static size_t bufmax; 426 size_t buflen; 427 int lineno; 428 int c; 429 char *utf8_string; 430 431 c = phase1_getc (); 432 if (c == '#') 433 { 434 buflen = 0; 435 lineno = line_number; 436 /* Skip leading whitespace. */ 437 for (;;) 438 { 439 c = phase1_getc (); 440 if (c == EOF) 441 break; 442 if (c != ' ' && c != '\t' && c != '\r' && c != '\f') 443 { 444 phase1_ungetc (c); 445 break; 446 } 447 } 448 /* Accumulate the comment. */ 449 for (;;) 450 { 451 c = phase1_getc (); 452 if (c == '\n' || c == EOF) 453 break; 454 if (buflen >= bufmax) 455 { 456 bufmax = 2 * bufmax + 10; 457 buffer = xrealloc (buffer, bufmax); 458 } 459 buffer[buflen++] = c; 460 } 461 if (buflen >= bufmax) 462 { 463 bufmax = 2 * bufmax + 10; 464 buffer = xrealloc (buffer, bufmax); 465 } 466 buffer[buflen] = '\0'; 467 /* Convert it to UTF-8. */ 468 utf8_string = 469 from_current_source_encoding (buffer, logical_file_name, lineno); 470 /* Save it until we encounter the corresponding string. */ 471 savable_comment_add (utf8_string); 472 last_comment_line = lineno; 473 } 474 return c; 475} 476 477/* Supports only one pushback character. */ 478static void 479phase2_ungetc (int c) 480{ 481 if (c != EOF) 482 phase1_ungetc (c); 483} 484 485/* Whitespace recognition. */ 486 487#define case_whitespace \ 488 case ' ': case '\t': case '\r': case '\n': case '\f' 489 490static inline bool 491is_whitespace (int c) 492{ 493 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f'); 494} 495 496 497/* ========================== Reading of tokens. ========================== */ 498 499 500enum token_type_ty 501{ 502 token_type_eof, 503 token_type_lparen, /* ( */ 504 token_type_rparen, /* ) */ 505 token_type_comma, /* , */ 506 token_type_fat_comma, /* => */ 507 token_type_dereference, /* , */ 508 token_type_semicolon, /* ; */ 509 token_type_lbrace, /* { */ 510 token_type_rbrace, /* } */ 511 token_type_lbracket, /* [ */ 512 token_type_rbracket, /* ] */ 513 token_type_string, /* quote-like */ 514 token_type_named_op, /* if, unless, while, ... */ 515 token_type_variable, /* $... */ 516 token_type_symbol, /* symbol, number */ 517 token_type_regex_op, /* s, tr, y, m. */ 518 token_type_dot, /* . */ 519 token_type_other, /* regexp, misc. operator */ 520 /* The following are not really token types, but variants used by 521 the parser. */ 522 token_type_keyword_symbol /* keyword symbol */ 523}; 524typedef enum token_type_ty token_type_ty; 525 526/* Subtypes for strings, important for interpolation. */ 527enum string_type_ty 528{ 529 string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'", 530 "tr/.../.../", "y/.../.../". */ 531 string_type_q, /* "'..'", "q/.../". */ 532 string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../", 533 "<file*glob>". */ 534 string_type_qr /* Not supported. */ 535}; 536 537/* Subtypes for symbols, important for dollar interpretation. */ 538enum symbol_type_ty 539{ 540 symbol_type_none, /* Nothing special. */ 541 symbol_type_sub, /* 'sub'. */ 542 symbol_type_function /* Function name after 'sub'. */ 543}; 544 545typedef struct token_ty token_ty; 546struct token_ty 547{ 548 token_type_ty type; 549 int sub_type; /* for token_type_string, token_type_symbol */ 550 char *string; /* for: in encoding: 551 token_type_named_op ASCII 552 token_type_string UTF-8 553 token_type_symbol ASCII 554 token_type_variable global_source_encoding 555 */ 556 refcounted_string_list_ty *comment; /* for token_type_string */ 557 int line_number; 558}; 559 560#if DEBUG_PERL 561static const char * 562token2string (const token_ty *token) 563{ 564 switch (token->type) 565 { 566 case token_type_eof: 567 return "token_type_eof"; 568 case token_type_lparen: 569 return "token_type_lparen"; 570 case token_type_rparen: 571 return "token_type_rparen"; 572 case token_type_comma: 573 return "token_type_comma"; 574 case token_type_fat_comma: 575 return "token_type_fat_comma"; 576 case token_type_dereference: 577 return "token_type_dereference"; 578 case token_type_semicolon: 579 return "token_type_semicolon"; 580 case token_type_lbrace: 581 return "token_type_lbrace"; 582 case token_type_rbrace: 583 return "token_type_rbrace"; 584 case token_type_lbracket: 585 return "token_type_lbracket"; 586 case token_type_rbracket: 587 return "token_type_rbracket"; 588 case token_type_string: 589 return "token_type_string"; 590 case token_type_named_op: 591 return "token_type_named_op"; 592 case token_type_variable: 593 return "token_type_variable"; 594 case token_type_symbol: 595 return "token_type_symbol"; 596 case token_type_regex_op: 597 return "token_type_regex_op"; 598 case token_type_dot: 599 return "token_type_dot"; 600 case token_type_other: 601 return "token_type_other"; 602 default: 603 return "unknown"; 604 } 605} 606#endif 607 608/* Free the memory pointed to by a 'struct token_ty'. */ 609static inline void 610free_token (token_ty *tp) 611{ 612 switch (tp->type) 613 { 614 case token_type_named_op: 615 case token_type_string: 616 case token_type_symbol: 617 case token_type_variable: 618 free (tp->string); 619 break; 620 default: 621 break; 622 } 623 if (tp->type == token_type_string) 624 drop_reference (tp->comment); 625 free (tp); 626} 627 628/* Pass 1 of extracting quotes: Find the end of the string, regardless 629 of the semantics of the construct. Return the complete string, 630 including the starting and the trailing delimiter, with backslashes 631 removed where appropriate. */ 632static char * 633extract_quotelike_pass1 (int delim) 634{ 635 /* This function is called recursively. No way to allocate stuff 636 statically. Also alloca() is inappropriate due to limited stack 637 size on some platforms. So we use malloc(). */ 638 int bufmax = 10; 639 char *buffer = XNMALLOC (bufmax, char); 640 int bufpos = 0; 641 bool nested = true; 642 int counter_delim; 643 644 buffer[bufpos++] = delim; 645 646 /* Find the closing delimiter. */ 647 switch (delim) 648 { 649 case '(': 650 counter_delim = ')'; 651 break; 652 case '{': 653 counter_delim = '}'; 654 break; 655 case '[': 656 counter_delim = ']'; 657 break; 658 case '<': 659 counter_delim = '>'; 660 break; 661 default: /* "..." or '...' or |...| etc. */ 662 nested = false; 663 counter_delim = delim; 664 break; 665 } 666 667 for (;;) 668 { 669 int c = phase1_getc (); 670 671 /* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */ 672 if (bufpos + 2 > bufmax) 673 { 674 bufmax = 2 * bufmax + 10; 675 buffer = xrealloc (buffer, bufmax); 676 } 677 678 if (c == counter_delim || c == EOF) 679 { 680 buffer[bufpos++] = counter_delim; /* will be stripped off later */ 681 buffer[bufpos++] = '\0'; 682#if DEBUG_PERL 683 fprintf (stderr, "PASS1: %s\n", buffer); 684#endif 685 return buffer; 686 } 687 688 if (nested && c == delim) 689 { 690 char *inner = extract_quotelike_pass1 (delim); 691 size_t len = strlen (inner); 692 693 /* Ensure room for len + 1 bytes. */ 694 if (bufpos + len >= bufmax) 695 { 696 do 697 bufmax = 2 * bufmax + 10; 698 while (bufpos + len >= bufmax); 699 buffer = xrealloc (buffer, bufmax); 700 } 701 strcpy (buffer + bufpos, inner); 702 free (inner); 703 bufpos += len; 704 } 705 else if (c == '\\') 706 { 707 c = phase1_getc (); 708 if (c == '\\') 709 { 710 buffer[bufpos++] = '\\'; 711 buffer[bufpos++] = '\\'; 712 } 713 else if (c == delim || c == counter_delim) 714 { 715 /* This is pass2 in Perl. */ 716 buffer[bufpos++] = c; 717 } 718 else 719 { 720 buffer[bufpos++] = '\\'; 721 phase1_ungetc (c); 722 } 723 } 724 else 725 { 726 buffer[bufpos++] = c; 727 } 728 } 729} 730 731/* Like extract_quotelike_pass1, but return the complete string in UTF-8 732 encoding. */ 733static char * 734extract_quotelike_pass1_utf8 (int delim) 735{ 736 char *string = extract_quotelike_pass1 (delim); 737 char *utf8_string = 738 from_current_source_encoding (string, logical_file_name, line_number); 739 if (utf8_string != string) 740 free (string); 741 return utf8_string; 742} 743 744 745/* ========= Reading of tokens and commands. Extracting strings. ========= */ 746 747 748/* There is an ambiguity about '/': It can start a division operator ('/' or 749 '/=') or it can start a regular expression. The distinction is important 750 because inside regular expressions, '#' loses its special meaning. 751 The distinction is possible depending on the parsing state: After a 752 variable or simple expression, it's a division operator; at the beginning 753 of an expression, it's a regexp. */ 754static bool prefer_division_over_regexp; 755 756/* Context lookup table. */ 757static flag_context_list_table_ty *flag_context_list_table; 758 759 760/* Forward declaration of local functions. */ 761static void interpolate_keywords (message_list_ty *mlp, const char *string, 762 int lineno); 763static token_ty *x_perl_lex (message_list_ty *mlp); 764static void x_perl_unlex (token_ty *tp); 765static bool extract_balanced (message_list_ty *mlp, 766 token_type_ty delim, bool eat_delim, 767 bool comma_delim, 768 flag_context_ty outer_context, 769 flag_context_list_iterator_ty context_iter, 770 int arg, struct arglist_parser *argparser); 771 772 773/* Extract an unsigned hexadecimal number from STRING, considering at 774 most LEN bytes and place the result in *RESULT. Returns a pointer 775 to the first character past the hexadecimal number. */ 776static const char * 777extract_hex (const char *string, size_t len, unsigned int *result) 778{ 779 size_t i; 780 781 *result = 0; 782 783 for (i = 0; i < len; i++) 784 { 785 char c = string[i]; 786 int number; 787 788 if (c >= 'A' && c <= 'F') 789 number = c - 'A' + 10; 790 else if (c >= 'a' && c <= 'f') 791 number = c - 'a' + 10; 792 else if (c >= '0' && c <= '9') 793 number = c - '0'; 794 else 795 break; 796 797 *result <<= 4; 798 *result |= number; 799 } 800 801 return string + i; 802} 803 804/* Extract an unsigned octal number from STRING, considering at 805 most LEN bytes and place the result in *RESULT. Returns a pointer 806 to the first character past the octal number. */ 807static const char * 808extract_oct (const char *string, size_t len, unsigned int *result) 809{ 810 size_t i; 811 812 *result = 0; 813 814 for (i = 0; i < len; i++) 815 { 816 char c = string[i]; 817 int number; 818 819 if (c >= '0' && c <= '7') 820 number = c - '0'; 821 else 822 break; 823 824 *result <<= 3; 825 *result |= number; 826 } 827 828 return string + i; 829} 830 831/* Extract the various quotelike constructs except for <<EOF. See the 832 section "Gory details of parsing quoted constructs" in perlop.pod. 833 Return the resulting token in *tp; tp->type == token_type_string. */ 834static void 835extract_quotelike (token_ty *tp, int delim) 836{ 837 char *string = extract_quotelike_pass1_utf8 (delim); 838 size_t len = strlen (string); 839 840 tp->type = token_type_string; 841 /* Take the string without the delimiters at the start and at the end. */ 842 if (!(len >= 2)) 843 abort (); 844 string[len - 1] = '\0'; 845 tp->string = xstrdup (string + 1); 846 free (string); 847 tp->comment = add_reference (savable_comment); 848} 849 850/* Extract the quotelike constructs with double delimiters, like 851 s/[SEARCH]/[REPLACE]/. This function does not eat up trailing 852 modifiers (left to the caller). 853 Return the resulting token in *tp; tp->type == token_type_regex_op. */ 854static void 855extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim, 856 bool interpolate) 857{ 858 char *string; 859 860 tp->type = token_type_regex_op; 861 862 string = extract_quotelike_pass1_utf8 (delim); 863 if (interpolate) 864 interpolate_keywords (mlp, string, line_number); 865 free (string); 866 867 if (delim == '(' || delim == '<' || delim == '{' || delim == '[') 868 { 869 /* The delimiter for the second string can be different, e.g. 870 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */ 871 delim = phase1_getc (); 872 while (is_whitespace (delim)) 873 { 874 /* The hash-sign is not a valid delimiter after whitespace, ergo 875 use phase2_getc() and not phase1_getc() now. */ 876 delim = phase2_getc (); 877 } 878 } 879 string = extract_quotelike_pass1_utf8 (delim); 880 if (interpolate) 881 interpolate_keywords (mlp, string, line_number); 882 free (string); 883} 884 885/* Perform pass 3 of quotelike extraction (interpolation). 886 *tp is a token of type token_type_string. 887 This function replaces tp->string. 888 This function does not access tp->comment. */ 889/* FIXME: Currently may writes null-bytes into the string. */ 890static void 891extract_quotelike_pass3 (token_ty *tp, int error_level) 892{ 893 static char *buffer; 894 static int bufmax = 0; 895 int bufpos = 0; 896 const char *crs; 897 bool uppercase; 898 bool lowercase; 899 bool quotemeta; 900 901#if DEBUG_PERL 902 switch (tp->sub_type) 903 { 904 case string_type_verbatim: 905 fprintf (stderr, "Interpolating string_type_verbatim:\n"); 906 break; 907 case string_type_q: 908 fprintf (stderr, "Interpolating string_type_q:\n"); 909 break; 910 case string_type_qq: 911 fprintf (stderr, "Interpolating string_type_qq:\n"); 912 break; 913 case string_type_qr: 914 fprintf (stderr, "Interpolating string_type_qr:\n"); 915 break; 916 } 917 fprintf (stderr, "%s\n", tp->string); 918 if (tp->sub_type == string_type_verbatim) 919 fprintf (stderr, "---> %s\n", tp->string); 920#endif 921 922 if (tp->sub_type == string_type_verbatim) 923 return; 924 925 /* Loop over tp->string, accumulating the expansion in buffer. */ 926 crs = tp->string; 927 uppercase = false; 928 lowercase = false; 929 quotemeta = false; 930 while (*crs) 931 { 932 bool backslashed; 933 934 /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash 935 if \Q modifier is present. */ 936 if (bufpos + 7 > bufmax) 937 { 938 bufmax = 2 * bufmax + 10; 939 buffer = xrealloc (buffer, bufmax); 940 } 941 942 if (tp->sub_type == string_type_q) 943 { 944 switch (*crs) 945 { 946 case '\\': 947 if (crs[1] == '\\') 948 { 949 crs += 2; 950 buffer[bufpos++] = '\\'; 951 break; 952 } 953 /* FALLTHROUGH */ 954 default: 955 buffer[bufpos++] = *crs++; 956 break; 957 } 958 continue; 959 } 960 961 /* We only get here for double-quoted strings or regular expressions. 962 Unescape escape sequences. */ 963 if (*crs == '\\') 964 { 965 switch (crs[1]) 966 { 967 case 't': 968 crs += 2; 969 buffer[bufpos++] = '\t'; 970 continue; 971 case 'n': 972 crs += 2; 973 buffer[bufpos++] = '\n'; 974 continue; 975 case 'r': 976 crs += 2; 977 buffer[bufpos++] = '\r'; 978 continue; 979 case 'f': 980 crs += 2; 981 buffer[bufpos++] = '\f'; 982 continue; 983 case 'b': 984 crs += 2; 985 buffer[bufpos++] = '\b'; 986 continue; 987 case 'a': 988 crs += 2; 989 buffer[bufpos++] = '\a'; 990 continue; 991 case 'e': 992 crs += 2; 993 buffer[bufpos++] = 0x1b; 994 continue; 995 case '0': case '1': case '2': case '3': 996 case '4': case '5': case '6': case '7': 997 { 998 unsigned int oct_number; 999 int length; 1000 1001 crs = extract_oct (crs + 1, 3, &oct_number); 1002 1003 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is 1004 true, the character should be converted to its uppercase 1005 resp. lowercase equivalent. I don't know if the necessary 1006 facilities are already included in gettext. For US-Ascii 1007 the conversion can be already be done, however. */ 1008 if (uppercase && oct_number >= 'a' && oct_number <= 'z') 1009 { 1010 oct_number = oct_number - 'a' + 'A'; 1011 } 1012 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z') 1013 { 1014 oct_number = oct_number - 'A' + 'a'; 1015 } 1016 1017 1018 /* Yes, octal escape sequences in the range 0x100..0x1ff are 1019 valid. */ 1020 length = u8_uctomb ((unsigned char *) (buffer + bufpos), 1021 oct_number, 2); 1022 if (length > 0) 1023 bufpos += length; 1024 } 1025 continue; 1026 case 'x': 1027 { 1028 unsigned int hex_number = 0; 1029 int length; 1030 1031 crs += 2; 1032 if (*crs == '{') 1033 { 1034 const char *end = strchr (crs, '}'); 1035 if (end == NULL) 1036 { 1037 error_with_progname = false; 1038 error (error_level, 0, _("\ 1039%s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number); 1040 error_with_progname = true; 1041 ++crs; 1042 continue; 1043 } 1044 else 1045 { 1046 ++crs; 1047 (void) extract_hex (crs, end - crs, &hex_number); 1048 crs = end + 1; 1049 } 1050 } 1051 else 1052 { 1053 crs = extract_hex (crs, 2, &hex_number); 1054 } 1055 1056 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is 1057 true, the character should be converted to its uppercase 1058 resp. lowercase equivalent. I don't know if the necessary 1059 facilities are already included in gettext. For US-Ascii 1060 the conversion can be already be done, however. */ 1061 if (uppercase && hex_number >= 'a' && hex_number <= 'z') 1062 { 1063 hex_number = hex_number - 'a' + 'A'; 1064 } 1065 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z') 1066 { 1067 hex_number = hex_number - 'A' + 'a'; 1068 } 1069 1070 length = u8_uctomb ((unsigned char *) (buffer + bufpos), 1071 hex_number, 6); 1072 1073 if (length > 0) 1074 bufpos += length; 1075 } 1076 continue; 1077 case 'c': 1078 /* Perl's notion of control characters. */ 1079 crs += 2; 1080 if (*crs) 1081 { 1082 int the_char = (unsigned char) *crs; 1083 if (the_char >= 'a' || the_char <= 'z') 1084 the_char = the_char - 'a' + 'A'; 1085 buffer[bufpos++] = the_char ^ 0x40; 1086 } 1087 continue; 1088 case 'N': 1089 crs += 2; 1090 if (*crs == '{') 1091 { 1092 const char *end = strchr (crs + 1, '}'); 1093 if (end != NULL) 1094 { 1095 char *name; 1096 unsigned int unicode; 1097 1098 name = XNMALLOC (end - (crs + 1) + 1, char); 1099 memcpy (name, crs + 1, end - (crs + 1)); 1100 name[end - (crs + 1)] = '\0'; 1101 1102 unicode = unicode_name_character (name); 1103 if (unicode != UNINAME_INVALID) 1104 { 1105 /* FIXME: Convert to upper/lowercase if the 1106 corresponding flag is set to true. */ 1107 int length = 1108 u8_uctomb ((unsigned char *) (buffer + bufpos), 1109 unicode, 6); 1110 if (length > 0) 1111 bufpos += length; 1112 } 1113 1114 free (name); 1115 1116 crs = end + 1; 1117 } 1118 } 1119 continue; 1120 } 1121 } 1122 1123 /* No escape sequence, go on. */ 1124 if (*crs == '\\') 1125 { 1126 ++crs; 1127 switch (*crs) 1128 { 1129 case 'E': 1130 uppercase = false; 1131 lowercase = false; 1132 quotemeta = false; 1133 ++crs; 1134 continue; 1135 case 'L': 1136 uppercase = false; 1137 lowercase = true; 1138 ++crs; 1139 continue; 1140 case 'U': 1141 uppercase = true; 1142 lowercase = false; 1143 ++crs; 1144 continue; 1145 case 'Q': 1146 quotemeta = true; 1147 ++crs; 1148 continue; 1149 case 'l': 1150 ++crs; 1151 if (*crs >= 'A' && *crs <= 'Z') 1152 { 1153 buffer[bufpos++] = *crs - 'A' + 'a'; 1154 } 1155 else if ((unsigned char) *crs >= 0x80) 1156 { 1157 error_with_progname = false; 1158 error (error_level, 0, _("\ 1159%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""), 1160 real_file_name, line_number, *crs); 1161 error_with_progname = true; 1162 } 1163 else 1164 { 1165 buffer[bufpos++] = *crs; 1166 } 1167 ++crs; 1168 continue; 1169 case 'u': 1170 ++crs; 1171 if (*crs >= 'a' && *crs <= 'z') 1172 { 1173 buffer[bufpos++] = *crs - 'a' + 'A'; 1174 } 1175 else if ((unsigned char) *crs >= 0x80) 1176 { 1177 error_with_progname = false; 1178 error (error_level, 0, _("\ 1179%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""), 1180 real_file_name, line_number, *crs); 1181 error_with_progname = true; 1182 } 1183 else 1184 { 1185 buffer[bufpos++] = *crs; 1186 } 1187 ++crs; 1188 continue; 1189 case '\\': 1190 buffer[bufpos++] = *crs; 1191 ++crs; 1192 continue; 1193 default: 1194 backslashed = true; 1195 break; 1196 } 1197 } 1198 else 1199 backslashed = false; 1200 1201 if (quotemeta 1202 && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z') 1203 || (*crs >= '0' && *crs <= '9') || *crs == '_')) 1204 { 1205 buffer[bufpos++] = '\\'; 1206 backslashed = true; 1207 } 1208 1209 if (!backslashed && !extract_all && (*crs == '$' || *crs == '@')) 1210 { 1211 error_with_progname = false; 1212 error (error_level, 0, _("\ 1213%s:%d: invalid variable interpolation at \"%c\""), 1214 real_file_name, line_number, *crs); 1215 error_with_progname = true; 1216 ++crs; 1217 } 1218 else if (lowercase) 1219 { 1220 if (*crs >= 'A' && *crs <= 'Z') 1221 buffer[bufpos++] = *crs - 'A' + 'a'; 1222 else if ((unsigned char) *crs >= 0x80) 1223 { 1224 error_with_progname = false; 1225 error (error_level, 0, _("\ 1226%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""), 1227 real_file_name, line_number, *crs); 1228 error_with_progname = true; 1229 buffer[bufpos++] = *crs; 1230 } 1231 else 1232 buffer[bufpos++] = *crs; 1233 ++crs; 1234 } 1235 else if (uppercase) 1236 { 1237 if (*crs >= 'a' && *crs <= 'z') 1238 buffer[bufpos++] = *crs - 'a' + 'A'; 1239 else if ((unsigned char) *crs >= 0x80) 1240 { 1241 error_with_progname = false; 1242 error (error_level, 0, _("\ 1243%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""), 1244 real_file_name, line_number, *crs); 1245 error_with_progname = true; 1246 buffer[bufpos++] = *crs; 1247 } 1248 else 1249 buffer[bufpos++] = *crs; 1250 ++crs; 1251 } 1252 else 1253 { 1254 buffer[bufpos++] = *crs++; 1255 } 1256 } 1257 1258 /* Ensure room for 1 more byte. */ 1259 if (bufpos >= bufmax) 1260 { 1261 bufmax = 2 * bufmax + 10; 1262 buffer = xrealloc (buffer, bufmax); 1263 } 1264 1265 buffer[bufpos++] = '\0'; 1266 1267#if DEBUG_PERL 1268 fprintf (stderr, "---> %s\n", buffer); 1269#endif 1270 1271 /* Replace tp->string. */ 1272 free (tp->string); 1273 tp->string = xstrdup (buffer); 1274} 1275 1276/* Parse a variable. This is done in several steps: 1277 1) Consume all leading occurencies of '$', '@', '%', and '*'. 1278 2) Determine the name of the variable from the following input. 1279 3) Parse possible following hash keys or array indexes. 1280 */ 1281static void 1282extract_variable (message_list_ty *mlp, token_ty *tp, int first) 1283{ 1284 static char *buffer; 1285 static int bufmax = 0; 1286 int bufpos = 0; 1287 int c = first; 1288 size_t varbody_length = 0; 1289 bool maybe_hash_deref = false; 1290 bool maybe_hash_value = false; 1291 1292 tp->type = token_type_variable; 1293 1294#if DEBUG_PERL 1295 fprintf (stderr, "%s:%d: extracting variable type '%c'\n", 1296 real_file_name, line_number, first); 1297#endif 1298 1299 /* 1300 * 1) Consume dollars and so on (not euros ...). Unconditionally 1301 * accepting the hash sign (#) will maybe lead to inaccurate 1302 * results. FIXME! 1303 */ 1304 while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%') 1305 { 1306 if (bufpos >= bufmax) 1307 { 1308 bufmax = 2 * bufmax + 10; 1309 buffer = xrealloc (buffer, bufmax); 1310 } 1311 buffer[bufpos++] = c; 1312 c = phase1_getc (); 1313 } 1314 1315 if (c == EOF) 1316 { 1317 tp->type = token_type_eof; 1318 return; 1319 } 1320 1321 /* Hash references are treated in a special way, when looking for 1322 our keywords. */ 1323 if (buffer[0] == '$') 1324 { 1325 if (bufpos == 1) 1326 maybe_hash_value = true; 1327 else if (bufpos == 2 && buffer[1] == '$') 1328 { 1329 if (!(c == '{' 1330 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 1331 || (c >= '0' && c <= '9') 1332 || c == '_' || c == ':' || c == '\'' || c >= 0x80)) 1333 { 1334 /* Special variable $$ for pid. */ 1335 if (bufpos >= bufmax) 1336 { 1337 bufmax = 2 * bufmax + 10; 1338 buffer = xrealloc (buffer, bufmax); 1339 } 1340 buffer[bufpos++] = '\0'; 1341 tp->string = xstrdup (buffer); 1342#if DEBUG_PERL 1343 fprintf (stderr, "%s:%d: is PID ($$)\n", 1344 real_file_name, line_number); 1345#endif 1346 1347 phase1_ungetc (c); 1348 return; 1349 } 1350 1351 maybe_hash_deref = true; 1352 bufpos = 1; 1353 } 1354 } 1355 1356 /* 1357 * 2) Get the name of the variable. The first character is practically 1358 * arbitrary. Punctuation and numbers automagically put a variable 1359 * in the global namespace but that subtle difference is not interesting 1360 * for us. 1361 */ 1362 if (bufpos >= bufmax) 1363 { 1364 bufmax = 2 * bufmax + 10; 1365 buffer = xrealloc (buffer, bufmax); 1366 } 1367 if (c == '{') 1368 { 1369 /* Yuck, we cannot accept ${gettext} as a keyword... Except for 1370 * debugging purposes it is also harmless, that we suppress the 1371 * real name of the variable. 1372 */ 1373#if DEBUG_PERL 1374 fprintf (stderr, "%s:%d: braced {variable_name}\n", 1375 real_file_name, line_number); 1376#endif 1377 1378 if (extract_balanced (mlp, token_type_rbrace, true, false, 1379 null_context, null_context_list_iterator, 1380 1, arglist_parser_alloc (mlp, NULL))) 1381 return; 1382 buffer[bufpos++] = c; 1383 } 1384 else 1385 { 1386 while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 1387 || (c >= '0' && c <= '9') 1388 || c == '_' || c == ':' || c == '\'' || c >= 0x80) 1389 { 1390 ++varbody_length; 1391 if (bufpos >= bufmax) 1392 { 1393 bufmax = 2 * bufmax + 10; 1394 buffer = xrealloc (buffer, bufmax); 1395 } 1396 buffer[bufpos++] = c; 1397 c = phase1_getc (); 1398 } 1399 phase1_ungetc (c); 1400 } 1401 1402 /* Probably some strange Perl variable like $`. */ 1403 if (varbody_length == 0) 1404 { 1405 c = phase1_getc (); 1406 if (c == EOF || is_whitespace (c)) 1407 phase1_ungetc (c); /* Loser. */ 1408 else 1409 { 1410 if (bufpos >= bufmax) 1411 { 1412 bufmax = 2 * bufmax + 10; 1413 buffer = xrealloc (buffer, bufmax); 1414 } 1415 buffer[bufpos++] = c; 1416 } 1417 } 1418 1419 if (bufpos >= bufmax) 1420 { 1421 bufmax = 2 * bufmax + 10; 1422 buffer = xrealloc (buffer, bufmax); 1423 } 1424 buffer[bufpos++] = '\0'; 1425 1426 tp->string = xstrdup (buffer); 1427 1428#if DEBUG_PERL 1429 fprintf (stderr, "%s:%d: complete variable name: %s\n", 1430 real_file_name, line_number, tp->string); 1431#endif 1432 1433 prefer_division_over_regexp = true; 1434 1435 /* 1436 * 3) If the following looks strange to you, this is valid Perl syntax: 1437 * 1438 * $var = $$hashref # We can place a 1439 * # comment here and then ... 1440 * {key_into_hashref}; 1441 * 1442 * POD sections are not allowed but we leave complaints about 1443 * that to the compiler/interpreter. 1444 */ 1445 /* We only extract strings from the first hash key (if present). */ 1446 1447 if (maybe_hash_deref || maybe_hash_value) 1448 { 1449 bool is_dereference = false; 1450 int c; 1451 1452 do 1453 c = phase2_getc (); 1454 while (is_whitespace (c)); 1455 1456 if (c == '-') 1457 { 1458 int c2 = phase1_getc (); 1459 1460 if (c2 == '>') 1461 { 1462 is_dereference = true; 1463 1464 do 1465 c = phase2_getc (); 1466 while (is_whitespace (c)); 1467 } 1468 else if (c2 != '\n') 1469 { 1470 /* Discarding the newline is harmless here. The only 1471 special character recognized after a minus is greater-than 1472 for dereference. However, the sequence "-\n>" that we 1473 treat incorrectly here, is a syntax error. */ 1474 phase1_ungetc (c2); 1475 } 1476 } 1477 1478 if (maybe_hash_value && is_dereference) 1479 { 1480#if DEBUG_PERL 1481 fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n", 1482 real_file_name, line_number); 1483#endif 1484 } 1485 else if (maybe_hash_value) 1486 { 1487 /* Fake it into a hash. */ 1488 tp->string[0] = '%'; 1489 } 1490 1491 /* Do NOT change that into else if (see above). */ 1492 if ((maybe_hash_value || maybe_hash_deref) && c == '{') 1493 { 1494 void *keyword_value; 1495 1496#if DEBUG_PERL 1497 fprintf (stderr, "%s:%d: first keys preceded by '{'\n", 1498 real_file_name, line_number); 1499#endif 1500 1501 if (hash_find_entry (&keywords, tp->string, strlen (tp->string), 1502 &keyword_value) == 0) 1503 { 1504 /* TODO: Shouldn't we use the shapes of the keyword, instead 1505 of hardwiring argnum1 = 1 ? 1506 const struct callshapes *shapes = 1507 (const struct callshapes *) keyword_value; 1508 */ 1509 struct callshapes shapes; 1510 shapes.keyword = tp->string; /* XXX storage duration? */ 1511 shapes.keyword_len = strlen (tp->string); 1512 shapes.nshapes = 1; 1513 shapes.shapes[0].argnum1 = 1; 1514 shapes.shapes[0].argnum2 = 0; 1515 shapes.shapes[0].argnumc = 0; 1516 shapes.shapes[0].argnum1_glib_context = false; 1517 shapes.shapes[0].argnum2_glib_context = false; 1518 shapes.shapes[0].argtotal = 0; 1519 string_list_init (&shapes.shapes[0].xcomments); 1520 1521 { 1522 /* Extract a possible string from the key. Before proceeding 1523 we check whether the open curly is followed by a symbol and 1524 then by a right curly. */ 1525 flag_context_list_iterator_ty context_iter = 1526 flag_context_list_iterator ( 1527 flag_context_list_table_lookup ( 1528 flag_context_list_table, 1529 tp->string, strlen (tp->string))); 1530 token_ty *t1 = x_perl_lex (mlp); 1531 1532#if DEBUG_PERL 1533 fprintf (stderr, "%s:%d: extracting string key\n", 1534 real_file_name, line_number); 1535#endif 1536 1537 if (t1->type == token_type_symbol 1538 || t1->type == token_type_named_op) 1539 { 1540 token_ty *t2 = x_perl_lex (mlp); 1541 if (t2->type == token_type_rbrace) 1542 { 1543 flag_context_ty context; 1544 lex_pos_ty pos; 1545 1546 context = 1547 inherited_context (null_context, 1548 flag_context_list_iterator_advance ( 1549 &context_iter)); 1550 1551 pos.line_number = line_number; 1552 pos.file_name = logical_file_name; 1553 1554 xgettext_current_source_encoding = po_charset_utf8; 1555 remember_a_message (mlp, NULL, xstrdup (t1->string), 1556 context, &pos, savable_comment); 1557 xgettext_current_source_encoding = xgettext_global_source_encoding; 1558 free_token (t2); 1559 free_token (t1); 1560 } 1561 else 1562 { 1563 x_perl_unlex (t2); 1564 } 1565 } 1566 else 1567 { 1568 x_perl_unlex (t1); 1569 if (extract_balanced (mlp, token_type_rbrace, true, false, 1570 null_context, context_iter, 1571 1, arglist_parser_alloc (mlp, &shapes))) 1572 return; 1573 } 1574 } 1575 } 1576 else 1577 { 1578 phase2_ungetc (c); 1579 } 1580 } 1581 else 1582 { 1583 phase2_ungetc (c); 1584 } 1585 } 1586 1587 /* Now consume "->", "[...]", and "{...}". */ 1588 for (;;) 1589 { 1590 int c = phase2_getc (); 1591 int c2; 1592 1593 switch (c) 1594 { 1595 case '{': 1596#if DEBUG_PERL 1597 fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n", 1598 real_file_name, line_number); 1599#endif 1600 extract_balanced (mlp, token_type_rbrace, true, false, 1601 null_context, null_context_list_iterator, 1602 1, arglist_parser_alloc (mlp, NULL)); 1603 break; 1604 1605 case '[': 1606#if DEBUG_PERL 1607 fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n", 1608 real_file_name, line_number); 1609#endif 1610 extract_balanced (mlp, token_type_rbracket, true, false, 1611 null_context, null_context_list_iterator, 1612 1, arglist_parser_alloc (mlp, NULL)); 1613 break; 1614 1615 case '-': 1616 c2 = phase1_getc (); 1617 if (c2 == '>') 1618 { 1619#if DEBUG_PERL 1620 fprintf (stderr, "%s:%d: another \"->\" after varname\n", 1621 real_file_name, line_number); 1622#endif 1623 break; 1624 } 1625 else if (c2 != '\n') 1626 { 1627 /* Discarding the newline is harmless here. The only 1628 special character recognized after a minus is greater-than 1629 for dereference. However, the sequence "-\n>" that we 1630 treat incorrectly here, is a syntax error. */ 1631 phase1_ungetc (c2); 1632 } 1633 /* FALLTHROUGH */ 1634 1635 default: 1636#if DEBUG_PERL 1637 fprintf (stderr, "%s:%d: variable finished\n", 1638 real_file_name, line_number); 1639#endif 1640 phase2_ungetc (c); 1641 return; 1642 } 1643 } 1644} 1645 1646/* Actually a simplified version of extract_variable(). It searches for 1647 variables inside a double-quoted string that may interpolate to 1648 some keyword hash (reference). The string is UTF-8 encoded. */ 1649static void 1650interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) 1651{ 1652 static char *buffer; 1653 static int bufmax = 0; 1654 int bufpos = 0; 1655 flag_context_ty context; 1656 int c; 1657 bool maybe_hash_deref = false; 1658 enum parser_state 1659 { 1660 initial, 1661 one_dollar, 1662 two_dollars, 1663 identifier, 1664 minus, 1665 wait_lbrace, 1666 wait_quote, 1667 dquote, 1668 squote, 1669 barekey, 1670 wait_rbrace 1671 } state; 1672 token_ty token; 1673 1674 lex_pos_ty pos; 1675 1676 /* States are: 1677 * 1678 * initial: initial 1679 * one_dollar: dollar sign seen in state INITIAL 1680 * two_dollars: another dollar-sign has been seen in state ONE_DOLLAR 1681 * identifier: a valid identifier character has been seen in state 1682 * ONE_DOLLAR or TWO_DOLLARS 1683 * minus: a minus-sign has been seen in state IDENTIFIER 1684 * wait_lbrace: a greater-than has been seen in state MINUS 1685 * wait_quote: a left brace has been seen in state IDENTIFIER or in 1686 * state WAIT_LBRACE 1687 * dquote: a double-quote has been seen in state WAIT_QUOTE 1688 * squote: a single-quote has been seen in state WAIT_QUOTE 1689 * barekey: an bareword character has been seen in state WAIT_QUOTE 1690 * wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE 1691 * 1692 * In the states initial...identifier the context is null_context; in the 1693 * states minus...wait_rbrace the context is the one suitable for the first 1694 * argument of the last seen identifier. 1695 */ 1696 state = initial; 1697 context = null_context; 1698 1699 token.type = token_type_string; 1700 token.sub_type = string_type_qq; 1701 token.line_number = line_number; 1702 /* No need for token.comment = add_reference (savable_comment); here. 1703 We can let token.comment uninitialized here, and use savable_comment 1704 directly, because this function only parses the given string and does 1705 not call phase2_getc. */ 1706 pos.file_name = logical_file_name; 1707 pos.line_number = lineno; 1708 1709 while ((c = (unsigned char) *string++) != '\0') 1710 { 1711 void *keyword_value; 1712 1713 if (state == initial) 1714 bufpos = 0; 1715 1716 if (c == '\n') 1717 lineno++; 1718 1719 if (bufpos + 1 >= bufmax) 1720 { 1721 bufmax = 2 * bufmax + 10; 1722 buffer = xrealloc (buffer, bufmax); 1723 } 1724 1725 switch (state) 1726 { 1727 case initial: 1728 switch (c) 1729 { 1730 case '\\': 1731 c = (unsigned char) *string++; 1732 if (c == '\0') 1733 return; 1734 break; 1735 case '$': 1736 buffer[bufpos++] = '$'; 1737 maybe_hash_deref = false; 1738 state = one_dollar; 1739 break; 1740 default: 1741 break; 1742 } 1743 break; 1744 case one_dollar: 1745 switch (c) 1746 { 1747 case '$': 1748 /* 1749 * This is enough to make us believe later that we dereference 1750 * a hash reference. 1751 */ 1752 maybe_hash_deref = true; 1753 state = two_dollars; 1754 break; 1755 default: 1756 if (c == '_' || c == ':' || c == '\'' || c >= 0x80 1757 || (c >= 'A' && c <= 'Z') 1758 || (c >= 'a' && c <= 'z') 1759 || (c >= '0' && c <= '9')) 1760 { 1761 buffer[bufpos++] = c; 1762 state = identifier; 1763 } 1764 else 1765 state = initial; 1766 break; 1767 } 1768 break; 1769 case two_dollars: 1770 if (c == '_' || c == ':' || c == '\'' || c >= 0x80 1771 || (c >= 'A' && c <= 'Z') 1772 || (c >= 'a' && c <= 'z') 1773 || (c >= '0' && c <= '9')) 1774 { 1775 buffer[bufpos++] = c; 1776 state = identifier; 1777 } 1778 else 1779 state = initial; 1780 break; 1781 case identifier: 1782 switch (c) 1783 { 1784 case '-': 1785 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value) 1786 == 0) 1787 { 1788 flag_context_list_iterator_ty context_iter = 1789 flag_context_list_iterator ( 1790 flag_context_list_table_lookup ( 1791 flag_context_list_table, 1792 buffer, bufpos)); 1793 context = 1794 inherited_context (null_context, 1795 flag_context_list_iterator_advance ( 1796 &context_iter)); 1797 state = minus; 1798 } 1799 else 1800 state = initial; 1801 break; 1802 case '{': 1803 if (!maybe_hash_deref) 1804 buffer[0] = '%'; 1805 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value) 1806 == 0) 1807 { 1808 flag_context_list_iterator_ty context_iter = 1809 flag_context_list_iterator ( 1810 flag_context_list_table_lookup ( 1811 flag_context_list_table, 1812 buffer, bufpos)); 1813 context = 1814 inherited_context (null_context, 1815 flag_context_list_iterator_advance ( 1816 &context_iter)); 1817 state = wait_quote; 1818 } 1819 else 1820 state = initial; 1821 break; 1822 default: 1823 if (c == '_' || c == ':' || c == '\'' || c >= 0x80 1824 || (c >= 'A' && c <= 'Z') 1825 || (c >= 'a' && c <= 'z') 1826 || (c >= '0' && c <= '9')) 1827 { 1828 buffer[bufpos++] = c; 1829 } 1830 else 1831 state = initial; 1832 break; 1833 } 1834 break; 1835 case minus: 1836 switch (c) 1837 { 1838 case '>': 1839 state = wait_lbrace; 1840 break; 1841 default: 1842 context = null_context; 1843 state = initial; 1844 break; 1845 } 1846 break; 1847 case wait_lbrace: 1848 switch (c) 1849 { 1850 case '{': 1851 state = wait_quote; 1852 break; 1853 default: 1854 context = null_context; 1855 state = initial; 1856 break; 1857 } 1858 break; 1859 case wait_quote: 1860 switch (c) 1861 { 1862 case_whitespace: 1863 break; 1864 case '\'': 1865 pos.line_number = lineno; 1866 bufpos = 0; 1867 state = squote; 1868 break; 1869 case '"': 1870 pos.line_number = lineno; 1871 bufpos = 0; 1872 state = dquote; 1873 break; 1874 default: 1875 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80 1876 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) 1877 { 1878 pos.line_number = lineno; 1879 bufpos = 0; 1880 buffer[bufpos++] = c; 1881 state = barekey; 1882 } 1883 else 1884 { 1885 context = null_context; 1886 state = initial; 1887 } 1888 break; 1889 } 1890 break; 1891 case dquote: 1892 switch (c) 1893 { 1894 case '"': 1895 /* The resulting string has to be interpolated twice. */ 1896 buffer[bufpos] = '\0'; 1897 token.string = xstrdup (buffer); 1898 extract_quotelike_pass3 (&token, EXIT_FAILURE); 1899 /* The string can only shrink with interpolation (because 1900 we ignore \Q). */ 1901 if (!(strlen (token.string) <= bufpos)) 1902 abort (); 1903 strcpy (buffer, token.string); 1904 free (token.string); 1905 state = wait_rbrace; 1906 break; 1907 case '\\': 1908 if (string[0] == '\"') 1909 { 1910 buffer[bufpos++] = string++[0]; 1911 } 1912 else if (string[0]) 1913 { 1914 buffer[bufpos++] = '\\'; 1915 buffer[bufpos++] = string++[0]; 1916 } 1917 else 1918 { 1919 context = null_context; 1920 state = initial; 1921 } 1922 break; 1923 default: 1924 buffer[bufpos++] = c; 1925 break; 1926 } 1927 break; 1928 case squote: 1929 switch (c) 1930 { 1931 case '\'': 1932 state = wait_rbrace; 1933 break; 1934 case '\\': 1935 if (string[0] == '\'') 1936 { 1937 buffer[bufpos++] = string++[0]; 1938 } 1939 else if (string[0]) 1940 { 1941 buffer[bufpos++] = '\\'; 1942 buffer[bufpos++] = string++[0]; 1943 } 1944 else 1945 { 1946 context = null_context; 1947 state = initial; 1948 } 1949 break; 1950 default: 1951 buffer[bufpos++] = c; 1952 break; 1953 } 1954 break; 1955 case barekey: 1956 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80 1957 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) 1958 { 1959 buffer[bufpos++] = c; 1960 break; 1961 } 1962 else if (is_whitespace (c)) 1963 { 1964 state = wait_rbrace; 1965 break; 1966 } 1967 else if (c != '}') 1968 { 1969 context = null_context; 1970 state = initial; 1971 break; 1972 } 1973 /* Must be right brace. */ 1974 /* FALLTHROUGH */ 1975 case wait_rbrace: 1976 switch (c) 1977 { 1978 case_whitespace: 1979 break; 1980 case '}': 1981 buffer[bufpos] = '\0'; 1982 token.string = xstrdup (buffer); 1983 extract_quotelike_pass3 (&token, EXIT_FAILURE); 1984 xgettext_current_source_encoding = po_charset_utf8; 1985 remember_a_message (mlp, NULL, token.string, context, &pos, 1986 savable_comment); 1987 xgettext_current_source_encoding = xgettext_global_source_encoding; 1988 /* FALLTHROUGH */ 1989 default: 1990 context = null_context; 1991 state = initial; 1992 break; 1993 } 1994 break; 1995 } 1996 } 1997} 1998 1999/* The last token seen in the token stream. This is important for the 2000 interpretation of '?' and '/'. */ 2001static token_type_ty last_token; 2002 2003/* Combine characters into tokens. Discard whitespace. */ 2004 2005static void 2006x_perl_prelex (message_list_ty *mlp, token_ty *tp) 2007{ 2008 static char *buffer; 2009 static int bufmax; 2010 int bufpos; 2011 int c; 2012 2013 for (;;) 2014 { 2015 c = phase2_getc (); 2016 tp->line_number = line_number; 2017 2018 switch (c) 2019 { 2020 case EOF: 2021 tp->type = token_type_eof; 2022 return; 2023 2024 case '\n': 2025 if (last_non_comment_line > last_comment_line) 2026 savable_comment_reset (); 2027 /* FALLTHROUGH */ 2028 case '\t': 2029 case ' ': 2030 /* Ignore whitespace. */ 2031 continue; 2032 2033 case '%': 2034 case '@': 2035 case '*': 2036 case '$': 2037 if (!extract_all) 2038 { 2039 extract_variable (mlp, tp, c); 2040 prefer_division_over_regexp = true; 2041 return; 2042 } 2043 break; 2044 } 2045 2046 last_non_comment_line = tp->line_number; 2047 2048 switch (c) 2049 { 2050 case '.': 2051 { 2052 int c2 = phase1_getc (); 2053 phase1_ungetc (c2); 2054 if (c2 == '.') 2055 { 2056 tp->type = token_type_other; 2057 prefer_division_over_regexp = false; 2058 return; 2059 } 2060 else if (c2 >= '0' && c2 <= '9') 2061 { 2062 prefer_division_over_regexp = false; 2063 } 2064 else 2065 { 2066 tp->type = token_type_dot; 2067 prefer_division_over_regexp = true; 2068 return; 2069 } 2070 } 2071 /* FALLTHROUGH */ 2072 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 2073 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 2074 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 2075 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 2076 case 'Y': case 'Z': 2077 case '_': 2078 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 2079 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 2080 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 2081 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 2082 case 'y': case 'z': 2083 case '0': case '1': case '2': case '3': case '4': 2084 case '5': case '6': case '7': case '8': case '9': 2085 /* Symbol, or part of a number. */ 2086 prefer_division_over_regexp = true; 2087 bufpos = 0; 2088 for (;;) 2089 { 2090 if (bufpos >= bufmax) 2091 { 2092 bufmax = 2 * bufmax + 10; 2093 buffer = xrealloc (buffer, bufmax); 2094 } 2095 buffer[bufpos++] = c; 2096 c = phase1_getc (); 2097 switch (c) 2098 { 2099 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 2100 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 2101 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 2102 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 2103 case 'Y': case 'Z': 2104 case '_': 2105 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 2106 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 2107 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 2108 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 2109 case 'y': case 'z': 2110 case '0': case '1': case '2': case '3': case '4': 2111 case '5': case '6': case '7': case '8': case '9': 2112 continue; 2113 2114 default: 2115 phase1_ungetc (c); 2116 break; 2117 } 2118 break; 2119 } 2120 if (bufpos >= bufmax) 2121 { 2122 bufmax = 2 * bufmax + 10; 2123 buffer = xrealloc (buffer, bufmax); 2124 } 2125 buffer[bufpos] = '\0'; 2126 2127 if (strcmp (buffer, "__END__") == 0 2128 || strcmp (buffer, "__DATA__") == 0) 2129 { 2130 end_of_file = true; 2131 tp->type = token_type_eof; 2132 return; 2133 } 2134 else if (strcmp (buffer, "and") == 0 2135 || strcmp (buffer, "cmp") == 0 2136 || strcmp (buffer, "eq") == 0 2137 || strcmp (buffer, "if") == 0 2138 || strcmp (buffer, "ge") == 0 2139 || strcmp (buffer, "gt") == 0 2140 || strcmp (buffer, "le") == 0 2141 || strcmp (buffer, "lt") == 0 2142 || strcmp (buffer, "ne") == 0 2143 || strcmp (buffer, "not") == 0 2144 || strcmp (buffer, "or") == 0 2145 || strcmp (buffer, "unless") == 0 2146 || strcmp (buffer, "while") == 0 2147 || strcmp (buffer, "xor") == 0) 2148 { 2149 tp->type = token_type_named_op; 2150 tp->string = xstrdup (buffer); 2151 prefer_division_over_regexp = false; 2152 return; 2153 } 2154 else if (strcmp (buffer, "s") == 0 2155 || strcmp (buffer, "y") == 0 2156 || strcmp (buffer, "tr") == 0) 2157 { 2158 int delim = phase1_getc (); 2159 2160 while (is_whitespace (delim)) 2161 delim = phase2_getc (); 2162 2163 if (delim == EOF) 2164 { 2165 tp->type = token_type_eof; 2166 return; 2167 } 2168 if ((delim >= '0' && delim <= '9') 2169 || (delim >= 'A' && delim <= 'Z') 2170 || (delim >= 'a' && delim <= 'z')) 2171 { 2172 /* False positive. */ 2173 phase2_ungetc (delim); 2174 tp->type = token_type_symbol; 2175 tp->sub_type = symbol_type_none; 2176 tp->string = xstrdup (buffer); 2177 prefer_division_over_regexp = true; 2178 return; 2179 } 2180 extract_triple_quotelike (mlp, tp, delim, 2181 buffer[0] == 's' && delim != '\''); 2182 2183 /* Eat the following modifiers. */ 2184 do 2185 c = phase1_getc (); 2186 while (c >= 'a' && c <= 'z'); 2187 phase1_ungetc (c); 2188 return; 2189 } 2190 else if (strcmp (buffer, "m") == 0) 2191 { 2192 int delim = phase1_getc (); 2193 2194 while (is_whitespace (delim)) 2195 delim = phase2_getc (); 2196 2197 if (delim == EOF) 2198 { 2199 tp->type = token_type_eof; 2200 return; 2201 } 2202 if ((delim >= '0' && delim <= '9') 2203 || (delim >= 'A' && delim <= 'Z') 2204 || (delim >= 'a' && delim <= 'z')) 2205 { 2206 /* False positive. */ 2207 phase2_ungetc (delim); 2208 tp->type = token_type_symbol; 2209 tp->sub_type = symbol_type_none; 2210 tp->string = xstrdup (buffer); 2211 prefer_division_over_regexp = true; 2212 return; 2213 } 2214 extract_quotelike (tp, delim); 2215 if (delim != '\'') 2216 interpolate_keywords (mlp, tp->string, line_number); 2217 free (tp->string); 2218 drop_reference (tp->comment); 2219 tp->type = token_type_regex_op; 2220 prefer_division_over_regexp = true; 2221 2222 /* Eat the following modifiers. */ 2223 do 2224 c = phase1_getc (); 2225 while (c >= 'a' && c <= 'z'); 2226 phase1_ungetc (c); 2227 return; 2228 } 2229 else if (strcmp (buffer, "qq") == 0 2230 || strcmp (buffer, "q") == 0 2231 || strcmp (buffer, "qx") == 0 2232 || strcmp (buffer, "qw") == 0 2233 || strcmp (buffer, "qr") == 0) 2234 { 2235 /* The qw (...) construct is not really a string but we 2236 can treat in the same manner and then pretend it is 2237 a symbol. Rationale: Saying "qw (foo bar)" is the 2238 same as "my @list = ('foo', 'bar'); @list;". */ 2239 2240 int delim = phase1_getc (); 2241 2242 while (is_whitespace (delim)) 2243 delim = phase2_getc (); 2244 2245 if (delim == EOF) 2246 { 2247 tp->type = token_type_eof; 2248 return; 2249 } 2250 prefer_division_over_regexp = true; 2251 2252 if ((delim >= '0' && delim <= '9') 2253 || (delim >= 'A' && delim <= 'Z') 2254 || (delim >= 'a' && delim <= 'z')) 2255 { 2256 /* False positive. */ 2257 phase2_ungetc (delim); 2258 tp->type = token_type_symbol; 2259 tp->sub_type = symbol_type_none; 2260 tp->string = xstrdup (buffer); 2261 prefer_division_over_regexp = true; 2262 return; 2263 } 2264 2265 extract_quotelike (tp, delim); 2266 2267 switch (buffer[1]) 2268 { 2269 case 'q': 2270 case 'x': 2271 tp->type = token_type_string; 2272 tp->sub_type = string_type_qq; 2273 interpolate_keywords (mlp, tp->string, line_number); 2274 break; 2275 case 'r': 2276 drop_reference (tp->comment); 2277 tp->type = token_type_regex_op; 2278 break; 2279 case 'w': 2280 drop_reference (tp->comment); 2281 tp->type = token_type_symbol; 2282 tp->sub_type = symbol_type_none; 2283 break; 2284 case '\0': 2285 tp->type = token_type_string; 2286 tp->sub_type = string_type_q; 2287 break; 2288 default: 2289 abort (); 2290 } 2291 return; 2292 } 2293 else if (strcmp (buffer, "grep") == 0 2294 || strcmp (buffer, "split") == 0) 2295 { 2296 prefer_division_over_regexp = false; 2297 } 2298 tp->type = token_type_symbol; 2299 tp->sub_type = (strcmp (buffer, "sub") == 0 2300 ? symbol_type_sub 2301 : symbol_type_none); 2302 tp->string = xstrdup (buffer); 2303 return; 2304 2305 case '"': 2306 prefer_division_over_regexp = true; 2307 extract_quotelike (tp, c); 2308 tp->sub_type = string_type_qq; 2309 interpolate_keywords (mlp, tp->string, line_number); 2310 return; 2311 2312 case '`': 2313 prefer_division_over_regexp = true; 2314 extract_quotelike (tp, c); 2315 tp->sub_type = string_type_qq; 2316 interpolate_keywords (mlp, tp->string, line_number); 2317 return; 2318 2319 case '\'': 2320 prefer_division_over_regexp = true; 2321 extract_quotelike (tp, c); 2322 tp->sub_type = string_type_q; 2323 return; 2324 2325 case '(': 2326 c = phase2_getc (); 2327 if (c == ')') 2328 /* Ignore empty list. */ 2329 continue; 2330 else 2331 phase2_ungetc (c); 2332 tp->type = token_type_lparen; 2333 prefer_division_over_regexp = false; 2334 return; 2335 2336 case ')': 2337 tp->type = token_type_rparen; 2338 prefer_division_over_regexp = true; 2339 return; 2340 2341 case '{': 2342 tp->type = token_type_lbrace; 2343 prefer_division_over_regexp = false; 2344 return; 2345 2346 case '}': 2347 tp->type = token_type_rbrace; 2348 prefer_division_over_regexp = false; 2349 return; 2350 2351 case '[': 2352 tp->type = token_type_lbracket; 2353 prefer_division_over_regexp = false; 2354 return; 2355 2356 case ']': 2357 tp->type = token_type_rbracket; 2358 prefer_division_over_regexp = false; 2359 return; 2360 2361 case ';': 2362 tp->type = token_type_semicolon; 2363 prefer_division_over_regexp = false; 2364 return; 2365 2366 case ',': 2367 tp->type = token_type_comma; 2368 prefer_division_over_regexp = false; 2369 return; 2370 2371 case '=': 2372 /* Check for fat comma. */ 2373 c = phase1_getc (); 2374 if (c == '>') 2375 { 2376 tp->type = token_type_fat_comma; 2377 return; 2378 } 2379 else if (linepos == 2 2380 && (last_token == token_type_semicolon 2381 || last_token == token_type_rbrace) 2382 && ((c >= 'A' && c <='Z') 2383 || (c >= 'a' && c <= 'z'))) 2384 { 2385#if DEBUG_PERL 2386 fprintf (stderr, "%s:%d: start pod section\n", 2387 real_file_name, line_number); 2388#endif 2389 skip_pod (); 2390#if DEBUG_PERL 2391 fprintf (stderr, "%s:%d: end pod section\n", 2392 real_file_name, line_number); 2393#endif 2394 continue; 2395 } 2396 phase1_ungetc (c); 2397 tp->type = token_type_other; 2398 prefer_division_over_regexp = false; 2399 return; 2400 2401 case '<': 2402 /* Check for <<EOF and friends. */ 2403 prefer_division_over_regexp = false; 2404 c = phase1_getc (); 2405 if (c == '<') 2406 { 2407 c = phase1_getc (); 2408 if (c == '\'') 2409 { 2410 char *string; 2411 extract_quotelike (tp, c); 2412 string = get_here_document (tp->string); 2413 free (tp->string); 2414 tp->string = string; 2415 tp->type = token_type_string; 2416 tp->sub_type = string_type_verbatim; 2417 tp->line_number = line_number + 1; 2418 return; 2419 } 2420 else if (c == '"') 2421 { 2422 char *string; 2423 extract_quotelike (tp, c); 2424 string = get_here_document (tp->string); 2425 free (tp->string); 2426 tp->string = string; 2427 tp->type = token_type_string; 2428 tp->sub_type = string_type_qq; 2429 tp->line_number = line_number + 1; 2430 interpolate_keywords (mlp, tp->string, line_number + 1); 2431 return; 2432 } 2433 else if ((c >= 'A' && c <= 'Z') 2434 || (c >= 'a' && c <= 'z') 2435 || c == '_') 2436 { 2437 bufpos = 0; 2438 while ((c >= 'A' && c <= 'Z') 2439 || (c >= 'a' && c <= 'z') 2440 || (c >= '0' && c <= '9') 2441 || c == '_' || c >= 0x80) 2442 { 2443 if (bufpos >= bufmax) 2444 { 2445 bufmax = 2 * bufmax + 10; 2446 buffer = xrealloc (buffer, bufmax); 2447 } 2448 buffer[bufpos++] = c; 2449 c = phase1_getc (); 2450 } 2451 if (c == EOF) 2452 { 2453 tp->type = token_type_eof; 2454 return; 2455 } 2456 else 2457 { 2458 char *string; 2459 phase1_ungetc (c); 2460 if (bufpos >= bufmax) 2461 { 2462 bufmax = 2 * bufmax + 10; 2463 buffer = xrealloc (buffer, bufmax); 2464 } 2465 buffer[bufpos++] = '\0'; 2466 string = get_here_document (buffer); 2467 tp->string = string; 2468 tp->type = token_type_string; 2469 tp->sub_type = string_type_qq; 2470 tp->comment = add_reference (savable_comment); 2471 tp->line_number = line_number + 1; 2472 interpolate_keywords (mlp, tp->string, line_number + 1); 2473 return; 2474 } 2475 } 2476 else 2477 { 2478 tp->type = token_type_other; 2479 return; 2480 } 2481 } 2482 else 2483 { 2484 phase1_ungetc (c); 2485 tp->type = token_type_other; 2486 } 2487 return; /* End of case '>'. */ 2488 2489 case '-': 2490 /* Check for dereferencing operator. */ 2491 c = phase1_getc (); 2492 if (c == '>') 2493 { 2494 tp->type = token_type_dereference; 2495 return; 2496 } 2497 else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) 2498 { 2499 /* One of the -X (filetest) functions. We play safe 2500 and accept all alphabetical characters here. */ 2501 tp->type = token_type_other; 2502 return; 2503 } 2504 phase1_ungetc (c); 2505 tp->type = token_type_other; 2506 prefer_division_over_regexp = false; 2507 return; 2508 2509 case '/': 2510 case '?': 2511 if (!prefer_division_over_regexp) 2512 { 2513 extract_quotelike (tp, c); 2514 interpolate_keywords (mlp, tp->string, line_number); 2515 free (tp->string); 2516 drop_reference (tp->comment); 2517 tp->type = token_type_other; 2518 prefer_division_over_regexp = true; 2519 /* Eat the following modifiers. */ 2520 do 2521 c = phase1_getc (); 2522 while (c >= 'a' && c <= 'z'); 2523 phase1_ungetc (c); 2524 return; 2525 } 2526 /* FALLTHROUGH */ 2527 2528 default: 2529 /* We could carefully recognize each of the 2 and 3 character 2530 operators, but it is not necessary, as we only need to recognize 2531 gettext invocations. Don't bother. */ 2532 tp->type = token_type_other; 2533 prefer_division_over_regexp = false; 2534 return; 2535 } 2536 } 2537} 2538 2539 2540/* A token stack used as a lookahead buffer. */ 2541 2542typedef struct token_stack_ty token_stack_ty; 2543struct token_stack_ty 2544{ 2545 token_ty **items; 2546 size_t nitems; 2547 size_t nitems_max; 2548}; 2549 2550static struct token_stack_ty token_stack; 2551 2552#if DEBUG_PERL 2553/* Dumps all resources allocated by stack STACK. */ 2554static int 2555token_stack_dump (token_stack_ty *stack) 2556{ 2557 size_t i; 2558 2559 fprintf (stderr, "BEGIN STACK DUMP\n"); 2560 for (i = 0; i < stack->nitems; i++) 2561 { 2562 token_ty *token = stack->items[i]; 2563 fprintf (stderr, " [%s]\n", token2string (token)); 2564 switch (token->type) 2565 { 2566 case token_type_named_op: 2567 case token_type_string: 2568 case token_type_symbol: 2569 case token_type_variable: 2570 fprintf (stderr, " string: %s\n", token->string); 2571 break; 2572 } 2573 } 2574 fprintf (stderr, "END STACK DUMP\n"); 2575 return 0; 2576} 2577#endif 2578 2579/* Pushes the token TOKEN onto the stack STACK. */ 2580static inline void 2581token_stack_push (token_stack_ty *stack, token_ty *token) 2582{ 2583 if (stack->nitems >= stack->nitems_max) 2584 { 2585 size_t nbytes; 2586 2587 stack->nitems_max = 2 * stack->nitems_max + 4; 2588 nbytes = stack->nitems_max * sizeof (token_ty *); 2589 stack->items = xrealloc (stack->items, nbytes); 2590 } 2591 stack->items[stack->nitems++] = token; 2592} 2593 2594/* Pops the most recently pushed token from the stack STACK and returns it. 2595 Returns NULL if the stack is empty. */ 2596static inline token_ty * 2597token_stack_pop (token_stack_ty *stack) 2598{ 2599 if (stack->nitems > 0) 2600 return stack->items[--(stack->nitems)]; 2601 else 2602 return NULL; 2603} 2604 2605/* Return the top of the stack without removing it from the stack, or 2606 NULL if the stack is empty. */ 2607static inline token_ty * 2608token_stack_peek (const token_stack_ty *stack) 2609{ 2610 if (stack->nitems > 0) 2611 return stack->items[stack->nitems - 1]; 2612 else 2613 return NULL; 2614} 2615 2616/* Frees all resources allocated by stack STACK. */ 2617static inline void 2618token_stack_free (token_stack_ty *stack) 2619{ 2620 size_t i; 2621 2622 for (i = 0; i < stack->nitems; i++) 2623 free_token (stack->items[i]); 2624 free (stack->items); 2625} 2626 2627 2628static token_ty * 2629x_perl_lex (message_list_ty *mlp) 2630{ 2631#if DEBUG_PERL 2632 int dummy = token_stack_dump (&token_stack); 2633#endif 2634 token_ty *tp = token_stack_pop (&token_stack); 2635 2636 if (!tp) 2637 { 2638 tp = XMALLOC (token_ty); 2639 x_perl_prelex (mlp, tp); 2640#if DEBUG_PERL 2641 fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n", 2642 real_file_name, line_number, token2string (tp)); 2643#endif 2644 } 2645#if DEBUG_PERL 2646 else 2647 { 2648 fprintf (stderr, "%s:%d: %s recycled from stack\n", 2649 real_file_name, line_number, token2string (tp)); 2650 } 2651#endif 2652 2653 /* A symbol followed by a fat comma is really a single-quoted string. 2654 Function definitions or forward declarations also need a special 2655 handling because the dollars and at signs inside the parentheses 2656 must not be interpreted as the beginning of a variable ')'. */ 2657 if (tp->type == token_type_symbol || tp->type == token_type_named_op) 2658 { 2659 token_ty *next = token_stack_peek (&token_stack); 2660 2661 if (!next) 2662 { 2663#if DEBUG_PERL 2664 fprintf (stderr, "%s:%d: pre-fetching next token\n", 2665 real_file_name, line_number); 2666#endif 2667 next = x_perl_lex (mlp); 2668 x_perl_unlex (next); 2669#if DEBUG_PERL 2670 fprintf (stderr, "%s:%d: unshifted next token\n", 2671 real_file_name, line_number); 2672#endif 2673 } 2674 2675#if DEBUG_PERL 2676 fprintf (stderr, "%s:%d: next token is %s\n", 2677 real_file_name, line_number, token2string (next)); 2678#endif 2679 2680 if (next->type == token_type_fat_comma) 2681 { 2682 tp->type = token_type_string; 2683 tp->sub_type = string_type_q; 2684 tp->comment = add_reference (savable_comment); 2685#if DEBUG_PERL 2686 fprintf (stderr, 2687 "%s:%d: token %s mutated to token_type_string\n", 2688 real_file_name, line_number, token2string (tp)); 2689#endif 2690 } 2691 else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub 2692 && next->type == token_type_symbol) 2693 { 2694 /* Start of a function declaration or definition. Mark this 2695 symbol as a function name, so that we can later eat up 2696 possible prototype information. */ 2697#if DEBUG_PERL 2698 fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n", 2699 real_file_name, line_number, next->string); 2700#endif 2701 next->sub_type = symbol_type_function; 2702 } 2703 else if (tp->type == token_type_symbol 2704 && (tp->sub_type == symbol_type_sub 2705 || tp->sub_type == symbol_type_function) 2706 && next->type == token_type_lparen) 2707 { 2708 /* For simplicity we simply consume everything up to the 2709 closing parenthesis. Actually only a limited set of 2710 characters is allowed inside parentheses but we leave 2711 complaints to the interpreter and are prepared for 2712 future extensions to the Perl syntax. */ 2713 int c; 2714 2715#if DEBUG_PERL 2716 fprintf (stderr, "%s:%d: consuming prototype information\n", 2717 real_file_name, line_number); 2718#endif 2719 2720 do 2721 { 2722 c = phase1_getc (); 2723#if DEBUG_PERL 2724 fprintf (stderr, " consuming character '%c'\n", c); 2725#endif 2726 } 2727 while (c != EOF && c != ')'); 2728 phase1_ungetc (c); 2729 } 2730 } 2731 2732 return tp; 2733} 2734 2735static void 2736x_perl_unlex (token_ty *tp) 2737{ 2738 token_stack_push (&token_stack, tp); 2739} 2740 2741 2742/* ========================= Extracting strings. ========================== */ 2743 2744/* Assuming TP is a string token, this function accumulates all subsequent 2745 . string2 . string3 ... to the string. (String concatenation.) */ 2746 2747static char * 2748collect_message (message_list_ty *mlp, token_ty *tp, int error_level) 2749{ 2750 char *string; 2751 size_t len; 2752 2753 extract_quotelike_pass3 (tp, error_level); 2754 string = xstrdup (tp->string); 2755 len = strlen (tp->string) + 1; 2756 2757 for (;;) 2758 { 2759 int c; 2760 2761 do 2762 c = phase2_getc (); 2763 while (is_whitespace (c)); 2764 2765 if (c != '.') 2766 { 2767 phase2_ungetc (c); 2768 return string; 2769 } 2770 2771 do 2772 c = phase2_getc (); 2773 while (is_whitespace (c)); 2774 2775 phase2_ungetc (c); 2776 2777 if (c == '"' || c == '\'' || c == '`' 2778 || (!prefer_division_over_regexp && (c == '/' || c == '?')) 2779 || c == 'q') 2780 { 2781 token_ty *qstring = x_perl_lex (mlp); 2782 if (qstring->type != token_type_string) 2783 { 2784 /* assert (qstring->type == token_type_symbol) */ 2785 x_perl_unlex (qstring); 2786 return string; 2787 } 2788 2789 extract_quotelike_pass3 (qstring, error_level); 2790 len += strlen (qstring->string); 2791 string = xrealloc (string, len); 2792 strcat (string, qstring->string); 2793 free_token (qstring); 2794 } 2795 } 2796} 2797 2798/* The file is broken into tokens. Scan the token stream, looking for 2799 a keyword, followed by a left paren, followed by a string. When we 2800 see this sequence, we have something to remember. We assume we are 2801 looking at a valid Perl program, and leave the complaints about 2802 the grammar to the compiler. 2803 2804 Normal handling: Look for 2805 keyword ( ... msgid ... ) 2806 Plural handling: Look for 2807 keyword ( ... msgid ... msgid_plural ... ) 2808 2809 We use recursion because the arguments before msgid or between msgid 2810 and msgid_plural can contain subexpressions of the same form. 2811 2812 In Perl, parentheses around function arguments can be omitted. 2813 2814 The general rules are: 2815 1) Functions declared with a prototype take exactly the specified number 2816 of arguments. 2817 sub one_arg ($) { ... } 2818 sub two_args ($$) { ... } 2819 2) When a function name is immediately followed by an opening parenthesis, 2820 the argument list ends at the corresponding closing parenthesis. 2821 2822 If rule 1 and rule 2 are contradictory, i.e. when the program calls a 2823 function with an explicit argument list and the wrong number of arguments, 2824 the program is invalid: 2825 sub two_args ($$) { ... } 2826 foo two_args (x), y - invalid due to rules 1 and 2 2827 2828 Ambiguities are resolved as follows: 2829 3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions 2830 declared with a prototype of exactly one argument take exactly one 2831 argument: 2832 foo sin x, y ==> foo (sin (x), y) 2833 sub one_arg ($) { ... } 2834 foo one_arg x, y, z ==> foo (one_arg (x), y, z) 2835 4) Other identifiers, if not immediately followed by an opening 2836 parenthesis, consume the entire remaining argument list: 2837 foo bar x, y ==> foo (bar (x, y)) 2838 sub two_args ($$) { ... } 2839 foo two_args x, y ==> foo (two_args (x, y)) 2840 2841 Other series of comma separated expressions without a function name at 2842 the beginning are comma expressions: 2843 sub two_args ($$) { ... } 2844 foo two_args x, (y, z) ==> foo (two_args (x, (y, z))) 2845 Note that the evaluation of comma expressions returns a list of values 2846 when in list context (e.g. inside the argument list of a function without 2847 prototype) but only one value when inside the argument list of a function 2848 with a prototype: 2849 sub print3 ($$$) { print @_ } 2850 print3 5, (6, 7), 8 ==> 578 2851 print 5, (6, 7), 8 ==> 5678 2852 2853 Where rule 3 or 4 contradict rule 1 or 2, the program is invalid: 2854 sin (x, y) - invalid due to rules 2 and 3 2855 sub one_arg ($) { ... } 2856 one_arg (x, y) - invalid due to rules 2 and 3 2857 sub two_args ($$) { ... } 2858 foo two_args x, y, z - invalid due to rules 1 and 4 2859 */ 2860 2861/* Extract messages until the next balanced closing parenthesis. 2862 Extracted messages are added to MLP. 2863 2864 DELIM can be either token_type_rbrace, token_type_rbracket, 2865 token_type_rparen. Additionally, if COMMA_DELIM is true, parsing 2866 stops at the next comma outside parentheses. 2867 2868 ARG is the current argument list position, starts with 1. 2869 ARGPARSER is the corresponding argument list parser. 2870 2871 Returns true for EOF, false otherwise. */ 2872 2873static bool 2874extract_balanced (message_list_ty *mlp, 2875 token_type_ty delim, bool eat_delim, bool comma_delim, 2876 flag_context_ty outer_context, 2877 flag_context_list_iterator_ty context_iter, 2878 int arg, struct arglist_parser *argparser) 2879{ 2880 /* Whether to implicitly assume the next tokens are arguments even without 2881 a '('. */ 2882 bool next_is_argument = false; 2883 /* Parameters of the keyword just seen. Defined only when next_is_argument 2884 is true. */ 2885 const struct callshapes *next_shapes = NULL; 2886 struct arglist_parser *next_argparser = NULL; 2887 2888 /* Whether to not consider strings until the next comma. */ 2889 bool skip_until_comma = false; 2890 2891 /* Context iterator that will be used if the next token is a '('. */ 2892 flag_context_list_iterator_ty next_context_iter = 2893 passthrough_context_list_iterator; 2894 /* Current context. */ 2895 flag_context_ty inner_context = 2896 inherited_context (outer_context, 2897 flag_context_list_iterator_advance (&context_iter)); 2898 2899#if DEBUG_PERL 2900 static int nesting_level = 0; 2901 2902 ++nesting_level; 2903#endif 2904 2905 last_token = token_type_semicolon; /* Safe assumption. */ 2906 prefer_division_over_regexp = false; 2907 2908 for (;;) 2909 { 2910 /* The current token. */ 2911 token_ty *tp; 2912 2913 tp = x_perl_lex (mlp); 2914 2915 last_token = tp->type; 2916 2917 if (delim == tp->type) 2918 { 2919 xgettext_current_source_encoding = po_charset_utf8; 2920 arglist_parser_done (argparser, arg); 2921 xgettext_current_source_encoding = xgettext_global_source_encoding; 2922 if (next_argparser != NULL) 2923 free (next_argparser); 2924#if DEBUG_PERL 2925 fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n", 2926 logical_file_name, tp->line_number, --nesting_level); 2927#endif 2928 if (eat_delim) 2929 free_token (tp); 2930 else 2931 /* Preserve the delimiter for the caller. */ 2932 x_perl_unlex (tp); 2933 return false; 2934 } 2935 2936 if (comma_delim && tp->type == token_type_comma) 2937 { 2938 xgettext_current_source_encoding = po_charset_utf8; 2939 arglist_parser_done (argparser, arg); 2940 xgettext_current_source_encoding = xgettext_global_source_encoding; 2941 if (next_argparser != NULL) 2942 free (next_argparser); 2943#if DEBUG_PERL 2944 fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n", 2945 logical_file_name, tp->line_number, --nesting_level); 2946#endif 2947 x_perl_unlex (tp); 2948 return false; 2949 } 2950 2951 if (next_is_argument && tp->type != token_type_lparen) 2952 { 2953 /* An argument list starts, even though there is no '('. */ 2954 bool next_comma_delim; 2955 2956 x_perl_unlex (tp); 2957 2958 if (next_shapes != NULL) 2959 /* We know something about the function being called. Assume 2960 that it consumes only one argument if no argument number or 2961 total > 1 is specified. */ 2962 { 2963 size_t i; 2964 2965 next_comma_delim = true; 2966 for (i = 0; i < next_shapes->nshapes; i++) 2967 { 2968 const struct callshape *shape = &next_shapes->shapes[i]; 2969 2970 if (shape->argnum1 > 1 2971 || shape->argnum2 > 1 2972 || shape->argnumc > 1 2973 || shape->argtotal > 1) 2974 next_comma_delim = false; 2975 } 2976 } 2977 else 2978 /* We know nothing about the function being called. It could be 2979 a function prototyped to take only one argument, or on the other 2980 hand it could be prototyped to take more than one argument or an 2981 arbitrary argument list or it could be unprototyped. Due to 2982 the way the parser works, assuming the first case gives the 2983 best results. */ 2984 next_comma_delim = true; 2985 2986 if (extract_balanced (mlp, delim, false, next_comma_delim, 2987 inner_context, next_context_iter, 2988 1, next_argparser)) 2989 { 2990 xgettext_current_source_encoding = po_charset_utf8; 2991 arglist_parser_done (argparser, arg); 2992 xgettext_current_source_encoding = xgettext_global_source_encoding; 2993 return true; 2994 } 2995 2996 next_is_argument = false; 2997 next_argparser = NULL; 2998 next_context_iter = null_context_list_iterator; 2999 continue; 3000 } 3001 3002 switch (tp->type) 3003 { 3004 case token_type_symbol: 3005#if DEBUG_PERL 3006 fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n", 3007 logical_file_name, tp->line_number, nesting_level, 3008 tp->string); 3009#endif 3010 3011 { 3012 void *keyword_value; 3013 3014 if (hash_find_entry (&keywords, tp->string, strlen (tp->string), 3015 &keyword_value) == 0) 3016 { 3017 const struct callshapes *shapes = 3018 (const struct callshapes *) keyword_value; 3019 3020 last_token = token_type_keyword_symbol; 3021 next_shapes = shapes; 3022 next_argparser = arglist_parser_alloc (mlp, shapes); 3023 } 3024 else 3025 { 3026 next_shapes = NULL; 3027 next_argparser = arglist_parser_alloc (mlp, NULL); 3028 } 3029 } 3030 next_is_argument = true; 3031 next_context_iter = 3032 flag_context_list_iterator ( 3033 flag_context_list_table_lookup ( 3034 flag_context_list_table, 3035 tp->string, strlen (tp->string))); 3036 break; 3037 3038 case token_type_variable: 3039#if DEBUG_PERL 3040 fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n", 3041 logical_file_name, tp->line_number, nesting_level, tp->string); 3042#endif 3043 prefer_division_over_regexp = true; 3044 next_is_argument = false; 3045 if (next_argparser != NULL) 3046 free (next_argparser); 3047 next_argparser = NULL; 3048 next_context_iter = null_context_list_iterator; 3049 break; 3050 3051 case token_type_lparen: 3052#if DEBUG_PERL 3053 fprintf (stderr, "%s:%d: type left parenthesis (%d)\n", 3054 logical_file_name, tp->line_number, nesting_level); 3055#endif 3056 if (next_is_argument) 3057 { 3058 /* Parse the argument list of a function call. */ 3059 if (extract_balanced (mlp, token_type_rparen, true, false, 3060 inner_context, next_context_iter, 3061 1, next_argparser)) 3062 { 3063 xgettext_current_source_encoding = po_charset_utf8; 3064 arglist_parser_done (argparser, arg); 3065 xgettext_current_source_encoding = xgettext_global_source_encoding; 3066 return true; 3067 } 3068 next_is_argument = false; 3069 next_argparser = NULL; 3070 } 3071 else 3072 { 3073 /* Parse a parenthesized expression or comma expression. */ 3074 if (extract_balanced (mlp, token_type_rparen, true, false, 3075 inner_context, next_context_iter, 3076 arg, arglist_parser_clone (argparser))) 3077 { 3078 xgettext_current_source_encoding = po_charset_utf8; 3079 arglist_parser_done (argparser, arg); 3080 xgettext_current_source_encoding = xgettext_global_source_encoding; 3081 if (next_argparser != NULL) 3082 free (next_argparser); 3083 free_token (tp); 3084 return true; 3085 } 3086 next_is_argument = false; 3087 if (next_argparser != NULL) 3088 free (next_argparser); 3089 next_argparser = NULL; 3090 } 3091 skip_until_comma = true; 3092 next_context_iter = null_context_list_iterator; 3093 break; 3094 3095 case token_type_rparen: 3096#if DEBUG_PERL 3097 fprintf (stderr, "%s:%d: type right parenthesis (%d)\n", 3098 logical_file_name, tp->line_number, nesting_level); 3099#endif 3100 next_is_argument = false; 3101 if (next_argparser != NULL) 3102 free (next_argparser); 3103 next_argparser = NULL; 3104 skip_until_comma = true; 3105 next_context_iter = null_context_list_iterator; 3106 break; 3107 3108 case token_type_comma: 3109 case token_type_fat_comma: 3110#if DEBUG_PERL 3111 fprintf (stderr, "%s:%d: type comma (%d)\n", 3112 logical_file_name, tp->line_number, nesting_level); 3113#endif 3114 if (arglist_parser_decidedp (argparser, arg)) 3115 { 3116 /* We have missed the argument. */ 3117 xgettext_current_source_encoding = po_charset_utf8; 3118 arglist_parser_done (argparser, arg); 3119 xgettext_current_source_encoding = xgettext_global_source_encoding; 3120 argparser = arglist_parser_alloc (mlp, NULL); 3121 arg = 0; 3122 } 3123 arg++; 3124#if DEBUG_PERL 3125 fprintf (stderr, "%s:%d: arg: %d\n", 3126 real_file_name, tp->line_number, arg); 3127#endif 3128 inner_context = 3129 inherited_context (outer_context, 3130 flag_context_list_iterator_advance ( 3131 &context_iter)); 3132 next_is_argument = false; 3133 if (next_argparser != NULL) 3134 free (next_argparser); 3135 next_argparser = NULL; 3136 skip_until_comma = false; 3137 next_context_iter = passthrough_context_list_iterator; 3138 break; 3139 3140 case token_type_string: 3141#if DEBUG_PERL 3142 fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n", 3143 logical_file_name, tp->line_number, nesting_level, 3144 tp->string); 3145#endif 3146 3147 if (extract_all) 3148 { 3149 char *string = collect_message (mlp, tp, EXIT_SUCCESS); 3150 lex_pos_ty pos; 3151 3152 pos.file_name = logical_file_name; 3153 pos.line_number = tp->line_number; 3154 xgettext_current_source_encoding = po_charset_utf8; 3155 remember_a_message (mlp, NULL, string, inner_context, &pos, 3156 tp->comment); 3157 xgettext_current_source_encoding = xgettext_global_source_encoding; 3158 } 3159 else if (!skip_until_comma) 3160 { 3161 /* Need to collect the complete string, with error checking, 3162 only if the argument ARG is used in ARGPARSER. */ 3163 bool must_collect = false; 3164 { 3165 size_t nalternatives = argparser->nalternatives; 3166 size_t i; 3167 3168 for (i = 0; i < nalternatives; i++) 3169 { 3170 struct partial_call *cp = &argparser->alternative[i]; 3171 3172 if (arg == cp->argnumc 3173 || arg == cp->argnum1 || arg == cp->argnum2) 3174 must_collect = true; 3175 } 3176 } 3177 3178 if (must_collect) 3179 { 3180 char *string = collect_message (mlp, tp, EXIT_FAILURE); 3181 3182 xgettext_current_source_encoding = po_charset_utf8; 3183 arglist_parser_remember (argparser, arg, 3184 string, inner_context, 3185 logical_file_name, tp->line_number, 3186 tp->comment); 3187 xgettext_current_source_encoding = xgettext_global_source_encoding; 3188 } 3189 } 3190 3191 if (arglist_parser_decidedp (argparser, arg)) 3192 { 3193 xgettext_current_source_encoding = po_charset_utf8; 3194 arglist_parser_done (argparser, arg); 3195 xgettext_current_source_encoding = xgettext_global_source_encoding; 3196 argparser = arglist_parser_alloc (mlp, NULL); 3197 } 3198 3199 next_is_argument = false; 3200 if (next_argparser != NULL) 3201 free (next_argparser); 3202 next_argparser = NULL; 3203 next_context_iter = null_context_list_iterator; 3204 break; 3205 3206 case token_type_eof: 3207#if DEBUG_PERL 3208 fprintf (stderr, "%s:%d: type EOF (%d)\n", 3209 logical_file_name, tp->line_number, nesting_level); 3210#endif 3211 xgettext_current_source_encoding = po_charset_utf8; 3212 arglist_parser_done (argparser, arg); 3213 xgettext_current_source_encoding = xgettext_global_source_encoding; 3214 if (next_argparser != NULL) 3215 free (next_argparser); 3216 next_argparser = NULL; 3217 free_token (tp); 3218 return true; 3219 3220 case token_type_lbrace: 3221#if DEBUG_PERL 3222 fprintf (stderr, "%s:%d: type lbrace (%d)\n", 3223 logical_file_name, tp->line_number, nesting_level); 3224#endif 3225 if (extract_balanced (mlp, token_type_rbrace, true, false, 3226 null_context, null_context_list_iterator, 3227 1, arglist_parser_alloc (mlp, NULL))) 3228 { 3229 xgettext_current_source_encoding = po_charset_utf8; 3230 arglist_parser_done (argparser, arg); 3231 xgettext_current_source_encoding = xgettext_global_source_encoding; 3232 if (next_argparser != NULL) 3233 free (next_argparser); 3234 free_token (tp); 3235 return true; 3236 } 3237 next_is_argument = false; 3238 if (next_argparser != NULL) 3239 free (next_argparser); 3240 next_argparser = NULL; 3241 next_context_iter = null_context_list_iterator; 3242 break; 3243 3244 case token_type_rbrace: 3245#if DEBUG_PERL 3246 fprintf (stderr, "%s:%d: type rbrace (%d)\n", 3247 logical_file_name, tp->line_number, nesting_level); 3248#endif 3249 next_is_argument = false; 3250 if (next_argparser != NULL) 3251 free (next_argparser); 3252 next_argparser = NULL; 3253 next_context_iter = null_context_list_iterator; 3254 break; 3255 3256 case token_type_lbracket: 3257#if DEBUG_PERL 3258 fprintf (stderr, "%s:%d: type lbracket (%d)\n", 3259 logical_file_name, tp->line_number, nesting_level); 3260#endif 3261 if (extract_balanced (mlp, token_type_rbracket, true, false, 3262 null_context, null_context_list_iterator, 3263 1, arglist_parser_alloc (mlp, NULL))) 3264 { 3265 xgettext_current_source_encoding = po_charset_utf8; 3266 arglist_parser_done (argparser, arg); 3267 xgettext_current_source_encoding = xgettext_global_source_encoding; 3268 if (next_argparser != NULL) 3269 free (next_argparser); 3270 free_token (tp); 3271 return true; 3272 } 3273 next_is_argument = false; 3274 if (next_argparser != NULL) 3275 free (next_argparser); 3276 next_argparser = NULL; 3277 next_context_iter = null_context_list_iterator; 3278 break; 3279 3280 case token_type_rbracket: 3281#if DEBUG_PERL 3282 fprintf (stderr, "%s:%d: type rbracket (%d)\n", 3283 logical_file_name, tp->line_number, nesting_level); 3284#endif 3285 next_is_argument = false; 3286 if (next_argparser != NULL) 3287 free (next_argparser); 3288 next_argparser = NULL; 3289 next_context_iter = null_context_list_iterator; 3290 break; 3291 3292 case token_type_semicolon: 3293#if DEBUG_PERL 3294 fprintf (stderr, "%s:%d: type semicolon (%d)\n", 3295 logical_file_name, tp->line_number, nesting_level); 3296#endif 3297 3298 /* The ultimate sign. */ 3299 xgettext_current_source_encoding = po_charset_utf8; 3300 arglist_parser_done (argparser, arg); 3301 xgettext_current_source_encoding = xgettext_global_source_encoding; 3302 argparser = arglist_parser_alloc (mlp, NULL); 3303 3304 /* FIXME: Instead of resetting outer_context here, it may be better 3305 to recurse in the next_is_argument handling above, waiting for 3306 the next semicolon or other statement terminator. */ 3307 outer_context = null_context; 3308 context_iter = null_context_list_iterator; 3309 next_is_argument = false; 3310 if (next_argparser != NULL) 3311 free (next_argparser); 3312 next_argparser = NULL; 3313 next_context_iter = passthrough_context_list_iterator; 3314 inner_context = 3315 inherited_context (outer_context, 3316 flag_context_list_iterator_advance ( 3317 &context_iter)); 3318 break; 3319 3320 case token_type_dereference: 3321#if DEBUG_PERL 3322 fprintf (stderr, "%s:%d: type dereference (%d)\n", 3323 logical_file_name, tp->line_number, nesting_level); 3324#endif 3325 next_is_argument = false; 3326 if (next_argparser != NULL) 3327 free (next_argparser); 3328 next_argparser = NULL; 3329 next_context_iter = null_context_list_iterator; 3330 break; 3331 3332 case token_type_dot: 3333#if DEBUG_PERL 3334 fprintf (stderr, "%s:%d: type dot (%d)\n", 3335 logical_file_name, tp->line_number, nesting_level); 3336#endif 3337 next_is_argument = false; 3338 if (next_argparser != NULL) 3339 free (next_argparser); 3340 next_argparser = NULL; 3341 next_context_iter = null_context_list_iterator; 3342 break; 3343 3344 case token_type_named_op: 3345#if DEBUG_PERL 3346 fprintf (stderr, "%s:%d: type named operator (%d): %s\n", 3347 logical_file_name, tp->line_number, nesting_level, 3348 tp->string); 3349#endif 3350 next_is_argument = false; 3351 if (next_argparser != NULL) 3352 free (next_argparser); 3353 next_argparser = NULL; 3354 next_context_iter = null_context_list_iterator; 3355 break; 3356 3357 case token_type_regex_op: 3358#if DEBUG_PERL 3359 fprintf (stderr, "%s:%d: type regex operator (%d)\n", 3360 logical_file_name, tp->line_number, nesting_level); 3361#endif 3362 next_is_argument = false; 3363 if (next_argparser != NULL) 3364 free (next_argparser); 3365 next_argparser = NULL; 3366 next_context_iter = null_context_list_iterator; 3367 break; 3368 3369 case token_type_other: 3370#if DEBUG_PERL 3371 fprintf (stderr, "%s:%d: type other (%d)\n", 3372 logical_file_name, tp->line_number, nesting_level); 3373#endif 3374 next_is_argument = false; 3375 if (next_argparser != NULL) 3376 free (next_argparser); 3377 next_argparser = NULL; 3378 next_context_iter = null_context_list_iterator; 3379 break; 3380 3381 default: 3382 fprintf (stderr, "%s:%d: unknown token type %d\n", 3383 real_file_name, tp->line_number, tp->type); 3384 abort (); 3385 } 3386 3387 free_token (tp); 3388 } 3389} 3390 3391void 3392extract_perl (FILE *f, const char *real_filename, const char *logical_filename, 3393 flag_context_list_table_ty *flag_table, 3394 msgdomain_list_ty *mdlp) 3395{ 3396 message_list_ty *mlp = mdlp->item[0]->messages; 3397 3398 fp = f; 3399 real_file_name = real_filename; 3400 logical_file_name = xstrdup (logical_filename); 3401 line_number = 0; 3402 3403 last_comment_line = -1; 3404 last_non_comment_line = -1; 3405 3406 flag_context_list_table = flag_table; 3407 3408 init_keywords (); 3409 3410 token_stack.items = NULL; 3411 token_stack.nitems = 0; 3412 token_stack.nitems_max = 0; 3413 linesize = 0; 3414 linepos = 0; 3415 here_eaten = 0; 3416 end_of_file = false; 3417 3418 /* Eat tokens until eof is seen. When extract_balanced returns 3419 due to an unbalanced closing brace, just restart it. */ 3420 while (!extract_balanced (mlp, token_type_rbrace, true, false, 3421 null_context, null_context_list_iterator, 3422 1, arglist_parser_alloc (mlp, NULL))) 3423 ; 3424 3425 fp = NULL; 3426 real_file_name = NULL; 3427 free (logical_file_name); 3428 logical_file_name = NULL; 3429 line_number = 0; 3430 last_token = token_type_semicolon; 3431 token_stack_free (&token_stack); 3432 here_eaten = 0; 3433 end_of_file = true; 3434} 3435