1/* xgettext PHP backend. 2 Copyright (C) 2001-2003, 2005-2007 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <bruno@clisp.org>, 2002. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23/* Specification. */ 24#include "x-php.h" 25 26#include <errno.h> 27#include <stdbool.h> 28#include <stdio.h> 29#include <stdlib.h> 30 31#include "message.h" 32#include "xgettext.h" 33#include "x-php.h" 34#include "error.h" 35#include "xalloc.h" 36#include "gettext.h" 37 38#define _(s) gettext(s) 39 40#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 41 42 43/* The PHP syntax is defined in phpdoc/manual/langref.html. 44 See also php-4.1.0/Zend/zend_language_scanner.l 45 and php-4.1.0/Zend/zend_language_parser.y. 46 Note that variable and function names can contain bytes in the range 47 0x7f..0xff; see 48 http://www.php.net/manual/en/language.variables.php 49 http://www.php.net/manual/en/language.functions.php */ 50 51 52/* ====================== Keyword set customization. ====================== */ 53 54/* If true extract all strings. */ 55static bool extract_all = false; 56 57static hash_table keywords; 58static bool default_keywords = true; 59 60 61void 62x_php_extract_all () 63{ 64 extract_all = true; 65} 66 67 68void 69x_php_keyword (const char *name) 70{ 71 if (name == NULL) 72 default_keywords = false; 73 else 74 { 75 const char *end; 76 struct callshape shape; 77 const char *colon; 78 79 if (keywords.table == NULL) 80 hash_init (&keywords, 100); 81 82 split_keywordspec (name, &end, &shape); 83 84 /* The characters between name and end should form a valid C identifier. 85 A colon means an invalid parse in split_keywordspec(). */ 86 colon = strchr (name, ':'); 87 if (colon == NULL || colon >= end) 88 insert_keyword_callshape (&keywords, name, end - name, &shape); 89 } 90} 91 92/* Finish initializing the keywords hash table. 93 Called after argument processing, before each file is processed. */ 94static void 95init_keywords () 96{ 97 if (default_keywords) 98 { 99 /* When adding new keywords here, also update the documentation in 100 xgettext.texi! */ 101 x_php_keyword ("_"); 102 x_php_keyword ("gettext"); 103 x_php_keyword ("dgettext:2"); 104 x_php_keyword ("dcgettext:2"); 105 /* The following were added in PHP 4.2.0. */ 106 x_php_keyword ("ngettext:1,2"); 107 x_php_keyword ("dngettext:2,3"); 108 x_php_keyword ("dcngettext:2,3"); 109 default_keywords = false; 110 } 111} 112 113void 114init_flag_table_php () 115{ 116 xgettext_record_flag ("_:1:pass-php-format"); 117 xgettext_record_flag ("gettext:1:pass-php-format"); 118 xgettext_record_flag ("dgettext:2:pass-php-format"); 119 xgettext_record_flag ("dcgettext:2:pass-php-format"); 120 xgettext_record_flag ("ngettext:1:pass-php-format"); 121 xgettext_record_flag ("ngettext:2:pass-php-format"); 122 xgettext_record_flag ("dngettext:2:pass-php-format"); 123 xgettext_record_flag ("dngettext:3:pass-php-format"); 124 xgettext_record_flag ("dcngettext:2:pass-php-format"); 125 xgettext_record_flag ("dcngettext:3:pass-php-format"); 126 xgettext_record_flag ("sprintf:1:php-format"); 127 xgettext_record_flag ("printf:1:php-format"); 128} 129 130 131/* ======================== Reading of characters. ======================== */ 132 133 134/* Real filename, used in error messages about the input file. */ 135static const char *real_file_name; 136 137/* Logical filename and line number, used to label the extracted messages. */ 138static char *logical_file_name; 139static int line_number; 140 141/* The input file stream. */ 142static FILE *fp; 143 144 145/* 1. line_number handling. */ 146 147static unsigned char phase1_pushback[2]; 148static int phase1_pushback_length; 149 150static int 151phase1_getc () 152{ 153 int c; 154 155 if (phase1_pushback_length) 156 c = phase1_pushback[--phase1_pushback_length]; 157 else 158 { 159 c = getc (fp); 160 161 if (c == EOF) 162 { 163 if (ferror (fp)) 164 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 165 real_file_name); 166 return EOF; 167 } 168 } 169 170 if (c == '\n') 171 line_number++; 172 173 return c; 174} 175 176/* Supports 2 characters of pushback. */ 177static void 178phase1_ungetc (int c) 179{ 180 if (c != EOF) 181 { 182 if (c == '\n') 183 --line_number; 184 185 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 186 abort (); 187 phase1_pushback[phase1_pushback_length++] = c; 188 } 189} 190 191 192/* 2. Ignore HTML sections. They are equivalent to PHP echo commands and 193 therefore don't contain translatable strings. */ 194 195static void 196skip_html () 197{ 198 for (;;) 199 { 200 int c = phase1_getc (); 201 202 if (c == EOF) 203 return; 204 205 if (c == '<') 206 { 207 int c2 = phase1_getc (); 208 209 if (c2 == EOF) 210 break; 211 212 if (c2 == '?') 213 { 214 /* <?php is the normal way to enter PHP mode. <? and <?= are 215 recognized by PHP depending on a configuration setting. */ 216 int c3 = phase1_getc (); 217 218 if (c3 != '=') 219 phase1_ungetc (c3); 220 221 return; 222 } 223 224 if (c2 == '%') 225 { 226 /* <% and <%= are recognized by PHP depending on a configuration 227 setting. */ 228 int c3 = phase1_getc (); 229 230 if (c3 != '=') 231 phase1_ungetc (c3); 232 233 return; 234 } 235 236 if (c2 == '<') 237 { 238 phase1_ungetc (c2); 239 continue; 240 } 241 242 /* < script language = php > 243 < script language = "php" > 244 < script language = 'php' > 245 are always recognized. */ 246 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 247 c2 = phase1_getc (); 248 if (c2 != 's' && c2 != 'S') 249 { 250 phase1_ungetc (c2); 251 continue; 252 } 253 c2 = phase1_getc (); 254 if (c2 != 'c' && c2 != 'C') 255 { 256 phase1_ungetc (c2); 257 continue; 258 } 259 c2 = phase1_getc (); 260 if (c2 != 'r' && c2 != 'R') 261 { 262 phase1_ungetc (c2); 263 continue; 264 } 265 c2 = phase1_getc (); 266 if (c2 != 'i' && c2 != 'I') 267 { 268 phase1_ungetc (c2); 269 continue; 270 } 271 c2 = phase1_getc (); 272 if (c2 != 'p' && c2 != 'P') 273 { 274 phase1_ungetc (c2); 275 continue; 276 } 277 c2 = phase1_getc (); 278 if (c2 != 't' && c2 != 'T') 279 { 280 phase1_ungetc (c2); 281 continue; 282 } 283 c2 = phase1_getc (); 284 if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')) 285 { 286 phase1_ungetc (c2); 287 continue; 288 } 289 do 290 c2 = phase1_getc (); 291 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 292 if (c2 != 'l' && c2 != 'L') 293 { 294 phase1_ungetc (c2); 295 continue; 296 } 297 c2 = phase1_getc (); 298 if (c2 != 'a' && c2 != 'A') 299 { 300 phase1_ungetc (c2); 301 continue; 302 } 303 c2 = phase1_getc (); 304 if (c2 != 'n' && c2 != 'N') 305 { 306 phase1_ungetc (c2); 307 continue; 308 } 309 c2 = phase1_getc (); 310 if (c2 != 'g' && c2 != 'G') 311 { 312 phase1_ungetc (c2); 313 continue; 314 } 315 c2 = phase1_getc (); 316 if (c2 != 'u' && c2 != 'U') 317 { 318 phase1_ungetc (c2); 319 continue; 320 } 321 c2 = phase1_getc (); 322 if (c2 != 'a' && c2 != 'A') 323 { 324 phase1_ungetc (c2); 325 continue; 326 } 327 c2 = phase1_getc (); 328 if (c2 != 'g' && c2 != 'G') 329 { 330 phase1_ungetc (c2); 331 continue; 332 } 333 c2 = phase1_getc (); 334 if (c2 != 'e' && c2 != 'E') 335 { 336 phase1_ungetc (c2); 337 continue; 338 } 339 c2 = phase1_getc (); 340 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 341 c2 = phase1_getc (); 342 if (c2 != '=') 343 { 344 phase1_ungetc (c2); 345 continue; 346 } 347 c2 = phase1_getc (); 348 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 349 c2 = phase1_getc (); 350 if (c2 == '"') 351 { 352 c2 = phase1_getc (); 353 if (c2 != 'p') 354 { 355 phase1_ungetc (c2); 356 continue; 357 } 358 c2 = phase1_getc (); 359 if (c2 != 'h') 360 { 361 phase1_ungetc (c2); 362 continue; 363 } 364 c2 = phase1_getc (); 365 if (c2 != 'p') 366 { 367 phase1_ungetc (c2); 368 continue; 369 } 370 c2 = phase1_getc (); 371 if (c2 != '"') 372 { 373 phase1_ungetc (c2); 374 continue; 375 } 376 } 377 else if (c2 == '\'') 378 { 379 c2 = phase1_getc (); 380 if (c2 != 'p') 381 { 382 phase1_ungetc (c2); 383 continue; 384 } 385 c2 = phase1_getc (); 386 if (c2 != 'h') 387 { 388 phase1_ungetc (c2); 389 continue; 390 } 391 c2 = phase1_getc (); 392 if (c2 != 'p') 393 { 394 phase1_ungetc (c2); 395 continue; 396 } 397 c2 = phase1_getc (); 398 if (c2 != '\'') 399 { 400 phase1_ungetc (c2); 401 continue; 402 } 403 } 404 else 405 { 406 if (c2 != 'p') 407 { 408 phase1_ungetc (c2); 409 continue; 410 } 411 c2 = phase1_getc (); 412 if (c2 != 'h') 413 { 414 phase1_ungetc (c2); 415 continue; 416 } 417 c2 = phase1_getc (); 418 if (c2 != 'p') 419 { 420 phase1_ungetc (c2); 421 continue; 422 } 423 } 424 c2 = phase1_getc (); 425 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 426 c2 = phase1_getc (); 427 if (c2 != '>') 428 { 429 phase1_ungetc (c2); 430 continue; 431 } 432 return; 433 } 434 } 435} 436 437#if 0 438 439static unsigned char phase2_pushback[1]; 440static int phase2_pushback_length; 441 442static int 443phase2_getc () 444{ 445 int c; 446 447 if (phase2_pushback_length) 448 return phase2_pushback[--phase2_pushback_length]; 449 450 c = phase1_getc (); 451 switch (c) 452 { 453 case '?': 454 case '%': 455 { 456 int c2 = phase1_getc (); 457 if (c2 == '>') 458 { 459 /* ?> and %> terminate PHP mode and switch back to HTML mode. */ 460 skip_html (); 461 return ' '; 462 } 463 phase1_ungetc (c2); 464 } 465 break; 466 467 case '<': 468 { 469 int c2 = phase1_getc (); 470 471 /* < / script > terminates PHP mode and switches back to HTML mode. */ 472 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 473 c2 = phase1_getc (); 474 if (c2 == '/') 475 { 476 do 477 c2 = phase1_getc (); 478 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 479 if (c2 == 's' || c2 == 'S') 480 { 481 c2 = phase1_getc (); 482 if (c2 == 'c' || c2 == 'C') 483 { 484 c2 = phase1_getc (); 485 if (c2 == 'r' || c2 == 'R') 486 { 487 c2 = phase1_getc (); 488 if (c2 == 'i' || c2 == 'I') 489 { 490 c2 = phase1_getc (); 491 if (c2 == 'p' || c2 == 'P') 492 { 493 c2 = phase1_getc (); 494 if (c2 == 't' || c2 == 'T') 495 { 496 do 497 c2 = phase1_getc (); 498 while (c2 == ' ' || c2 == '\t' 499 || c2 == '\n' || c2 == '\r'); 500 if (c2 == '>') 501 { 502 skip_html (); 503 return ' '; 504 } 505 } 506 } 507 } 508 } 509 } 510 } 511 } 512 phase1_ungetc (c2); 513 } 514 break; 515 } 516 517 return c; 518} 519 520static void 521phase2_ungetc (int c) 522{ 523 if (c != EOF) 524 { 525 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 526 abort (); 527 phase2_pushback[phase2_pushback_length++] = c; 528 } 529} 530 531#endif 532 533 534/* Accumulating comments. */ 535 536static char *buffer; 537static size_t bufmax; 538static size_t buflen; 539 540static inline void 541comment_start () 542{ 543 buflen = 0; 544} 545 546static inline void 547comment_add (int c) 548{ 549 if (buflen >= bufmax) 550 { 551 bufmax = 2 * bufmax + 10; 552 buffer = xrealloc (buffer, bufmax); 553 } 554 buffer[buflen++] = c; 555} 556 557static inline void 558comment_line_end (size_t chars_to_remove) 559{ 560 buflen -= chars_to_remove; 561 while (buflen >= 1 562 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 563 --buflen; 564 if (chars_to_remove == 0 && buflen >= bufmax) 565 { 566 bufmax = 2 * bufmax + 10; 567 buffer = xrealloc (buffer, bufmax); 568 } 569 buffer[buflen] = '\0'; 570 savable_comment_add (buffer); 571} 572 573 574/* 3. Replace each comment that is not inside a string literal with a 575 space character. We need to remember the comment for later, because 576 it may be attached to a keyword string. */ 577 578/* These are for tracking whether comments count as immediately before 579 keyword. */ 580static int last_comment_line; 581static int last_non_comment_line; 582 583static unsigned char phase3_pushback[1]; 584static int phase3_pushback_length; 585 586static int 587phase3_getc () 588{ 589 int lineno; 590 int c; 591 592 if (phase3_pushback_length) 593 return phase3_pushback[--phase3_pushback_length]; 594 595 c = phase1_getc (); 596 597 if (c == '#') 598 { 599 /* sh comment. */ 600 bool last_was_qmark = false; 601 602 comment_start (); 603 lineno = line_number; 604 for (;;) 605 { 606 c = phase1_getc (); 607 if (c == '\n' || c == EOF) 608 { 609 comment_line_end (0); 610 break; 611 } 612 if (last_was_qmark && c == '>') 613 { 614 comment_line_end (1); 615 skip_html (); 616 break; 617 } 618 /* We skip all leading white space, but not EOLs. */ 619 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 620 comment_add (c); 621 last_was_qmark = (c == '?' || c == '%'); 622 } 623 last_comment_line = lineno; 624 return '\n'; 625 } 626 else if (c == '/') 627 { 628 c = phase1_getc (); 629 630 switch (c) 631 { 632 default: 633 phase1_ungetc (c); 634 return '/'; 635 636 case '*': 637 { 638 /* C comment. */ 639 bool last_was_star; 640 641 comment_start (); 642 lineno = line_number; 643 last_was_star = false; 644 for (;;) 645 { 646 c = phase1_getc (); 647 if (c == EOF) 648 break; 649 /* We skip all leading white space, but not EOLs. */ 650 if (buflen == 0 && (c == ' ' || c == '\t')) 651 continue; 652 comment_add (c); 653 switch (c) 654 { 655 case '\n': 656 comment_line_end (1); 657 comment_start (); 658 lineno = line_number; 659 last_was_star = false; 660 continue; 661 662 case '*': 663 last_was_star = true; 664 continue; 665 666 case '/': 667 if (last_was_star) 668 { 669 comment_line_end (2); 670 break; 671 } 672 /* FALLTHROUGH */ 673 674 default: 675 last_was_star = false; 676 continue; 677 } 678 break; 679 } 680 last_comment_line = lineno; 681 return ' '; 682 } 683 684 case '/': 685 { 686 /* C++ comment. */ 687 bool last_was_qmark = false; 688 689 comment_start (); 690 lineno = line_number; 691 for (;;) 692 { 693 c = phase1_getc (); 694 if (c == '\n' || c == EOF) 695 { 696 comment_line_end (0); 697 break; 698 } 699 if (last_was_qmark && c == '>') 700 { 701 comment_line_end (1); 702 skip_html (); 703 break; 704 } 705 /* We skip all leading white space, but not EOLs. */ 706 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 707 comment_add (c); 708 last_was_qmark = (c == '?' || c == '%'); 709 } 710 last_comment_line = lineno; 711 return '\n'; 712 } 713 } 714 } 715 else 716 return c; 717} 718 719#ifdef unused 720static void 721phase3_ungetc (int c) 722{ 723 if (c != EOF) 724 { 725 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 726 abort (); 727 phase3_pushback[phase3_pushback_length++] = c; 728 } 729} 730#endif 731 732 733/* ========================== Reading of tokens. ========================== */ 734 735 736enum token_type_ty 737{ 738 token_type_eof, 739 token_type_lparen, /* ( */ 740 token_type_rparen, /* ) */ 741 token_type_comma, /* , */ 742 token_type_lbracket, /* [ */ 743 token_type_rbracket, /* ] */ 744 token_type_dot, /* . */ 745 token_type_operator1, /* * / % ++ -- */ 746 token_type_operator2, /* + - ! ~ @ */ 747 token_type_string_literal, /* "abc" */ 748 token_type_symbol, /* symbol, number */ 749 token_type_other /* misc. operator */ 750}; 751typedef enum token_type_ty token_type_ty; 752 753typedef struct token_ty token_ty; 754struct token_ty 755{ 756 token_type_ty type; 757 char *string; /* for token_type_string_literal, token_type_symbol */ 758 refcounted_string_list_ty *comment; /* for token_type_string_literal */ 759 int line_number; 760}; 761 762 763/* Free the memory pointed to by a 'struct token_ty'. */ 764static inline void 765free_token (token_ty *tp) 766{ 767 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 768 free (tp->string); 769 if (tp->type == token_type_string_literal) 770 drop_reference (tp->comment); 771} 772 773 774/* 4. Combine characters into tokens. Discard whitespace. */ 775 776static token_ty phase4_pushback[3]; 777static int phase4_pushback_length; 778 779static void 780phase4_get (token_ty *tp) 781{ 782 static char *buffer; 783 static int bufmax; 784 int bufpos; 785 int c; 786 787 if (phase4_pushback_length) 788 { 789 *tp = phase4_pushback[--phase4_pushback_length]; 790 return; 791 } 792 tp->string = NULL; 793 794 for (;;) 795 { 796 tp->line_number = line_number; 797 c = phase3_getc (); 798 switch (c) 799 { 800 case EOF: 801 tp->type = token_type_eof; 802 return; 803 804 case '\n': 805 if (last_non_comment_line > last_comment_line) 806 savable_comment_reset (); 807 /* FALLTHROUGH */ 808 case ' ': 809 case '\t': 810 case '\r': 811 /* Ignore whitespace. */ 812 continue; 813 } 814 815 last_non_comment_line = tp->line_number; 816 817 switch (c) 818 { 819 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 820 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 821 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 822 case 'V': case 'W': case 'X': case 'Y': case 'Z': 823 case '_': 824 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 825 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 826 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 827 case 'v': case 'w': case 'x': case 'y': case 'z': 828 case 127: case 128: case 129: case 130: case 131: case 132: case 133: 829 case 134: case 135: case 136: case 137: case 138: case 139: case 140: 830 case 141: case 142: case 143: case 144: case 145: case 146: case 147: 831 case 148: case 149: case 150: case 151: case 152: case 153: case 154: 832 case 155: case 156: case 157: case 158: case 159: case 160: case 161: 833 case 162: case 163: case 164: case 165: case 166: case 167: case 168: 834 case 169: case 170: case 171: case 172: case 173: case 174: case 175: 835 case 176: case 177: case 178: case 179: case 180: case 181: case 182: 836 case 183: case 184: case 185: case 186: case 187: case 188: case 189: 837 case 190: case 191: case 192: case 193: case 194: case 195: case 196: 838 case 197: case 198: case 199: case 200: case 201: case 202: case 203: 839 case 204: case 205: case 206: case 207: case 208: case 209: case 210: 840 case 211: case 212: case 213: case 214: case 215: case 216: case 217: 841 case 218: case 219: case 220: case 221: case 222: case 223: case 224: 842 case 225: case 226: case 227: case 228: case 229: case 230: case 231: 843 case 232: case 233: case 234: case 235: case 236: case 237: case 238: 844 case 239: case 240: case 241: case 242: case 243: case 244: case 245: 845 case 246: case 247: case 248: case 249: case 250: case 251: case 252: 846 case 253: case 254: case 255: 847 bufpos = 0; 848 for (;;) 849 { 850 if (bufpos >= bufmax) 851 { 852 bufmax = 2 * bufmax + 10; 853 buffer = xrealloc (buffer, bufmax); 854 } 855 buffer[bufpos++] = c; 856 c = phase1_getc (); 857 switch (c) 858 { 859 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 860 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 861 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 862 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 863 case 'Y': case 'Z': 864 case '_': 865 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 866 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 867 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 868 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 869 case 'y': case 'z': 870 case '0': case '1': case '2': case '3': case '4': 871 case '5': case '6': case '7': case '8': case '9': 872 case 127: case 128: case 129: case 130: case 131: case 132: 873 case 133: case 134: case 135: case 136: case 137: case 138: 874 case 139: case 140: case 141: case 142: case 143: case 144: 875 case 145: case 146: case 147: case 148: case 149: case 150: 876 case 151: case 152: case 153: case 154: case 155: case 156: 877 case 157: case 158: case 159: case 160: case 161: case 162: 878 case 163: case 164: case 165: case 166: case 167: case 168: 879 case 169: case 170: case 171: case 172: case 173: case 174: 880 case 175: case 176: case 177: case 178: case 179: case 180: 881 case 181: case 182: case 183: case 184: case 185: case 186: 882 case 187: case 188: case 189: case 190: case 191: case 192: 883 case 193: case 194: case 195: case 196: case 197: case 198: 884 case 199: case 200: case 201: case 202: case 203: case 204: 885 case 205: case 206: case 207: case 208: case 209: case 210: 886 case 211: case 212: case 213: case 214: case 215: case 216: 887 case 217: case 218: case 219: case 220: case 221: case 222: 888 case 223: case 224: case 225: case 226: case 227: case 228: 889 case 229: case 230: case 231: case 232: case 233: case 234: 890 case 235: case 236: case 237: case 238: case 239: case 240: 891 case 241: case 242: case 243: case 244: case 245: case 246: 892 case 247: case 248: case 249: case 250: case 251: case 252: 893 case 253: case 254: case 255: 894 continue; 895 896 default: 897 phase1_ungetc (c); 898 break; 899 } 900 break; 901 } 902 if (bufpos >= bufmax) 903 { 904 bufmax = 2 * bufmax + 10; 905 buffer = xrealloc (buffer, bufmax); 906 } 907 buffer[bufpos] = 0; 908 tp->string = xstrdup (buffer); 909 tp->type = token_type_symbol; 910 return; 911 912 case '\'': 913 /* Single-quoted string literal. */ 914 bufpos = 0; 915 for (;;) 916 { 917 c = phase1_getc (); 918 if (c == EOF || c == '\'') 919 break; 920 if (c == '\\') 921 { 922 c = phase1_getc (); 923 if (c != '\\' && c != '\'') 924 { 925 phase1_ungetc (c); 926 c = '\\'; 927 } 928 } 929 if (bufpos >= bufmax) 930 { 931 bufmax = 2 * bufmax + 10; 932 buffer = xrealloc (buffer, bufmax); 933 } 934 buffer[bufpos++] = c; 935 } 936 if (bufpos >= bufmax) 937 { 938 bufmax = 2 * bufmax + 10; 939 buffer = xrealloc (buffer, bufmax); 940 } 941 buffer[bufpos] = 0; 942 tp->type = token_type_string_literal; 943 tp->string = xstrdup (buffer); 944 tp->comment = add_reference (savable_comment); 945 return; 946 947 case '"': 948 /* Double-quoted string literal. */ 949 tp->type = token_type_string_literal; 950 bufpos = 0; 951 for (;;) 952 { 953 c = phase1_getc (); 954 if (c == EOF || c == '"') 955 break; 956 if (c == '$') 957 { 958 c = phase1_getc (); 959 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 960 || c == '_' || c == '{' || c >= 0x7f) 961 { 962 /* String with variables. */ 963 tp->type = token_type_other; 964 continue; 965 } 966 phase1_ungetc (c); 967 c = '$'; 968 } 969 if (c == '{') 970 { 971 c = phase1_getc (); 972 if (c == '$') 973 { 974 /* String with expressions. */ 975 tp->type = token_type_other; 976 continue; 977 } 978 phase1_ungetc (c); 979 c = '{'; 980 } 981 if (c == '\\') 982 { 983 int n, j; 984 985 c = phase1_getc (); 986 switch (c) 987 { 988 case '"': 989 case '\\': 990 case '$': 991 break; 992 993 case '0': case '1': case '2': case '3': 994 case '4': case '5': case '6': case '7': 995 n = 0; 996 for (j = 0; j < 3; ++j) 997 { 998 n = n * 8 + c - '0'; 999 c = phase1_getc (); 1000 switch (c) 1001 { 1002 default: 1003 break; 1004 1005 case '0': case '1': case '2': case '3': 1006 case '4': case '5': case '6': case '7': 1007 continue; 1008 } 1009 break; 1010 } 1011 phase1_ungetc (c); 1012 c = n; 1013 break; 1014 1015 case 'x': 1016 n = 0; 1017 for (j = 0; j < 2; ++j) 1018 { 1019 c = phase1_getc (); 1020 switch (c) 1021 { 1022 case '0': case '1': case '2': case '3': case '4': 1023 case '5': case '6': case '7': case '8': case '9': 1024 n = n * 16 + c - '0'; 1025 break; 1026 case 'A': case 'B': case 'C': case 'D': case 'E': 1027 case 'F': 1028 n = n * 16 + 10 + c - 'A'; 1029 break; 1030 case 'a': case 'b': case 'c': case 'd': case 'e': 1031 case 'f': 1032 n = n * 16 + 10 + c - 'a'; 1033 break; 1034 default: 1035 phase1_ungetc (c); 1036 c = 0; 1037 break; 1038 } 1039 if (c == 0) 1040 break; 1041 } 1042 if (j == 0) 1043 { 1044 phase1_ungetc ('x'); 1045 c = '\\'; 1046 } 1047 else 1048 c = n; 1049 break; 1050 1051 case 'n': 1052 c = '\n'; 1053 break; 1054 case 't': 1055 c = '\t'; 1056 break; 1057 case 'r': 1058 c = '\r'; 1059 break; 1060 1061 default: 1062 phase1_ungetc (c); 1063 c = '\\'; 1064 break; 1065 } 1066 } 1067 if (bufpos >= bufmax) 1068 { 1069 bufmax = 2 * bufmax + 10; 1070 buffer = xrealloc (buffer, bufmax); 1071 } 1072 buffer[bufpos++] = c; 1073 } 1074 if (bufpos >= bufmax) 1075 { 1076 bufmax = 2 * bufmax + 10; 1077 buffer = xrealloc (buffer, bufmax); 1078 } 1079 buffer[bufpos] = 0; 1080 if (tp->type == token_type_string_literal) 1081 { 1082 tp->string = xstrdup (buffer); 1083 tp->comment = add_reference (savable_comment); 1084 } 1085 return; 1086 1087 case '?': 1088 case '%': 1089 { 1090 int c2 = phase1_getc (); 1091 if (c2 == '>') 1092 { 1093 /* ?> and %> terminate PHP mode and switch back to HTML 1094 mode. */ 1095 skip_html (); 1096 tp->type = token_type_other; 1097 } 1098 else 1099 { 1100 phase1_ungetc (c2); 1101 tp->type = (c == '%' ? token_type_operator1 : token_type_other); 1102 } 1103 return; 1104 } 1105 1106 case '(': 1107 tp->type = token_type_lparen; 1108 return; 1109 1110 case ')': 1111 tp->type = token_type_rparen; 1112 return; 1113 1114 case ',': 1115 tp->type = token_type_comma; 1116 return; 1117 1118 case '[': 1119 tp->type = token_type_lbracket; 1120 return; 1121 1122 case ']': 1123 tp->type = token_type_rbracket; 1124 return; 1125 1126 case '.': 1127 tp->type = token_type_dot; 1128 return; 1129 1130 case '*': 1131 case '/': 1132 tp->type = token_type_operator1; 1133 return; 1134 1135 case '+': 1136 case '-': 1137 { 1138 int c2 = phase1_getc (); 1139 if (c2 == c) 1140 /* ++ or -- */ 1141 tp->type = token_type_operator1; 1142 else 1143 /* + or - */ 1144 { 1145 phase1_ungetc (c2); 1146 tp->type = token_type_operator2; 1147 } 1148 return; 1149 } 1150 1151 case '!': 1152 case '~': 1153 case '@': 1154 tp->type = token_type_operator2; 1155 return; 1156 1157 case '<': 1158 { 1159 int c2 = phase1_getc (); 1160 if (c2 == '<') 1161 { 1162 int c3 = phase1_getc (); 1163 if (c3 == '<') 1164 { 1165 /* Start of here document. 1166 Parse whitespace, then label, then newline. */ 1167 do 1168 c = phase3_getc (); 1169 while (c == ' ' || c == '\t' || c == '\n' || c == '\r'); 1170 1171 bufpos = 0; 1172 do 1173 { 1174 if (bufpos >= bufmax) 1175 { 1176 bufmax = 2 * bufmax + 10; 1177 buffer = xrealloc (buffer, bufmax); 1178 } 1179 buffer[bufpos++] = c; 1180 c = phase3_getc (); 1181 } 1182 while (c != EOF && c != '\n' && c != '\r'); 1183 /* buffer[0..bufpos-1] now contains the label. */ 1184 1185 /* Now skip the here document. */ 1186 for (;;) 1187 { 1188 c = phase1_getc (); 1189 if (c == EOF) 1190 break; 1191 if (c == '\n' || c == '\r') 1192 { 1193 int bufidx = 0; 1194 1195 while (bufidx < bufpos) 1196 { 1197 c = phase1_getc (); 1198 if (c == EOF) 1199 break; 1200 if (c != buffer[bufidx]) 1201 { 1202 phase1_ungetc (c); 1203 break; 1204 } 1205 bufidx++; 1206 } 1207 if (bufidx == bufpos) 1208 { 1209 c = phase1_getc (); 1210 if (c != ';') 1211 phase1_ungetc (c); 1212 c = phase1_getc (); 1213 if (c == '\n' || c == '\r') 1214 break; 1215 } 1216 } 1217 } 1218 1219 /* FIXME: Ideally we should turn the here document into a 1220 string literal if it didn't contain $ substitution. And 1221 we should also respect backslash escape sequences like 1222 in double-quoted strings. */ 1223 tp->type = token_type_other; 1224 return; 1225 } 1226 phase1_ungetc (c3); 1227 } 1228 1229 /* < / script > terminates PHP mode and switches back to HTML 1230 mode. */ 1231 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 1232 c2 = phase1_getc (); 1233 if (c2 == '/') 1234 { 1235 do 1236 c2 = phase1_getc (); 1237 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 1238 if (c2 == 's' || c2 == 'S') 1239 { 1240 c2 = phase1_getc (); 1241 if (c2 == 'c' || c2 == 'C') 1242 { 1243 c2 = phase1_getc (); 1244 if (c2 == 'r' || c2 == 'R') 1245 { 1246 c2 = phase1_getc (); 1247 if (c2 == 'i' || c2 == 'I') 1248 { 1249 c2 = phase1_getc (); 1250 if (c2 == 'p' || c2 == 'P') 1251 { 1252 c2 = phase1_getc (); 1253 if (c2 == 't' || c2 == 'T') 1254 { 1255 do 1256 c2 = phase1_getc (); 1257 while (c2 == ' ' || c2 == '\t' 1258 || c2 == '\n' || c2 == '\r'); 1259 if (c2 == '>') 1260 { 1261 skip_html (); 1262 } 1263 else 1264 phase1_ungetc (c2); 1265 } 1266 else 1267 phase1_ungetc (c2); 1268 } 1269 else 1270 phase1_ungetc (c2); 1271 } 1272 else 1273 phase1_ungetc (c2); 1274 } 1275 else 1276 phase1_ungetc (c2); 1277 } 1278 else 1279 phase1_ungetc (c2); 1280 } 1281 else 1282 phase1_ungetc (c2); 1283 } 1284 else 1285 phase1_ungetc (c2); 1286 1287 tp->type = token_type_other; 1288 return; 1289 } 1290 1291 case '`': 1292 /* Execution operator. */ 1293 default: 1294 /* We could carefully recognize each of the 2 and 3 character 1295 operators, but it is not necessary, as we only need to recognize 1296 gettext invocations. Don't bother. */ 1297 tp->type = token_type_other; 1298 return; 1299 } 1300 } 1301} 1302 1303/* Supports 3 tokens of pushback. */ 1304static void 1305phase4_unget (token_ty *tp) 1306{ 1307 if (tp->type != token_type_eof) 1308 { 1309 if (phase4_pushback_length == SIZEOF (phase4_pushback)) 1310 abort (); 1311 phase4_pushback[phase4_pushback_length++] = *tp; 1312 } 1313} 1314 1315 1316/* 5. Compile-time optimization of string literal concatenation. 1317 Combine "string1" . ... . "stringN" to the concatenated string if 1318 - the token before this expression is none of 1319 '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@' 1320 (because then the first string could be part of an expression with 1321 the same or higher precedence as '.', such as an additive, 1322 multiplicative, negation, preincrement, or cast expression), 1323 - the token after this expression is none of 1324 '*' '/' '%' '++' '--' 1325 (because then the last string could be part of an expression with 1326 higher precedence as '.', such as a multiplicative or postincrement 1327 expression). */ 1328 1329static token_type_ty phase5_last; 1330 1331static void 1332x_php_lex (token_ty *tp) 1333{ 1334 phase4_get (tp); 1335 if (tp->type == token_type_string_literal 1336 && !(phase5_last == token_type_dot 1337 || phase5_last == token_type_operator1 1338 || phase5_last == token_type_operator2 1339 || phase5_last == token_type_rparen)) 1340 { 1341 char *sum = tp->string; 1342 size_t sum_len = strlen (sum); 1343 1344 for (;;) 1345 { 1346 token_ty token2; 1347 1348 phase4_get (&token2); 1349 if (token2.type == token_type_dot) 1350 { 1351 token_ty token3; 1352 1353 phase4_get (&token3); 1354 if (token3.type == token_type_string_literal) 1355 { 1356 token_ty token_after; 1357 1358 phase4_get (&token_after); 1359 if (token_after.type != token_type_operator1) 1360 { 1361 char *addend = token3.string; 1362 size_t addend_len = strlen (addend); 1363 1364 sum = (char *) xrealloc (sum, sum_len + addend_len + 1); 1365 memcpy (sum + sum_len, addend, addend_len + 1); 1366 sum_len += addend_len; 1367 1368 phase4_unget (&token_after); 1369 free_token (&token3); 1370 free_token (&token2); 1371 continue; 1372 } 1373 phase4_unget (&token_after); 1374 } 1375 phase4_unget (&token3); 1376 } 1377 phase4_unget (&token2); 1378 break; 1379 } 1380 tp->string = sum; 1381 } 1382 phase5_last = tp->type; 1383} 1384 1385 1386/* ========================= Extracting strings. ========================== */ 1387 1388 1389/* Context lookup table. */ 1390static flag_context_list_table_ty *flag_context_list_table; 1391 1392 1393/* The file is broken into tokens. Scan the token stream, looking for 1394 a keyword, followed by a left paren, followed by a string. When we 1395 see this sequence, we have something to remember. We assume we are 1396 looking at a valid C or C++ program, and leave the complaints about 1397 the grammar to the compiler. 1398 1399 Normal handling: Look for 1400 keyword ( ... msgid ... ) 1401 Plural handling: Look for 1402 keyword ( ... msgid ... msgid_plural ... ) 1403 1404 We use recursion because the arguments before msgid or between msgid 1405 and msgid_plural can contain subexpressions of the same form. */ 1406 1407 1408/* Extract messages until the next balanced closing parenthesis or bracket. 1409 Extracted messages are added to MLP. 1410 DELIM can be either token_type_rparen or token_type_rbracket, or 1411 token_type_eof to accept both. 1412 Return true upon eof, false upon closing parenthesis. */ 1413static bool 1414extract_balanced (message_list_ty *mlp, 1415 token_type_ty delim, 1416 flag_context_ty outer_context, 1417 flag_context_list_iterator_ty context_iter, 1418 struct arglist_parser *argparser) 1419{ 1420 /* Current argument number. */ 1421 int arg = 1; 1422 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1423 int state; 1424 /* Parameters of the keyword just seen. Defined only in state 1. */ 1425 const struct callshapes *next_shapes = NULL; 1426 /* Context iterator that will be used if the next token is a '('. */ 1427 flag_context_list_iterator_ty next_context_iter = 1428 passthrough_context_list_iterator; 1429 /* Current context. */ 1430 flag_context_ty inner_context = 1431 inherited_context (outer_context, 1432 flag_context_list_iterator_advance (&context_iter)); 1433 1434 /* Start state is 0. */ 1435 state = 0; 1436 1437 for (;;) 1438 { 1439 token_ty token; 1440 1441 x_php_lex (&token); 1442 switch (token.type) 1443 { 1444 case token_type_symbol: 1445 { 1446 void *keyword_value; 1447 1448 if (hash_find_entry (&keywords, token.string, strlen (token.string), 1449 &keyword_value) 1450 == 0) 1451 { 1452 next_shapes = (const struct callshapes *) keyword_value; 1453 state = 1; 1454 } 1455 else 1456 state = 0; 1457 } 1458 next_context_iter = 1459 flag_context_list_iterator ( 1460 flag_context_list_table_lookup ( 1461 flag_context_list_table, 1462 token.string, strlen (token.string))); 1463 free (token.string); 1464 continue; 1465 1466 case token_type_lparen: 1467 if (extract_balanced (mlp, token_type_rparen, 1468 inner_context, next_context_iter, 1469 arglist_parser_alloc (mlp, 1470 state ? next_shapes : NULL))) 1471 { 1472 arglist_parser_done (argparser, arg); 1473 return true; 1474 } 1475 next_context_iter = null_context_list_iterator; 1476 state = 0; 1477 continue; 1478 1479 case token_type_rparen: 1480 if (delim == token_type_rparen || delim == token_type_eof) 1481 { 1482 arglist_parser_done (argparser, arg); 1483 return false; 1484 } 1485 next_context_iter = null_context_list_iterator; 1486 state = 0; 1487 continue; 1488 1489 case token_type_comma: 1490 arg++; 1491 inner_context = 1492 inherited_context (outer_context, 1493 flag_context_list_iterator_advance ( 1494 &context_iter)); 1495 next_context_iter = passthrough_context_list_iterator; 1496 state = 0; 1497 continue; 1498 1499 case token_type_lbracket: 1500 if (extract_balanced (mlp, token_type_rbracket, 1501 null_context, null_context_list_iterator, 1502 arglist_parser_alloc (mlp, NULL))) 1503 { 1504 arglist_parser_done (argparser, arg); 1505 return true; 1506 } 1507 1508 case token_type_rbracket: 1509 if (delim == token_type_rbracket || delim == token_type_eof) 1510 { 1511 arglist_parser_done (argparser, arg); 1512 return false; 1513 } 1514 next_context_iter = null_context_list_iterator; 1515 state = 0; 1516 continue; 1517 1518 case token_type_string_literal: 1519 { 1520 lex_pos_ty pos; 1521 pos.file_name = logical_file_name; 1522 pos.line_number = token.line_number; 1523 1524 if (extract_all) 1525 remember_a_message (mlp, NULL, token.string, inner_context, 1526 &pos, token.comment); 1527 else 1528 arglist_parser_remember (argparser, arg, token.string, 1529 inner_context, 1530 pos.file_name, pos.line_number, 1531 token.comment); 1532 drop_reference (token.comment); 1533 } 1534 next_context_iter = null_context_list_iterator; 1535 state = 0; 1536 continue; 1537 1538 case token_type_dot: 1539 case token_type_operator1: 1540 case token_type_operator2: 1541 case token_type_other: 1542 next_context_iter = null_context_list_iterator; 1543 state = 0; 1544 continue; 1545 1546 case token_type_eof: 1547 arglist_parser_done (argparser, arg); 1548 return true; 1549 1550 default: 1551 abort (); 1552 } 1553 } 1554} 1555 1556 1557void 1558extract_php (FILE *f, 1559 const char *real_filename, const char *logical_filename, 1560 flag_context_list_table_ty *flag_table, 1561 msgdomain_list_ty *mdlp) 1562{ 1563 message_list_ty *mlp = mdlp->item[0]->messages; 1564 1565 fp = f; 1566 real_file_name = real_filename; 1567 logical_file_name = xstrdup (logical_filename); 1568 line_number = 1; 1569 1570 last_comment_line = -1; 1571 last_non_comment_line = -1; 1572 1573 phase5_last = token_type_eof; 1574 1575 flag_context_list_table = flag_table; 1576 1577 init_keywords (); 1578 1579 /* Initial mode is HTML mode, not PHP mode. */ 1580 skip_html (); 1581 1582 /* Eat tokens until eof is seen. When extract_balanced returns 1583 due to an unbalanced closing parenthesis, just restart it. */ 1584 while (!extract_balanced (mlp, token_type_eof, 1585 null_context, null_context_list_iterator, 1586 arglist_parser_alloc (mlp, NULL))) 1587 ; 1588 1589 /* Close scanner. */ 1590 fp = NULL; 1591 real_file_name = NULL; 1592 logical_file_name = NULL; 1593 line_number = 0; 1594} 1595