1/* xgettext C/C++/ObjectiveC backend. 2 Copyright (C) 1995-1998, 2000-2007 Free Software Foundation, Inc. 3 4 This file was written by Peter Miller <millerp@canb.auug.org.au> 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23/* Specification. */ 24#include "x-c.h" 25 26#include <errno.h> 27#include <stdbool.h> 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31 32#include "message.h" 33#include "xgettext.h" 34#include "x-c.h" 35#include "error.h" 36#include "error-progname.h" 37#include "xalloc.h" 38#include "xvasprintf.h" 39#include "hash.h" 40#include "gettext.h" 41 42#define _(s) gettext(s) 43 44#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 45 46 47/* The ANSI C standard defines several phases of translation: 48 49 1. Terminate line by \n, regardless of the external representation 50 of a text line. Stdio does this for us. 51 52 2. Convert trigraphs to their single character equivalents. 53 54 3. Concatenate each line ending in backslash (\) with the following 55 line. 56 57 4. Replace each comment with a space character. 58 59 5. Parse each resulting logical line as preprocessing tokens a 60 white space. 61 62 6. Recognize and carry out directives (it also expands macros on 63 non-directive lines, which we do not do here). 64 65 7. Replaces escape sequences within character strings with their 66 single character equivalents (we do this in step 5, because we 67 don't have to worry about the #include argument). 68 69 8. Concatenates adjacent string literals to form single string 70 literals (because we don't expand macros, there are a few things 71 we will miss). 72 73 9. Converts the remaining preprocessing tokens to C tokens and 74 discards any white space from the translation unit. 75 76 This lexer implements the above, and presents the scanner (in 77 xgettext.c) with a stream of C tokens. The comments are 78 accumulated in a buffer, and given to xgettext when asked for. */ 79 80 81/* ========================= Lexer customization. ========================= */ 82 83static bool trigraphs = false; 84 85void 86x_c_trigraphs () 87{ 88 trigraphs = true; 89} 90 91 92/* ====================== Keyword set customization. ====================== */ 93 94/* If true extract all strings. */ 95static bool extract_all = false; 96 97static hash_table c_keywords; 98static hash_table objc_keywords; 99static bool default_keywords = true; 100 101 102void 103x_c_extract_all () 104{ 105 extract_all = true; 106} 107 108 109static void 110add_keyword (const char *name, hash_table *keywords) 111{ 112 if (name == NULL) 113 default_keywords = false; 114 else 115 { 116 const char *end; 117 struct callshape shape; 118 const char *colon; 119 120 if (keywords->table == NULL) 121 hash_init (keywords, 100); 122 123 split_keywordspec (name, &end, &shape); 124 125 /* The characters between name and end should form a valid C identifier. 126 A colon means an invalid parse in split_keywordspec(). */ 127 colon = strchr (name, ':'); 128 if (colon == NULL || colon >= end) 129 insert_keyword_callshape (keywords, name, end - name, &shape); 130 } 131} 132 133void 134x_c_keyword (const char *name) 135{ 136 add_keyword (name, &c_keywords); 137} 138 139void 140x_objc_keyword (const char *name) 141{ 142 add_keyword (name, &objc_keywords); 143} 144 145/* Finish initializing the keywords hash tables. 146 Called after argument processing, before each file is processed. */ 147static void 148init_keywords () 149{ 150 if (default_keywords) 151 { 152 /* When adding new keywords here, also update the documentation in 153 xgettext.texi! */ 154 x_c_keyword ("gettext"); 155 x_c_keyword ("dgettext:2"); 156 x_c_keyword ("dcgettext:2"); 157 x_c_keyword ("ngettext:1,2"); 158 x_c_keyword ("dngettext:2,3"); 159 x_c_keyword ("dcngettext:2,3"); 160 x_c_keyword ("gettext_noop"); 161 x_c_keyword ("pgettext:1c,2"); 162 x_c_keyword ("dpgettext:2c,3"); 163 x_c_keyword ("dcpgettext:2c,3"); 164 x_c_keyword ("npgettext:1c,2,3"); 165 x_c_keyword ("dnpgettext:2c,3,4"); 166 x_c_keyword ("dcnpgettext:2c,3,4"); 167 168 x_objc_keyword ("gettext"); 169 x_objc_keyword ("dgettext:2"); 170 x_objc_keyword ("dcgettext:2"); 171 x_objc_keyword ("ngettext:1,2"); 172 x_objc_keyword ("dngettext:2,3"); 173 x_objc_keyword ("dcngettext:2,3"); 174 x_objc_keyword ("gettext_noop"); 175 x_objc_keyword ("pgettext:1c,2"); 176 x_objc_keyword ("dpgettext:2c,3"); 177 x_objc_keyword ("dcpgettext:2c,3"); 178 x_objc_keyword ("npgettext:1c,2,3"); 179 x_objc_keyword ("dnpgettext:2c,3,4"); 180 x_objc_keyword ("dcnpgettext:2c,3,4"); 181 x_objc_keyword ("NSLocalizedString"); /* similar to gettext */ 182 x_objc_keyword ("_"); /* similar to gettext */ 183 x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */ 184 x_objc_keyword ("__"); /* similar to gettext_noop */ 185 186 default_keywords = false; 187 } 188} 189 190void 191init_flag_table_c () 192{ 193 xgettext_record_flag ("gettext:1:pass-c-format"); 194 xgettext_record_flag ("dgettext:2:pass-c-format"); 195 xgettext_record_flag ("dcgettext:2:pass-c-format"); 196 xgettext_record_flag ("ngettext:1:pass-c-format"); 197 xgettext_record_flag ("ngettext:2:pass-c-format"); 198 xgettext_record_flag ("dngettext:2:pass-c-format"); 199 xgettext_record_flag ("dngettext:3:pass-c-format"); 200 xgettext_record_flag ("dcngettext:2:pass-c-format"); 201 xgettext_record_flag ("dcngettext:3:pass-c-format"); 202 xgettext_record_flag ("gettext_noop:1:pass-c-format"); 203 xgettext_record_flag ("pgettext:2:pass-c-format"); 204 xgettext_record_flag ("dpgettext:3:pass-c-format"); 205 xgettext_record_flag ("dcpgettext:3:pass-c-format"); 206 xgettext_record_flag ("npgettext:2:pass-c-format"); 207 xgettext_record_flag ("npgettext:3:pass-c-format"); 208 xgettext_record_flag ("dnpgettext:3:pass-c-format"); 209 xgettext_record_flag ("dnpgettext:4:pass-c-format"); 210 xgettext_record_flag ("dcnpgettext:3:pass-c-format"); 211 xgettext_record_flag ("dcnpgettext:4:pass-c-format"); 212 213 /* <stdio.h> */ 214 xgettext_record_flag ("fprintf:2:c-format"); 215 xgettext_record_flag ("vfprintf:2:c-format"); 216 xgettext_record_flag ("printf:1:c-format"); 217 xgettext_record_flag ("vprintf:1:c-format"); 218 xgettext_record_flag ("sprintf:2:c-format"); 219 xgettext_record_flag ("vsprintf:2:c-format"); 220 xgettext_record_flag ("snprintf:3:c-format"); 221 xgettext_record_flag ("vsnprintf:3:c-format"); 222#if 0 /* These functions are not standard. */ 223 /* <stdio.h> */ 224 xgettext_record_flag ("asprintf:2:c-format"); 225 xgettext_record_flag ("vasprintf:2:c-format"); 226 xgettext_record_flag ("dprintf:2:c-format"); 227 xgettext_record_flag ("vdprintf:2:c-format"); 228 xgettext_record_flag ("obstack_printf:2:c-format"); 229 xgettext_record_flag ("obstack_vprintf:2:c-format"); 230 /* <error.h> */ 231 xgettext_record_flag ("error:3:c-format"); 232 xgettext_record_flag ("error_at_line:5:c-format"); 233 /* <argp.h> */ 234 xgettext_record_flag ("argp_error:2:c-format"); 235 xgettext_record_flag ("argp_failure:2:c-format"); 236#endif 237 238 xgettext_record_flag ("gettext:1:pass-qt-format"); 239 xgettext_record_flag ("dgettext:2:pass-qt-format"); 240 xgettext_record_flag ("dcgettext:2:pass-qt-format"); 241 xgettext_record_flag ("ngettext:1:pass-qt-format"); 242 xgettext_record_flag ("ngettext:2:pass-qt-format"); 243 xgettext_record_flag ("dngettext:2:pass-qt-format"); 244 xgettext_record_flag ("dngettext:3:pass-qt-format"); 245 xgettext_record_flag ("dcngettext:2:pass-qt-format"); 246 xgettext_record_flag ("dcngettext:3:pass-qt-format"); 247 xgettext_record_flag ("gettext_noop:1:pass-qt-format"); 248 xgettext_record_flag ("pgettext:2:pass-qt-format"); 249 xgettext_record_flag ("dpgettext:3:pass-qt-format"); 250 xgettext_record_flag ("dcpgettext:3:pass-qt-format"); 251 xgettext_record_flag ("npgettext:2:pass-qt-format"); 252 xgettext_record_flag ("npgettext:3:pass-qt-format"); 253 xgettext_record_flag ("dnpgettext:3:pass-qt-format"); 254 xgettext_record_flag ("dnpgettext:4:pass-qt-format"); 255 xgettext_record_flag ("dcnpgettext:3:pass-qt-format"); 256 xgettext_record_flag ("dcnpgettext:4:pass-qt-format"); 257 258 xgettext_record_flag ("gettext:1:pass-kde-format"); 259 xgettext_record_flag ("dgettext:2:pass-kde-format"); 260 xgettext_record_flag ("dcgettext:2:pass-kde-format"); 261 xgettext_record_flag ("ngettext:1:pass-kde-format"); 262 xgettext_record_flag ("ngettext:2:pass-kde-format"); 263 xgettext_record_flag ("dngettext:2:pass-kde-format"); 264 xgettext_record_flag ("dngettext:3:pass-kde-format"); 265 xgettext_record_flag ("dcngettext:2:pass-kde-format"); 266 xgettext_record_flag ("dcngettext:3:pass-kde-format"); 267 xgettext_record_flag ("gettext_noop:1:pass-kde-format"); 268 xgettext_record_flag ("pgettext:2:pass-kde-format"); 269 xgettext_record_flag ("dpgettext:3:pass-kde-format"); 270 xgettext_record_flag ("dcpgettext:3:pass-kde-format"); 271 xgettext_record_flag ("npgettext:2:pass-kde-format"); 272 xgettext_record_flag ("npgettext:3:pass-kde-format"); 273 xgettext_record_flag ("dnpgettext:3:pass-kde-format"); 274 xgettext_record_flag ("dnpgettext:4:pass-kde-format"); 275 xgettext_record_flag ("dcnpgettext:3:pass-kde-format"); 276 xgettext_record_flag ("dcnpgettext:4:pass-kde-format"); 277 278 xgettext_record_flag ("gettext:1:pass-boost-format"); 279 xgettext_record_flag ("dgettext:2:pass-boost-format"); 280 xgettext_record_flag ("dcgettext:2:pass-boost-format"); 281 xgettext_record_flag ("ngettext:1:pass-boost-format"); 282 xgettext_record_flag ("ngettext:2:pass-boost-format"); 283 xgettext_record_flag ("dngettext:2:pass-boost-format"); 284 xgettext_record_flag ("dngettext:3:pass-boost-format"); 285 xgettext_record_flag ("dcngettext:2:pass-boost-format"); 286 xgettext_record_flag ("dcngettext:3:pass-boost-format"); 287 xgettext_record_flag ("gettext_noop:1:pass-boost-format"); 288 xgettext_record_flag ("pgettext:2:pass-boost-format"); 289 xgettext_record_flag ("dpgettext:3:pass-boost-format"); 290 xgettext_record_flag ("dcpgettext:3:pass-boost-format"); 291 xgettext_record_flag ("npgettext:2:pass-boost-format"); 292 xgettext_record_flag ("npgettext:3:pass-boost-format"); 293 xgettext_record_flag ("dnpgettext:3:pass-boost-format"); 294 xgettext_record_flag ("dnpgettext:4:pass-boost-format"); 295 xgettext_record_flag ("dcnpgettext:3:pass-boost-format"); 296 xgettext_record_flag ("dcnpgettext:4:pass-boost-format"); 297 298 /* <boost/format.hpp> */ 299 xgettext_record_flag ("format:1:boost-format"); 300} 301 302void 303init_flag_table_objc () 304{ 305 /* Since the settings done in init_flag_table_c() also have an effect for 306 the ObjectiveC parser, we don't have to repeat them here. */ 307 xgettext_record_flag ("gettext:1:pass-objc-format"); 308 xgettext_record_flag ("dgettext:2:pass-objc-format"); 309 xgettext_record_flag ("dcgettext:2:pass-objc-format"); 310 xgettext_record_flag ("ngettext:1:pass-objc-format"); 311 xgettext_record_flag ("ngettext:2:pass-objc-format"); 312 xgettext_record_flag ("dngettext:2:pass-objc-format"); 313 xgettext_record_flag ("dngettext:3:pass-objc-format"); 314 xgettext_record_flag ("dcngettext:2:pass-objc-format"); 315 xgettext_record_flag ("dcngettext:3:pass-objc-format"); 316 xgettext_record_flag ("gettext_noop:1:pass-objc-format"); 317 xgettext_record_flag ("pgettext:2:pass-objc-format"); 318 xgettext_record_flag ("dpgettext:3:pass-objc-format"); 319 xgettext_record_flag ("dcpgettext:3:pass-objc-format"); 320 xgettext_record_flag ("npgettext:2:pass-objc-format"); 321 xgettext_record_flag ("npgettext:3:pass-objc-format"); 322 xgettext_record_flag ("dnpgettext:3:pass-objc-format"); 323 xgettext_record_flag ("dnpgettext:4:pass-objc-format"); 324 xgettext_record_flag ("dcnpgettext:3:pass-objc-format"); 325 xgettext_record_flag ("dcnpgettext:4:pass-objc-format"); 326 xgettext_record_flag ("NSLocalizedString:1:pass-c-format"); 327 xgettext_record_flag ("NSLocalizedString:1:pass-objc-format"); 328 xgettext_record_flag ("_:1:pass-c-format"); 329 xgettext_record_flag ("_:1:pass-objc-format"); 330 xgettext_record_flag ("stringWithFormat::1:objc-format"); 331 xgettext_record_flag ("initWithFormat::1:objc-format"); 332 xgettext_record_flag ("stringByAppendingFormat::1:objc-format"); 333 xgettext_record_flag ("localizedStringWithFormat::1:objc-format"); 334 xgettext_record_flag ("appendFormat::1:objc-format"); 335} 336 337void 338init_flag_table_gcc_internal () 339{ 340 xgettext_record_flag ("gettext:1:pass-gcc-internal-format"); 341 xgettext_record_flag ("dgettext:2:pass-gcc-internal-format"); 342 xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format"); 343 xgettext_record_flag ("ngettext:1:pass-gcc-internal-format"); 344 xgettext_record_flag ("ngettext:2:pass-gcc-internal-format"); 345 xgettext_record_flag ("dngettext:2:pass-gcc-internal-format"); 346 xgettext_record_flag ("dngettext:3:pass-gcc-internal-format"); 347 xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format"); 348 xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format"); 349 xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format"); 350 xgettext_record_flag ("pgettext:2:pass-gcc-internal-format"); 351 xgettext_record_flag ("dpgettext:3:pass-gcc-internal-format"); 352 xgettext_record_flag ("dcpgettext:3:pass-gcc-internal-format"); 353 xgettext_record_flag ("npgettext:2:pass-gcc-internal-format"); 354 xgettext_record_flag ("npgettext:3:pass-gcc-internal-format"); 355 xgettext_record_flag ("dnpgettext:3:pass-gcc-internal-format"); 356 xgettext_record_flag ("dnpgettext:4:pass-gcc-internal-format"); 357 xgettext_record_flag ("dcnpgettext:3:pass-gcc-internal-format"); 358 xgettext_record_flag ("dcnpgettext:4:pass-gcc-internal-format"); 359#if 0 /* This should better be done inside GCC. */ 360 /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */ 361 /* c-format.c */ 362 xgettext_record_flag ("status_warning:2:gcc-internal-format"); 363 /* c-tree.h */ 364 xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format"); 365 /* collect2.h */ 366 //xgettext_record_flag ("error:1:c-format"); // 3 different versions 367 xgettext_record_flag ("notice:1:c-format"); 368 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions 369 xgettext_record_flag ("fatal_perror:1:c-format"); 370 /* cpplib.h */ 371 xgettext_record_flag ("cpp_error:3:c-format"); 372 xgettext_record_flag ("cpp_error_with_line:5:c-format"); 373 /* diagnostic.h */ 374 xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format"); 375 xgettext_record_flag ("output_printf:2:gcc-internal-format"); 376 xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format"); 377 xgettext_record_flag ("verbatim:1:gcc-internal-format"); 378 xgettext_record_flag ("inform:1:pass-gcc-internal-format"); 379 /* gcc.h */ 380 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions 381 //xgettext_record_flag ("error:1:c-format"); // 3 different versions 382 /* genattrtab.h */ 383 xgettext_record_flag ("attr_printf:2:pass-c-format"); 384 /* gengtype.h */ 385 xgettext_record_flag ("error_at_line:2:pass-c-format"); 386 xgettext_record_flag ("xvasprintf:2:pass-c-format"); 387 xgettext_record_flag ("xasprintf:1:pass-c-format"); 388 xgettext_record_flag ("oprintf:2:pass-c-format"); 389 /* gensupport.h */ 390 xgettext_record_flag ("message_with_line:2:pass-c-format"); 391 /* output.h */ 392 xgettext_record_flag ("output_operand_lossage:1:c-format"); 393 /* ra.h */ 394 xgettext_record_flag ("ra_debug_msg:2:pass-c-format"); 395 /* toplev.h */ 396 xgettext_record_flag ("fnotice:2:c-format"); 397 xgettext_record_flag ("fatal_io_error:2:gcc-internal-format"); 398 xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format"); 399 xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format"); 400 xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format"); 401 xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format"); 402 xgettext_record_flag ("pedwarn:1:gcc-internal-format"); 403 xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format"); 404 xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format"); 405 xgettext_record_flag ("sorry:1:gcc-internal-format"); 406 xgettext_record_flag ("error:1:pass-gcc-internal-format"); 407 xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format"); 408 xgettext_record_flag ("internal_error:1:pass-gcc-internal-format"); 409 xgettext_record_flag ("warning:1:pass-gcc-internal-format"); 410 xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format"); 411 xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format"); 412 /* f/com.h */ 413 xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format"); 414 /* f/sts.h */ 415 xgettext_record_flag ("ffests_printf:2:pass-c-format"); 416 /* java/java-tree.h */ 417 xgettext_record_flag ("parse_error_context:2:pass-c-format"); 418#endif 419} 420 421 422/* ======================== Reading of characters. ======================== */ 423 424/* Real filename, used in error messages about the input file. */ 425static const char *real_file_name; 426 427/* Logical filename and line number, used to label the extracted messages. */ 428static char *logical_file_name; 429static int line_number; 430 431/* The input file stream. */ 432static FILE *fp; 433 434 435/* 0. Terminate line by \n, regardless whether the external representation of 436 a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows). 437 It is debatable whether supporting CR/LF line terminators in C sources 438 on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it 439 unconditionally, it must be OK. 440 The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n 441 automatically, but here we also need this conversion on Unix. As a side 442 effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this 443 is not a problem. */ 444 445 446static int 447phase0_getc () 448{ 449 int c; 450 451 c = getc (fp); 452 if (c == EOF) 453 { 454 if (ferror (fp)) 455 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 456 real_file_name); 457 return EOF; 458 } 459 460 if (c == '\r') 461 { 462 int c1 = getc (fp); 463 464 if (c1 != EOF && c1 != '\n') 465 ungetc (c1, fp); 466 467 /* Seen line terminator CR or CR/LF. */ 468 return '\n'; 469 } 470 471 return c; 472} 473 474 475/* Supports only one pushback character, and not '\n'. */ 476static inline void 477phase0_ungetc (int c) 478{ 479 if (c != EOF) 480 ungetc (c, fp); 481} 482 483 484/* 1. line_number handling. Combine backslash-newline to nothing. */ 485 486static unsigned char phase1_pushback[2]; 487static int phase1_pushback_length; 488 489 490static int 491phase1_getc () 492{ 493 int c; 494 495 if (phase1_pushback_length) 496 { 497 c = phase1_pushback[--phase1_pushback_length]; 498 if (c == '\n') 499 ++line_number; 500 return c; 501 } 502 for (;;) 503 { 504 c = phase0_getc (); 505 switch (c) 506 { 507 case '\n': 508 ++line_number; 509 return '\n'; 510 511 case '\\': 512 c = phase0_getc (); 513 if (c != '\n') 514 { 515 phase0_ungetc (c); 516 return '\\'; 517 } 518 ++line_number; 519 break; 520 521 default: 522 return c; 523 } 524 } 525} 526 527 528/* Supports 2 characters of pushback. */ 529static void 530phase1_ungetc (int c) 531{ 532 switch (c) 533 { 534 case EOF: 535 break; 536 537 case '\n': 538 --line_number; 539 /* FALLTHROUGH */ 540 541 default: 542 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 543 abort (); 544 phase1_pushback[phase1_pushback_length++] = c; 545 break; 546 } 547} 548 549 550/* 2. Convert trigraphs to their single character equivalents. Most 551 sane human beings vomit copiously at the mention of trigraphs, which 552 is why they are an option. */ 553 554static unsigned char phase2_pushback[1]; 555static int phase2_pushback_length; 556 557 558static int 559phase2_getc () 560{ 561 int c; 562 563 if (phase2_pushback_length) 564 return phase2_pushback[--phase2_pushback_length]; 565 if (!trigraphs) 566 return phase1_getc (); 567 568 c = phase1_getc (); 569 if (c != '?') 570 return c; 571 c = phase1_getc (); 572 if (c != '?') 573 { 574 phase1_ungetc (c); 575 return '?'; 576 } 577 c = phase1_getc (); 578 switch (c) 579 { 580 case '(': 581 return '['; 582 case '/': 583 return '\\'; 584 case ')': 585 return ']'; 586 case '\'': 587 return '^'; 588 case '<': 589 return '{'; 590 case '!': 591 return '|'; 592 case '>': 593 return '}'; 594 case '-': 595 return '~'; 596 case '#': 597 return '='; 598 } 599 phase1_ungetc (c); 600 phase1_ungetc ('?'); 601 return '?'; 602} 603 604 605/* Supports only one pushback character. */ 606static void 607phase2_ungetc (int c) 608{ 609 if (c != EOF) 610 { 611 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 612 abort (); 613 phase2_pushback[phase2_pushback_length++] = c; 614 } 615} 616 617 618/* 3. Concatenate each line ending in backslash (\) with the following 619 line. Basically, all you need to do is elide "\\\n" sequences from 620 the input. */ 621 622static unsigned char phase3_pushback[2]; 623static int phase3_pushback_length; 624 625 626static int 627phase3_getc () 628{ 629 if (phase3_pushback_length) 630 return phase3_pushback[--phase3_pushback_length]; 631 for (;;) 632 { 633 int c = phase2_getc (); 634 if (c != '\\') 635 return c; 636 c = phase2_getc (); 637 if (c != '\n') 638 { 639 phase2_ungetc (c); 640 return '\\'; 641 } 642 } 643} 644 645 646/* Supports 2 characters of pushback. */ 647static void 648phase3_ungetc (int c) 649{ 650 if (c != EOF) 651 { 652 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 653 abort (); 654 phase3_pushback[phase3_pushback_length++] = c; 655 } 656} 657 658 659/* Accumulating comments. */ 660 661static char *buffer; 662static size_t bufmax; 663static size_t buflen; 664 665static inline void 666comment_start () 667{ 668 buflen = 0; 669} 670 671static inline void 672comment_add (int c) 673{ 674 if (buflen >= bufmax) 675 { 676 bufmax = 2 * bufmax + 10; 677 buffer = xrealloc (buffer, bufmax); 678 } 679 buffer[buflen++] = c; 680} 681 682static inline void 683comment_line_end (size_t chars_to_remove) 684{ 685 buflen -= chars_to_remove; 686 while (buflen >= 1 687 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 688 --buflen; 689 if (chars_to_remove == 0 && buflen >= bufmax) 690 { 691 bufmax = 2 * bufmax + 10; 692 buffer = xrealloc (buffer, bufmax); 693 } 694 buffer[buflen] = '\0'; 695 savable_comment_add (buffer); 696} 697 698 699/* These are for tracking whether comments count as immediately before 700 keyword. */ 701static int last_comment_line; 702static int last_non_comment_line; 703static int newline_count; 704 705 706/* 4. Replace each comment that is not inside a character constant or 707 string literal with a space character. We need to remember the 708 comment for later, because it may be attached to a keyword string. 709 We also optionally understand C++ comments. */ 710 711static int 712phase4_getc () 713{ 714 int c; 715 bool last_was_star; 716 717 c = phase3_getc (); 718 if (c != '/') 719 return c; 720 c = phase3_getc (); 721 switch (c) 722 { 723 default: 724 phase3_ungetc (c); 725 return '/'; 726 727 case '*': 728 /* C comment. */ 729 comment_start (); 730 last_was_star = false; 731 for (;;) 732 { 733 c = phase3_getc (); 734 if (c == EOF) 735 break; 736 /* We skip all leading white space, but not EOLs. */ 737 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 738 comment_add (c); 739 switch (c) 740 { 741 case '\n': 742 comment_line_end (1); 743 comment_start (); 744 last_was_star = false; 745 continue; 746 747 case '*': 748 last_was_star = true; 749 continue; 750 751 case '/': 752 if (last_was_star) 753 { 754 comment_line_end (2); 755 break; 756 } 757 /* FALLTHROUGH */ 758 759 default: 760 last_was_star = false; 761 continue; 762 } 763 break; 764 } 765 last_comment_line = newline_count; 766 return ' '; 767 768 case '/': 769 /* C++ or ISO C 99 comment. */ 770 comment_start (); 771 for (;;) 772 { 773 c = phase3_getc (); 774 if (c == '\n' || c == EOF) 775 break; 776 /* We skip all leading white space, but not EOLs. */ 777 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 778 comment_add (c); 779 } 780 comment_line_end (0); 781 last_comment_line = newline_count; 782 return '\n'; 783 } 784} 785 786 787/* Supports only one pushback character. */ 788static void 789phase4_ungetc (int c) 790{ 791 phase3_ungetc (c); 792} 793 794 795/* ========================== Reading of tokens. ========================== */ 796 797 798/* True if ObjectiveC extensions are recognized. */ 799static bool objc_extensions; 800 801enum token_type_ty 802{ 803 token_type_character_constant, /* 'x' */ 804 token_type_eof, 805 token_type_eoln, 806 token_type_hash, /* # */ 807 token_type_lparen, /* ( */ 808 token_type_rparen, /* ) */ 809 token_type_comma, /* , */ 810 token_type_colon, /* : */ 811 token_type_name, /* abc */ 812 token_type_number, /* 2.7 */ 813 token_type_string_literal, /* "abc" */ 814 token_type_symbol, /* < > = etc. */ 815 token_type_objc_special, /* @ */ 816 token_type_white_space 817}; 818typedef enum token_type_ty token_type_ty; 819 820typedef struct token_ty token_ty; 821struct token_ty 822{ 823 token_type_ty type; 824 char *string; /* for token_type_name, token_type_string_literal */ 825 refcounted_string_list_ty *comment; /* for token_type_string_literal, 826 token_type_objc_special */ 827 long number; 828 int line_number; 829}; 830 831 832/* 7. Replace escape sequences within character strings with their 833 single character equivalents. This is called from phase 5, because 834 we don't have to worry about the #include argument. There are 835 pathological cases which could bite us (like the DOS directory 836 separator), but just pretend it can't happen. */ 837 838#define P7_QUOTES (1000 + '"') 839#define P7_QUOTE (1000 + '\'') 840#define P7_NEWLINE (1000 + '\n') 841 842static int 843phase7_getc () 844{ 845 int c, n, j; 846 847 /* Use phase 3, because phase 4 elides comments. */ 848 c = phase3_getc (); 849 850 /* Return a magic newline indicator, so that we can distinguish 851 between the user requesting a newline in the string (e.g. using 852 "\n" or "\012") from the user failing to terminate the string or 853 character constant. The ANSI C standard says: 3.1.3.4 Character 854 Constants contain ``any character except single quote, backslash or 855 newline; or an escape sequence'' and 3.1.4 String Literals contain 856 ``any character except double quote, backslash or newline; or an 857 escape sequence''. 858 859 Most compilers give a fatal error in this case, however gcc is 860 stupidly silent, even though this is a very common typo. OK, so 861 gcc --pedantic will tell me, but that gripes about too much other 862 stuff. Could I have a ``gcc -Wnewline-in-string'' option, or 863 better yet a ``gcc -fno-newline-in-string'' option, please? Gcc is 864 also inconsistent between string literals and character constants: 865 you may not embed newlines in character constants; try it, you get 866 a useful diagnostic. --PMiller */ 867 if (c == '\n') 868 return P7_NEWLINE; 869 870 if (c == '"') 871 return P7_QUOTES; 872 if (c == '\'') 873 return P7_QUOTE; 874 if (c != '\\') 875 return c; 876 c = phase3_getc (); 877 switch (c) 878 { 879 default: 880 /* Unknown escape sequences really should be an error, but just 881 ignore them, and let the real compiler complain. */ 882 phase3_ungetc (c); 883 return '\\'; 884 885 case '"': 886 case '\'': 887 case '?': 888 case '\\': 889 return c; 890 891 case 'a': 892 return '\a'; 893 case 'b': 894 return '\b'; 895 896 /* The \e escape is preculiar to gcc, and assumes an ASCII 897 character set (or superset). We don't provide support for it 898 here. */ 899 900 case 'f': 901 return '\f'; 902 case 'n': 903 return '\n'; 904 case 'r': 905 return '\r'; 906 case 't': 907 return '\t'; 908 case 'v': 909 return '\v'; 910 911 case 'x': 912 c = phase3_getc (); 913 switch (c) 914 { 915 default: 916 phase3_ungetc (c); 917 phase3_ungetc ('x'); 918 return '\\'; 919 920 case '0': case '1': case '2': case '3': case '4': 921 case '5': case '6': case '7': case '8': case '9': 922 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 923 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 924 break; 925 } 926 n = 0; 927 for (;;) 928 { 929 switch (c) 930 { 931 default: 932 phase3_ungetc (c); 933 return n; 934 935 case '0': case '1': case '2': case '3': case '4': 936 case '5': case '6': case '7': case '8': case '9': 937 n = n * 16 + c - '0'; 938 break; 939 940 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 941 n = n * 16 + 10 + c - 'A'; 942 break; 943 944 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 945 n = n * 16 + 10 + c - 'a'; 946 break; 947 } 948 c = phase3_getc (); 949 } 950 return n; 951 952 case '0': case '1': case '2': case '3': 953 case '4': case '5': case '6': case '7': 954 n = 0; 955 for (j = 0; j < 3; ++j) 956 { 957 n = n * 8 + c - '0'; 958 c = phase3_getc (); 959 switch (c) 960 { 961 default: 962 break; 963 964 case '0': case '1': case '2': case '3': 965 case '4': case '5': case '6': case '7': 966 continue; 967 } 968 break; 969 } 970 phase3_ungetc (c); 971 return n; 972 } 973} 974 975 976static void 977phase7_ungetc (int c) 978{ 979 phase3_ungetc (c); 980} 981 982 983/* Free the memory pointed to by a 'struct token_ty'. */ 984static inline void 985free_token (token_ty *tp) 986{ 987 if (tp->type == token_type_name || tp->type == token_type_string_literal) 988 free (tp->string); 989 if (tp->type == token_type_string_literal 990 || tp->type == token_type_objc_special) 991 drop_reference (tp->comment); 992} 993 994 995/* 5. Parse each resulting logical line as preprocessing tokens and 996 white space. Preprocessing tokens and C tokens don't always match. */ 997 998static token_ty phase5_pushback[1]; 999static int phase5_pushback_length; 1000 1001 1002static void 1003phase5_get (token_ty *tp) 1004{ 1005 static char *buffer; 1006 static int bufmax; 1007 int bufpos; 1008 int c; 1009 1010 if (phase5_pushback_length) 1011 { 1012 *tp = phase5_pushback[--phase5_pushback_length]; 1013 return; 1014 } 1015 tp->string = NULL; 1016 tp->number = 0; 1017 tp->line_number = line_number; 1018 c = phase4_getc (); 1019 switch (c) 1020 { 1021 case EOF: 1022 tp->type = token_type_eof; 1023 return; 1024 1025 case '\n': 1026 tp->type = token_type_eoln; 1027 return; 1028 1029 case ' ': 1030 case '\f': 1031 case '\t': 1032 for (;;) 1033 { 1034 c = phase4_getc (); 1035 switch (c) 1036 { 1037 case ' ': 1038 case '\f': 1039 case '\t': 1040 continue; 1041 1042 default: 1043 phase4_ungetc (c); 1044 break; 1045 } 1046 break; 1047 } 1048 tp->type = token_type_white_space; 1049 return; 1050 1051 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 1052 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 1053 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 1054 case 'V': case 'W': case 'X': case 'Y': case 'Z': 1055 case '_': 1056 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 1057 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 1058 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 1059 case 'v': case 'w': case 'x': case 'y': case 'z': 1060 bufpos = 0; 1061 for (;;) 1062 { 1063 if (bufpos >= bufmax) 1064 { 1065 bufmax = 2 * bufmax + 10; 1066 buffer = xrealloc (buffer, bufmax); 1067 } 1068 buffer[bufpos++] = c; 1069 c = phase4_getc (); 1070 switch (c) 1071 { 1072 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1073 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1074 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1075 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1076 case 'Y': case 'Z': 1077 case '_': 1078 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1079 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1080 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1081 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1082 case 'y': case 'z': 1083 case '0': case '1': case '2': case '3': case '4': 1084 case '5': case '6': case '7': case '8': case '9': 1085 continue; 1086 1087 default: 1088 phase4_ungetc (c); 1089 break; 1090 } 1091 break; 1092 } 1093 if (bufpos >= bufmax) 1094 { 1095 bufmax = 2 * bufmax + 10; 1096 buffer = xrealloc (buffer, bufmax); 1097 } 1098 buffer[bufpos] = 0; 1099 tp->string = xstrdup (buffer); 1100 tp->type = token_type_name; 1101 return; 1102 1103 case '.': 1104 c = phase4_getc (); 1105 phase4_ungetc (c); 1106 switch (c) 1107 { 1108 default: 1109 tp->type = token_type_symbol; 1110 return; 1111 1112 case '0': case '1': case '2': case '3': case '4': 1113 case '5': case '6': case '7': case '8': case '9': 1114 c = '.'; 1115 break; 1116 } 1117 /* FALLTHROUGH */ 1118 1119 case '0': case '1': case '2': case '3': case '4': 1120 case '5': case '6': case '7': case '8': case '9': 1121 /* The preprocessing number token is more "generous" than the C 1122 number tokens. This is mostly due to token pasting (another 1123 thing we can ignore here). */ 1124 bufpos = 0; 1125 for (;;) 1126 { 1127 if (bufpos >= bufmax) 1128 { 1129 bufmax = 2 * bufmax + 10; 1130 buffer = xrealloc (buffer, bufmax); 1131 } 1132 buffer[bufpos++] = c; 1133 c = phase4_getc (); 1134 switch (c) 1135 { 1136 case 'e': 1137 case 'E': 1138 if (bufpos >= bufmax) 1139 { 1140 bufmax = 2 * bufmax + 10; 1141 buffer = xrealloc (buffer, bufmax); 1142 } 1143 buffer[bufpos++] = c; 1144 c = phase4_getc (); 1145 if (c != '+' || c != '-') 1146 { 1147 phase4_ungetc (c); 1148 break; 1149 } 1150 continue; 1151 1152 case 'A': case 'B': case 'C': case 'D': case 'F': 1153 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1154 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1155 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1156 case 'Y': case 'Z': 1157 case 'a': case 'b': case 'c': case 'd': case 'f': 1158 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1159 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1160 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1161 case 'y': case 'z': 1162 case '0': case '1': case '2': case '3': case '4': 1163 case '5': case '6': case '7': case '8': case '9': 1164 case '.': 1165 continue; 1166 1167 default: 1168 phase4_ungetc (c); 1169 break; 1170 } 1171 break; 1172 } 1173 if (bufpos >= bufmax) 1174 { 1175 bufmax = 2 * bufmax + 10; 1176 buffer = xrealloc (buffer, bufmax); 1177 } 1178 buffer[bufpos] = 0; 1179 tp->type = token_type_number; 1180 tp->number = atol (buffer); 1181 return; 1182 1183 case '\'': 1184 /* We could worry about the 'L' before wide character constants, 1185 but ignoring it has no effect unless one of the keywords is 1186 "L". Just pretend it won't happen. Also, we don't need to 1187 remember the character constant. */ 1188 for (;;) 1189 { 1190 c = phase7_getc (); 1191 if (c == P7_NEWLINE) 1192 { 1193 error_with_progname = false; 1194 error (0, 0, _("%s:%d: warning: unterminated character constant"), 1195 logical_file_name, line_number - 1); 1196 error_with_progname = true; 1197 phase7_ungetc ('\n'); 1198 break; 1199 } 1200 if (c == EOF || c == P7_QUOTE) 1201 break; 1202 } 1203 tp->type = token_type_character_constant; 1204 return; 1205 1206 case '"': 1207 /* We could worry about the 'L' before wide string constants, 1208 but since gettext's argument is not a wide character string, 1209 let the compiler complain about the argument not matching the 1210 prototype. Just pretend it won't happen. */ 1211 bufpos = 0; 1212 for (;;) 1213 { 1214 c = phase7_getc (); 1215 if (c == P7_NEWLINE) 1216 { 1217 error_with_progname = false; 1218 error (0, 0, _("%s:%d: warning: unterminated string literal"), 1219 logical_file_name, line_number - 1); 1220 error_with_progname = true; 1221 phase7_ungetc ('\n'); 1222 break; 1223 } 1224 if (c == EOF || c == P7_QUOTES) 1225 break; 1226 if (c == P7_QUOTE) 1227 c = '\''; 1228 if (bufpos >= bufmax) 1229 { 1230 bufmax = 2 * bufmax + 10; 1231 buffer = xrealloc (buffer, bufmax); 1232 } 1233 buffer[bufpos++] = c; 1234 } 1235 if (bufpos >= bufmax) 1236 { 1237 bufmax = 2 * bufmax + 10; 1238 buffer = xrealloc (buffer, bufmax); 1239 } 1240 buffer[bufpos] = 0; 1241 tp->type = token_type_string_literal; 1242 tp->string = xstrdup (buffer); 1243 tp->comment = add_reference (savable_comment); 1244 return; 1245 1246 case '(': 1247 tp->type = token_type_lparen; 1248 return; 1249 1250 case ')': 1251 tp->type = token_type_rparen; 1252 return; 1253 1254 case ',': 1255 tp->type = token_type_comma; 1256 return; 1257 1258 case '#': 1259 tp->type = token_type_hash; 1260 return; 1261 1262 case ':': 1263 tp->type = token_type_colon; 1264 return; 1265 1266 case '@': 1267 if (objc_extensions) 1268 { 1269 tp->type = token_type_objc_special; 1270 tp->comment = add_reference (savable_comment); 1271 return; 1272 } 1273 /* FALLTHROUGH */ 1274 1275 default: 1276 /* We could carefully recognize each of the 2 and 3 character 1277 operators, but it is not necessary, as we only need to recognize 1278 gettext invocations. Don't bother. */ 1279 tp->type = token_type_symbol; 1280 return; 1281 } 1282} 1283 1284 1285/* Supports only one pushback token. */ 1286static void 1287phase5_unget (token_ty *tp) 1288{ 1289 if (tp->type != token_type_eof) 1290 { 1291 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1292 abort (); 1293 phase5_pushback[phase5_pushback_length++] = *tp; 1294 } 1295} 1296 1297 1298/* X. Recognize a leading # symbol. Leave leading hash as a hash, but 1299 turn hash in the middle of a line into a plain symbol token. This 1300 makes the phase 6 easier. */ 1301 1302static void 1303phaseX_get (token_ty *tp) 1304{ 1305 static bool middle; /* false at the beginning of a line, true otherwise. */ 1306 1307 phase5_get (tp); 1308 1309 if (tp->type == token_type_eoln || tp->type == token_type_eof) 1310 middle = false; 1311 else 1312 { 1313 if (middle) 1314 { 1315 /* Turn hash in the middle of a line into a plain symbol token. */ 1316 if (tp->type == token_type_hash) 1317 tp->type = token_type_symbol; 1318 } 1319 else 1320 { 1321 /* When we see leading whitespace followed by a hash sign, 1322 discard the leading white space token. The hash is all 1323 phase 6 is interested in. */ 1324 if (tp->type == token_type_white_space) 1325 { 1326 token_ty next; 1327 1328 phase5_get (&next); 1329 if (next.type == token_type_hash) 1330 *tp = next; 1331 else 1332 phase5_unget (&next); 1333 } 1334 middle = true; 1335 } 1336 } 1337} 1338 1339 1340/* 6. Recognize and carry out directives (it also expands macros on 1341 non-directive lines, which we do not do here). The only directive 1342 we care about are the #line and #define directive. We throw all the 1343 others away. */ 1344 1345static token_ty phase6_pushback[2]; 1346static int phase6_pushback_length; 1347 1348 1349static void 1350phase6_get (token_ty *tp) 1351{ 1352 static token_ty *buf; 1353 static int bufmax; 1354 int bufpos; 1355 int j; 1356 1357 if (phase6_pushback_length) 1358 { 1359 *tp = phase6_pushback[--phase6_pushback_length]; 1360 return; 1361 } 1362 for (;;) 1363 { 1364 /* Get the next token. If it is not a '#' at the beginning of a 1365 line (ignoring whitespace), return immediately. */ 1366 phaseX_get (tp); 1367 if (tp->type != token_type_hash) 1368 return; 1369 1370 /* Accumulate the rest of the directive in a buffer, until the 1371 "define" keyword is seen or until end of line. */ 1372 bufpos = 0; 1373 for (;;) 1374 { 1375 phaseX_get (tp); 1376 if (tp->type == token_type_eoln || tp->type == token_type_eof) 1377 break; 1378 1379 /* Before the "define" keyword and inside other directives 1380 white space is irrelevant. So just throw it away. */ 1381 if (tp->type != token_type_white_space) 1382 { 1383 /* If it is a #define directive, return immediately, 1384 thus treating the body of the #define directive like 1385 normal input. */ 1386 if (bufpos == 0 1387 && tp->type == token_type_name 1388 && strcmp (tp->string, "define") == 0) 1389 return; 1390 1391 /* Accumulate. */ 1392 if (bufpos >= bufmax) 1393 { 1394 bufmax = 2 * bufmax + 10; 1395 buf = xrealloc (buf, bufmax * sizeof (buf[0])); 1396 } 1397 buf[bufpos++] = *tp; 1398 } 1399 } 1400 1401 /* If it is a #line directive, with no macros to expand, act on 1402 it. Ignore all other directives. */ 1403 if (bufpos >= 3 && buf[0].type == token_type_name 1404 && strcmp (buf[0].string, "line") == 0 1405 && buf[1].type == token_type_number 1406 && buf[2].type == token_type_string_literal) 1407 { 1408 logical_file_name = xstrdup (buf[2].string); 1409 line_number = buf[1].number; 1410 } 1411 if (bufpos >= 2 && buf[0].type == token_type_number 1412 && buf[1].type == token_type_string_literal) 1413 { 1414 logical_file_name = xstrdup (buf[1].string); 1415 line_number = buf[0].number; 1416 } 1417 1418 /* Release the storage held by the directive. */ 1419 for (j = 0; j < bufpos; ++j) 1420 free_token (&buf[j]); 1421 1422 /* We must reset the selected comments. */ 1423 savable_comment_reset (); 1424 } 1425} 1426 1427 1428/* Supports 2 tokens of pushback. */ 1429static void 1430phase6_unget (token_ty *tp) 1431{ 1432 if (tp->type != token_type_eof) 1433 { 1434 if (phase6_pushback_length == SIZEOF (phase6_pushback)) 1435 abort (); 1436 phase6_pushback[phase6_pushback_length++] = *tp; 1437 } 1438} 1439 1440 1441/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string 1442 literal placeholders. */ 1443 1444/* Test for an ISO C 99 section 7.8.1 format string directive. */ 1445static bool 1446is_inttypes_macro (const char *name) 1447{ 1448 /* Syntax: 1449 P R I { d | i | o | u | x | X } 1450 { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR } */ 1451 if (name[0] == 'P' && name[1] == 'R' && name[2] == 'I') 1452 { 1453 name += 3; 1454 if (name[0] == 'd' || name[0] == 'i' || name[0] == 'o' || name[0] == 'u' 1455 || name[0] == 'x' || name[0] == 'X') 1456 { 1457 name += 1; 1458 if (name[0] == 'M' && name[1] == 'A' && name[2] == 'X' 1459 && name[3] == '\0') 1460 return true; 1461 if (name[0] == 'P' && name[1] == 'T' && name[2] == 'R' 1462 && name[3] == '\0') 1463 return true; 1464 if (name[0] == 'L' && name[1] == 'E' && name[2] == 'A' 1465 && name[3] == 'S' && name[4] == 'T') 1466 name += 5; 1467 else if (name[0] == 'F' && name[1] == 'A' && name[2] == 'S' 1468 && name[3] == 'T') 1469 name += 4; 1470 if (name[0] == '8' && name[1] == '\0') 1471 return true; 1472 if (name[0] == '1' && name[1] == '6' && name[2] == '\0') 1473 return true; 1474 if (name[0] == '3' && name[1] == '2' && name[2] == '\0') 1475 return true; 1476 if (name[0] == '6' && name[1] == '4' && name[2] == '\0') 1477 return true; 1478 } 1479 } 1480 return false; 1481} 1482 1483static void 1484phase8a_get (token_ty *tp) 1485{ 1486 phase6_get (tp); 1487 if (tp->type == token_type_name && is_inttypes_macro (tp->string)) 1488 { 1489 /* Turn PRIdXXX into "<PRIdXXX>". */ 1490 char *new_string = xasprintf ("<%s>", tp->string); 1491 free (tp->string); 1492 tp->string = new_string; 1493 tp->comment = add_reference (savable_comment); 1494 tp->type = token_type_string_literal; 1495 } 1496} 1497 1498/* Supports 2 tokens of pushback. */ 1499static inline void 1500phase8a_unget (token_ty *tp) 1501{ 1502 phase6_unget (tp); 1503} 1504 1505 1506/* 8b. Drop whitespace. */ 1507static void 1508phase8b_get (token_ty *tp) 1509{ 1510 for (;;) 1511 { 1512 phase8a_get (tp); 1513 1514 if (tp->type == token_type_white_space) 1515 continue; 1516 if (tp->type == token_type_eoln) 1517 { 1518 /* We have to track the last occurrence of a string. One 1519 mode of xgettext allows to group an extracted message 1520 with a comment for documentation. The rule which states 1521 which comment is assumed to be grouped with the message 1522 says it should immediately precede it. Our 1523 interpretation: between the last line of the comment and 1524 the line in which the keyword is found must be no line 1525 with non-white space tokens. */ 1526 ++newline_count; 1527 if (last_non_comment_line > last_comment_line) 1528 savable_comment_reset (); 1529 continue; 1530 } 1531 break; 1532 } 1533} 1534 1535/* Supports 2 tokens of pushback. */ 1536static inline void 1537phase8b_unget (token_ty *tp) 1538{ 1539 phase8a_unget (tp); 1540} 1541 1542 1543/* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to 1544 do this before performing concatenation of adjacent string literals. */ 1545static void 1546phase8c_get (token_ty *tp) 1547{ 1548 token_ty tmp; 1549 1550 phase8b_get (tp); 1551 if (tp->type != token_type_objc_special) 1552 return; 1553 phase8b_get (&tmp); 1554 if (tmp.type != token_type_string_literal) 1555 { 1556 phase8b_unget (&tmp); 1557 return; 1558 } 1559 /* Drop the '@' token and return immediately the following string. */ 1560 drop_reference (tmp.comment); 1561 tmp.comment = tp->comment; 1562 *tp = tmp; 1563} 1564 1565/* Supports only one pushback token. */ 1566static inline void 1567phase8c_unget (token_ty *tp) 1568{ 1569 phase8b_unget (tp); 1570} 1571 1572 1573/* 8. Concatenate adjacent string literals to form single string 1574 literals (because we don't expand macros, there are a few things we 1575 will miss). */ 1576 1577static void 1578phase8_get (token_ty *tp) 1579{ 1580 phase8c_get (tp); 1581 if (tp->type != token_type_string_literal) 1582 return; 1583 for (;;) 1584 { 1585 token_ty tmp; 1586 size_t len; 1587 1588 phase8c_get (&tmp); 1589 if (tmp.type != token_type_string_literal) 1590 { 1591 phase8c_unget (&tmp); 1592 return; 1593 } 1594 len = strlen (tp->string); 1595 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); 1596 strcpy (tp->string + len, tmp.string); 1597 free_token (&tmp); 1598 } 1599} 1600 1601 1602/* ===================== Reading of high-level tokens. ==================== */ 1603 1604 1605enum xgettext_token_type_ty 1606{ 1607 xgettext_token_type_eof, 1608 xgettext_token_type_keyword, 1609 xgettext_token_type_symbol, 1610 xgettext_token_type_lparen, 1611 xgettext_token_type_rparen, 1612 xgettext_token_type_comma, 1613 xgettext_token_type_colon, 1614 xgettext_token_type_string_literal, 1615 xgettext_token_type_other 1616}; 1617typedef enum xgettext_token_type_ty xgettext_token_type_ty; 1618 1619typedef struct xgettext_token_ty xgettext_token_ty; 1620struct xgettext_token_ty 1621{ 1622 xgettext_token_type_ty type; 1623 1624 /* This field is used only for xgettext_token_type_keyword. */ 1625 const struct callshapes *shapes; 1626 1627 /* This field is used only for xgettext_token_type_string_literal, 1628 xgettext_token_type_keyword, xgettext_token_type_symbol. */ 1629 char *string; 1630 1631 /* This field is used only for xgettext_token_type_string_literal. */ 1632 refcounted_string_list_ty *comment; 1633 1634 /* These fields are only for 1635 xgettext_token_type_keyword, 1636 xgettext_token_type_string_literal. */ 1637 lex_pos_ty pos; 1638}; 1639 1640 1641/* 9. Convert the remaining preprocessing tokens to C tokens and 1642 discards any white space from the translation unit. */ 1643 1644static void 1645x_c_lex (xgettext_token_ty *tp) 1646{ 1647 for (;;) 1648 { 1649 token_ty token; 1650 void *keyword_value; 1651 1652 phase8_get (&token); 1653 switch (token.type) 1654 { 1655 case token_type_eof: 1656 tp->type = xgettext_token_type_eof; 1657 return; 1658 1659 case token_type_name: 1660 last_non_comment_line = newline_count; 1661 1662 if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords, 1663 token.string, strlen (token.string), 1664 &keyword_value) 1665 == 0) 1666 { 1667 tp->type = xgettext_token_type_keyword; 1668 tp->shapes = (const struct callshapes *) keyword_value; 1669 tp->pos.file_name = logical_file_name; 1670 tp->pos.line_number = token.line_number; 1671 } 1672 else 1673 tp->type = xgettext_token_type_symbol; 1674 tp->string = token.string; 1675 return; 1676 1677 case token_type_lparen: 1678 last_non_comment_line = newline_count; 1679 1680 tp->type = xgettext_token_type_lparen; 1681 return; 1682 1683 case token_type_rparen: 1684 last_non_comment_line = newline_count; 1685 1686 tp->type = xgettext_token_type_rparen; 1687 return; 1688 1689 case token_type_comma: 1690 last_non_comment_line = newline_count; 1691 1692 tp->type = xgettext_token_type_comma; 1693 return; 1694 1695 case token_type_colon: 1696 last_non_comment_line = newline_count; 1697 1698 tp->type = xgettext_token_type_colon; 1699 return; 1700 1701 case token_type_string_literal: 1702 last_non_comment_line = newline_count; 1703 1704 tp->type = xgettext_token_type_string_literal; 1705 tp->string = token.string; 1706 tp->comment = token.comment; 1707 tp->pos.file_name = logical_file_name; 1708 tp->pos.line_number = token.line_number; 1709 return; 1710 1711 case token_type_objc_special: 1712 drop_reference (token.comment); 1713 /* FALLTHROUGH */ 1714 1715 default: 1716 last_non_comment_line = newline_count; 1717 1718 tp->type = xgettext_token_type_other; 1719 return; 1720 } 1721 } 1722} 1723 1724 1725/* ========================= Extracting strings. ========================== */ 1726 1727 1728/* Context lookup table. */ 1729static flag_context_list_table_ty *flag_context_list_table; 1730 1731 1732/* The file is broken into tokens. Scan the token stream, looking for 1733 a keyword, followed by a left paren, followed by a string. When we 1734 see this sequence, we have something to remember. We assume we are 1735 looking at a valid C or C++ program, and leave the complaints about 1736 the grammar to the compiler. 1737 1738 Normal handling: Look for 1739 keyword ( ... msgid ... ) 1740 Plural handling: Look for 1741 keyword ( ... msgid ... msgid_plural ... ) 1742 1743 We use recursion because the arguments before msgid or between msgid 1744 and msgid_plural can contain subexpressions of the same form. */ 1745 1746 1747/* Extract messages until the next balanced closing parenthesis. 1748 Extracted messages are added to MLP. 1749 Return true upon eof, false upon closing parenthesis. */ 1750static bool 1751extract_parenthesized (message_list_ty *mlp, 1752 flag_context_ty outer_context, 1753 flag_context_list_iterator_ty context_iter, 1754 struct arglist_parser *argparser) 1755{ 1756 /* Current argument number. */ 1757 int arg = 1; 1758 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1759 int state; 1760 /* Parameters of the keyword just seen. Defined only in state 1. */ 1761 const struct callshapes *next_shapes = NULL; 1762 /* Context iterator that will be used if the next token is a '('. */ 1763 flag_context_list_iterator_ty next_context_iter = 1764 passthrough_context_list_iterator; 1765 /* Context iterator that will be used if the next token is a ':'. 1766 (Objective C selector syntax.) */ 1767 flag_context_list_iterator_ty selectorcall_context_iter = 1768 passthrough_context_list_iterator; 1769 /* Current context. */ 1770 flag_context_ty inner_context = 1771 inherited_context (outer_context, 1772 flag_context_list_iterator_advance (&context_iter)); 1773 1774 /* Start state is 0. */ 1775 state = 0; 1776 1777 for (;;) 1778 { 1779 xgettext_token_ty token; 1780 1781 x_c_lex (&token); 1782 switch (token.type) 1783 { 1784 case xgettext_token_type_keyword: 1785 next_shapes = token.shapes; 1786 state = 1; 1787 goto keyword_or_symbol; 1788 1789 case xgettext_token_type_symbol: 1790 state = 0; 1791 keyword_or_symbol: 1792 next_context_iter = 1793 flag_context_list_iterator ( 1794 flag_context_list_table_lookup ( 1795 flag_context_list_table, 1796 token.string, strlen (token.string))); 1797 if (objc_extensions) 1798 { 1799 size_t token_string_len = strlen (token.string); 1800 token.string = xrealloc (token.string, token_string_len + 2); 1801 token.string[token_string_len] = ':'; 1802 token.string[token_string_len + 1] = '\0'; 1803 selectorcall_context_iter = 1804 flag_context_list_iterator ( 1805 flag_context_list_table_lookup ( 1806 flag_context_list_table, 1807 token.string, token_string_len + 1)); 1808 } 1809 free (token.string); 1810 continue; 1811 1812 case xgettext_token_type_lparen: 1813 if (extract_parenthesized (mlp, inner_context, next_context_iter, 1814 arglist_parser_alloc (mlp, 1815 state ? next_shapes : NULL))) 1816 { 1817 arglist_parser_done (argparser, arg); 1818 return true; 1819 } 1820 next_context_iter = null_context_list_iterator; 1821 selectorcall_context_iter = null_context_list_iterator; 1822 state = 0; 1823 continue; 1824 1825 case xgettext_token_type_rparen: 1826 arglist_parser_done (argparser, arg); 1827 return false; 1828 1829 case xgettext_token_type_comma: 1830 arg++; 1831 inner_context = 1832 inherited_context (outer_context, 1833 flag_context_list_iterator_advance ( 1834 &context_iter)); 1835 next_context_iter = passthrough_context_list_iterator; 1836 selectorcall_context_iter = passthrough_context_list_iterator; 1837 state = 0; 1838 continue; 1839 1840 case xgettext_token_type_colon: 1841 if (objc_extensions) 1842 { 1843 context_iter = selectorcall_context_iter; 1844 inner_context = 1845 inherited_context (inner_context, 1846 flag_context_list_iterator_advance ( 1847 &context_iter)); 1848 next_context_iter = passthrough_context_list_iterator; 1849 selectorcall_context_iter = passthrough_context_list_iterator; 1850 } 1851 else 1852 { 1853 next_context_iter = null_context_list_iterator; 1854 selectorcall_context_iter = null_context_list_iterator; 1855 } 1856 state = 0; 1857 continue; 1858 1859 case xgettext_token_type_string_literal: 1860 if (extract_all) 1861 remember_a_message (mlp, NULL, token.string, inner_context, 1862 &token.pos, token.comment); 1863 else 1864 arglist_parser_remember (argparser, arg, token.string, 1865 inner_context, 1866 token.pos.file_name, token.pos.line_number, 1867 token.comment); 1868 drop_reference (token.comment); 1869 next_context_iter = null_context_list_iterator; 1870 selectorcall_context_iter = null_context_list_iterator; 1871 state = 0; 1872 continue; 1873 1874 case xgettext_token_type_other: 1875 next_context_iter = null_context_list_iterator; 1876 selectorcall_context_iter = null_context_list_iterator; 1877 state = 0; 1878 continue; 1879 1880 case xgettext_token_type_eof: 1881 arglist_parser_done (argparser, arg); 1882 return true; 1883 1884 default: 1885 abort (); 1886 } 1887 } 1888} 1889 1890 1891static void 1892extract_whole_file (FILE *f, 1893 const char *real_filename, const char *logical_filename, 1894 flag_context_list_table_ty *flag_table, 1895 msgdomain_list_ty *mdlp) 1896{ 1897 message_list_ty *mlp = mdlp->item[0]->messages; 1898 1899 fp = f; 1900 real_file_name = real_filename; 1901 logical_file_name = xstrdup (logical_filename); 1902 line_number = 1; 1903 1904 newline_count = 0; 1905 last_comment_line = -1; 1906 last_non_comment_line = -1; 1907 1908 flag_context_list_table = flag_table; 1909 1910 init_keywords (); 1911 1912 /* Eat tokens until eof is seen. When extract_parenthesized returns 1913 due to an unbalanced closing parenthesis, just restart it. */ 1914 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator, 1915 arglist_parser_alloc (mlp, NULL))) 1916 ; 1917 1918 /* Close scanner. */ 1919 fp = NULL; 1920 real_file_name = NULL; 1921 logical_file_name = NULL; 1922 line_number = 0; 1923} 1924 1925 1926void 1927extract_c (FILE *f, 1928 const char *real_filename, const char *logical_filename, 1929 flag_context_list_table_ty *flag_table, 1930 msgdomain_list_ty *mdlp) 1931{ 1932 objc_extensions = false; 1933 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp); 1934} 1935 1936void 1937extract_objc (FILE *f, 1938 const char *real_filename, const char *logical_filename, 1939 flag_context_list_table_ty *flag_table, 1940 msgdomain_list_ty *mdlp) 1941{ 1942 objc_extensions = true; 1943 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp); 1944} 1945