1/* xgettext Python backend. 2 Copyright (C) 2002-2003, 2005-2007 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19#ifdef HAVE_CONFIG_H 20# include "config.h" 21#endif 22 23/* Specification. */ 24#include "x-python.h" 25 26#include <assert.h> 27#include <errno.h> 28#include <stdbool.h> 29#include <stdio.h> 30#include <stdlib.h> 31#include <string.h> 32 33#include "message.h" 34#include "xgettext.h" 35#include "x-python.h" 36#include "error.h" 37#include "error-progname.h" 38#include "progname.h" 39#include "basename.h" 40#include "xerror.h" 41#include "xvasprintf.h" 42#include "xalloc.h" 43#include "c-strstr.h" 44#include "c-ctype.h" 45#include "po-charset.h" 46#include "uniname.h" 47#include "unistr.h" 48#include "gettext.h" 49 50#define _(s) gettext(s) 51 52#define max(a,b) ((a) > (b) ? (a) : (b)) 53 54#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 55 56 57/* The Python syntax is defined in the Python Reference Manual 58 /usr/share/doc/packages/python/html/ref/index.html. 59 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, 60 Python-2.0/Objects/unicodeobject.c. */ 61 62 63/* ====================== Keyword set customization. ====================== */ 64 65/* If true extract all strings. */ 66static bool extract_all = false; 67 68static hash_table keywords; 69static bool default_keywords = true; 70 71 72void 73x_python_extract_all () 74{ 75 extract_all = true; 76} 77 78 79void 80x_python_keyword (const char *name) 81{ 82 if (name == NULL) 83 default_keywords = false; 84 else 85 { 86 const char *end; 87 struct callshape shape; 88 const char *colon; 89 90 if (keywords.table == NULL) 91 hash_init (&keywords, 100); 92 93 split_keywordspec (name, &end, &shape); 94 95 /* The characters between name and end should form a valid C identifier. 96 A colon means an invalid parse in split_keywordspec(). */ 97 colon = strchr (name, ':'); 98 if (colon == NULL || colon >= end) 99 insert_keyword_callshape (&keywords, name, end - name, &shape); 100 } 101} 102 103/* Finish initializing the keywords hash table. 104 Called after argument processing, before each file is processed. */ 105static void 106init_keywords () 107{ 108 if (default_keywords) 109 { 110 /* When adding new keywords here, also update the documentation in 111 xgettext.texi! */ 112 x_python_keyword ("gettext"); 113 x_python_keyword ("ugettext"); 114 x_python_keyword ("dgettext:2"); 115 x_python_keyword ("ngettext:1,2"); 116 x_python_keyword ("ungettext:1,2"); 117 x_python_keyword ("dngettext:2,3"); 118 x_python_keyword ("_"); 119 default_keywords = false; 120 } 121} 122 123void 124init_flag_table_python () 125{ 126 xgettext_record_flag ("gettext:1:pass-python-format"); 127 xgettext_record_flag ("ugettext:1:pass-python-format"); 128 xgettext_record_flag ("dgettext:2:pass-python-format"); 129 xgettext_record_flag ("ngettext:1:pass-python-format"); 130 xgettext_record_flag ("ngettext:2:pass-python-format"); 131 xgettext_record_flag ("ungettext:1:pass-python-format"); 132 xgettext_record_flag ("ungettext:2:pass-python-format"); 133 xgettext_record_flag ("dngettext:2:pass-python-format"); 134 xgettext_record_flag ("dngettext:3:pass-python-format"); 135 xgettext_record_flag ("_:1:pass-python-format"); 136 /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */ 137} 138 139 140/* ======================== Reading of characters. ======================== */ 141 142/* Real filename, used in error messages about the input file. */ 143static const char *real_file_name; 144 145/* Logical filename and line number, used to label the extracted messages. */ 146static char *logical_file_name; 147static int line_number; 148 149/* The input file stream. */ 150static FILE *fp; 151 152 153/* 1. line_number handling. */ 154 155/* Maximum used, roughly a safer MB_LEN_MAX. */ 156#define MAX_PHASE1_PUSHBACK 16 157static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; 158static int phase1_pushback_length; 159 160/* Read the next single byte from the input file. */ 161static int 162phase1_getc () 163{ 164 int c; 165 166 if (phase1_pushback_length) 167 c = phase1_pushback[--phase1_pushback_length]; 168 else 169 { 170 c = getc (fp); 171 172 if (c == EOF) 173 { 174 if (ferror (fp)) 175 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 176 real_file_name); 177 return EOF; 178 } 179 } 180 181 if (c == '\n') 182 ++line_number; 183 184 return c; 185} 186 187/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ 188static void 189phase1_ungetc (int c) 190{ 191 if (c != EOF) 192 { 193 if (c == '\n') 194 --line_number; 195 196 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 197 abort (); 198 phase1_pushback[phase1_pushback_length++] = c; 199 } 200} 201 202 203/* Phase 2: Conversion to Unicode. 204 This is done early because PEP 0263 specifies that conversion to Unicode 205 conceptually occurs before tokenization. A test case where it matters 206 is with encodings like BIG5: when a double-byte character ending in 0x5C 207 is followed by '\' or 'u0021', the tokenizer must not treat the second 208 half of the double-byte character as a backslash. */ 209 210/* End-of-file indicator for functions returning an UCS-4 character. */ 211#define UEOF -1 212 213static int phase2_pushback[max (9, UNINAME_MAX + 3)]; 214static int phase2_pushback_length; 215 216/* Read the next Unicode UCS-4 character from the input file. */ 217static int 218phase2_getc () 219{ 220 if (phase2_pushback_length) 221 return phase2_pushback[--phase2_pushback_length]; 222 223 if (xgettext_current_source_encoding == po_charset_ascii) 224 { 225 int c = phase1_getc (); 226 if (c == EOF) 227 return UEOF; 228 if (!c_isascii (c)) 229 { 230 char buffer[21]; 231 sprintf (buffer, ":%ld", (long) line_number); 232 multiline_error (xstrdup (""), 233 xasprintf (_("\ 234Non-ASCII string at %s%s.\n\ 235Please specify the source encoding through --from-code or through a comment\n\ 236as specified in http://www.python.org/peps/pep-0263.html.\n"), 237 real_file_name, buffer)); 238 exit (EXIT_FAILURE); 239 } 240 return c; 241 } 242 else if (xgettext_current_source_encoding != po_charset_utf8) 243 { 244#if HAVE_ICONV 245 /* Use iconv on an increasing number of bytes. Read only as many bytes 246 through phase1_getc as needed. This is needed to give reasonable 247 interactive behaviour when fp is connected to an interactive tty. */ 248 unsigned char buf[MAX_PHASE1_PUSHBACK]; 249 size_t bufcount; 250 int c = phase1_getc (); 251 if (c == EOF) 252 return UEOF; 253 buf[0] = (unsigned char) c; 254 bufcount = 1; 255 256 for (;;) 257 { 258 unsigned char scratchbuf[6]; 259 const char *inptr = (const char *) &buf[0]; 260 size_t insize = bufcount; 261 char *outptr = (char *) &scratchbuf[0]; 262 size_t outsize = sizeof (scratchbuf); 263 264 size_t res = iconv (xgettext_current_source_iconv, 265 (ICONV_CONST char **) &inptr, &insize, 266 &outptr, &outsize); 267 /* We expect that a character has been produced if and only if 268 some input bytes have been consumed. */ 269 if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) 270 abort (); 271 if (outsize == sizeof (scratchbuf)) 272 { 273 /* No character has been produced. Must be an error. */ 274 if (res != (size_t)(-1)) 275 abort (); 276 277 if (errno == EILSEQ) 278 { 279 /* An invalid multibyte sequence was encountered. */ 280 multiline_error (xstrdup (""), 281 xasprintf (_("\ 282%s:%d: Invalid multibyte sequence.\n\ 283Please specify the correct source encoding through --from-code or through a\n\ 284comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 285 real_file_name, line_number)); 286 exit (EXIT_FAILURE); 287 } 288 else if (errno == EINVAL) 289 { 290 /* An incomplete multibyte character. */ 291 int c; 292 293 if (bufcount == MAX_PHASE1_PUSHBACK) 294 { 295 /* An overlong incomplete multibyte sequence was 296 encountered. */ 297 multiline_error (xstrdup (""), 298 xasprintf (_("\ 299%s:%d: Long incomplete multibyte sequence.\n\ 300Please specify the correct source encoding through --from-code or through a\n\ 301comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 302 real_file_name, line_number)); 303 exit (EXIT_FAILURE); 304 } 305 306 /* Read one more byte and retry iconv. */ 307 c = phase1_getc (); 308 if (c == EOF) 309 { 310 multiline_error (xstrdup (""), 311 xasprintf (_("\ 312%s:%d: Incomplete multibyte sequence at end of file.\n\ 313Please specify the correct source encoding through --from-code or through a\n\ 314comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 315 real_file_name, line_number)); 316 exit (EXIT_FAILURE); 317 } 318 if (c == '\n') 319 { 320 multiline_error (xstrdup (""), 321 xasprintf (_("\ 322%s:%d: Incomplete multibyte sequence at end of line.\n\ 323Please specify the correct source encoding through --from-code or through a\n\ 324comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 325 real_file_name, line_number - 1)); 326 exit (EXIT_FAILURE); 327 } 328 buf[bufcount++] = (unsigned char) c; 329 } 330 else 331 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), 332 real_file_name, line_number); 333 } 334 else 335 { 336 size_t outbytes = sizeof (scratchbuf) - outsize; 337 size_t bytes = bufcount - insize; 338 unsigned int uc; 339 340 /* We expect that one character has been produced. */ 341 if (bytes == 0) 342 abort (); 343 if (outbytes == 0) 344 abort (); 345 /* Push back the unused bytes. */ 346 while (insize > 0) 347 phase1_ungetc (buf[--insize]); 348 /* Convert the character from UTF-8 to UCS-4. */ 349 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes) 350 { 351 /* scratchbuf contains an out-of-range Unicode character 352 (> 0x10ffff). */ 353 multiline_error (xstrdup (""), 354 xasprintf (_("\ 355%s:%d: Invalid multibyte sequence.\n\ 356Please specify the source encoding through --from-code or through a comment\n\ 357as specified in http://www.python.org/peps/pep-0263.html.\n"), 358 real_file_name, line_number)); 359 exit (EXIT_FAILURE); 360 } 361 return uc; 362 } 363 } 364#else 365 /* If we don't have iconv(), the only supported values for 366 xgettext_global_source_encoding and thus also for 367 xgettext_current_source_encoding are ASCII and UTF-8. */ 368 abort (); 369#endif 370 } 371 else 372 { 373 /* Read an UTF-8 encoded character. */ 374 unsigned char buf[6]; 375 unsigned int count; 376 int c; 377 unsigned int uc; 378 379 c = phase1_getc (); 380 if (c == EOF) 381 return UEOF; 382 buf[0] = c; 383 count = 1; 384 385 if (buf[0] >= 0xc0) 386 { 387 c = phase1_getc (); 388 if (c == EOF) 389 return UEOF; 390 buf[1] = c; 391 count = 2; 392 } 393 394 if (buf[0] >= 0xe0 395 && ((buf[1] ^ 0x80) < 0x40)) 396 { 397 c = phase1_getc (); 398 if (c == EOF) 399 return UEOF; 400 buf[2] = c; 401 count = 3; 402 } 403 404 if (buf[0] >= 0xf0 405 && ((buf[1] ^ 0x80) < 0x40) 406 && ((buf[2] ^ 0x80) < 0x40)) 407 { 408 c = phase1_getc (); 409 if (c == EOF) 410 return UEOF; 411 buf[3] = c; 412 count = 4; 413 } 414 415 if (buf[0] >= 0xf8 416 && ((buf[1] ^ 0x80) < 0x40) 417 && ((buf[2] ^ 0x80) < 0x40) 418 && ((buf[3] ^ 0x80) < 0x40)) 419 { 420 c = phase1_getc (); 421 if (c == EOF) 422 return UEOF; 423 buf[4] = c; 424 count = 5; 425 } 426 427 if (buf[0] >= 0xfc 428 && ((buf[1] ^ 0x80) < 0x40) 429 && ((buf[2] ^ 0x80) < 0x40) 430 && ((buf[3] ^ 0x80) < 0x40) 431 && ((buf[4] ^ 0x80) < 0x40)) 432 { 433 c = phase1_getc (); 434 if (c == EOF) 435 return UEOF; 436 buf[5] = c; 437 count = 6; 438 } 439 440 u8_mbtouc (&uc, buf, count); 441 return uc; 442 } 443} 444 445/* Supports max (9, UNINAME_MAX + 3) pushback characters. */ 446static void 447phase2_ungetc (int c) 448{ 449 if (c != UEOF) 450 { 451 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 452 abort (); 453 phase2_pushback[phase2_pushback_length++] = c; 454 } 455} 456 457 458/* ========================= Accumulating strings. ======================== */ 459 460/* A string buffer type that allows appending Unicode characters. 461 Returns the entire string in UTF-8 encoding. */ 462 463struct unicode_string_buffer 464{ 465 /* The part of the string that has already been converted to UTF-8. */ 466 char *utf8_buffer; 467 size_t utf8_buflen; 468 size_t utf8_allocated; 469}; 470 471/* Initialize a 'struct unicode_string_buffer' to empty. */ 472static inline void 473init_unicode_string_buffer (struct unicode_string_buffer *bp) 474{ 475 bp->utf8_buffer = NULL; 476 bp->utf8_buflen = 0; 477 bp->utf8_allocated = 0; 478} 479 480/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 481static inline void 482unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp, 483 size_t count) 484{ 485 if (bp->utf8_buflen + count > bp->utf8_allocated) 486 { 487 size_t new_allocated = 2 * bp->utf8_allocated + 10; 488 if (new_allocated < bp->utf8_buflen + count) 489 new_allocated = bp->utf8_buflen + count; 490 bp->utf8_allocated = new_allocated; 491 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 492 } 493} 494 495/* Auxiliary function: Append a Unicode character to bp->utf8. 496 uc must be < 0x110000. */ 497static inline void 498unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp, 499 unsigned int uc) 500{ 501 unsigned char utf8buf[6]; 502 int count = u8_uctomb (utf8buf, uc, 6); 503 504 if (count < 0) 505 /* The caller should have ensured that uc is not out-of-range. */ 506 abort (); 507 508 unicode_string_buffer_append_unicode_grow (bp, count); 509 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 510 bp->utf8_buflen += count; 511} 512 513/* Return the string buffer's contents. */ 514static char * 515unicode_string_buffer_result (struct unicode_string_buffer *bp) 516{ 517 /* NUL-terminate it. */ 518 unicode_string_buffer_append_unicode_grow (bp, 1); 519 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 520 /* Return it. */ 521 return bp->utf8_buffer; 522} 523 524/* Free the memory pointed to by a 'struct unicode_string_buffer'. */ 525static inline void 526free_unicode_string_buffer (struct unicode_string_buffer *bp) 527{ 528 free (bp->utf8_buffer); 529} 530 531 532/* ======================== Accumulating comments. ======================== */ 533 534 535/* Accumulating a single comment line. */ 536 537static struct unicode_string_buffer comment_buffer; 538 539static inline void 540comment_start () 541{ 542 comment_buffer.utf8_buflen = 0; 543} 544 545static inline bool 546comment_at_start () 547{ 548 return (comment_buffer.utf8_buflen == 0); 549} 550 551static inline void 552comment_add (int c) 553{ 554 unicode_string_buffer_append_unicode (&comment_buffer, c); 555} 556 557static inline const char * 558comment_line_end () 559{ 560 char *buffer = unicode_string_buffer_result (&comment_buffer); 561 size_t buflen = strlen (buffer); 562 563 while (buflen >= 1 564 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 565 --buflen; 566 buffer[buflen] = '\0'; 567 savable_comment_add (buffer); 568 return buffer; 569} 570 571 572/* These are for tracking whether comments count as immediately before 573 keyword. */ 574static int last_comment_line; 575static int last_non_comment_line; 576 577 578/* ======================== Recognizing comments. ======================== */ 579 580 581/* Recognizing the "coding" comment. 582 As specified in PEP 0263, it takes the form 583 "coding" [":"|"="] {alphanumeric or "-" or "_" or "*"}* 584 and is located in a comment in a line that 585 - is either the first or second line, 586 - is not a continuation line, 587 - contains no other tokens except this comment. */ 588 589/* Canonicalized encoding name for the current input file. */ 590static const char *xgettext_current_file_source_encoding; 591 592#if HAVE_ICONV 593/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from 594 ASCII or UTF-8, when this conversion is a no-op). */ 595static iconv_t xgettext_current_file_source_iconv; 596#endif 597 598static inline void 599set_current_file_source_encoding (const char *canon_encoding) 600{ 601 xgettext_current_file_source_encoding = canon_encoding; 602 603 if (xgettext_current_file_source_encoding != po_charset_ascii 604 && xgettext_current_file_source_encoding != po_charset_utf8) 605 { 606#if HAVE_ICONV 607 iconv_t cd; 608 609 /* Avoid glibc-2.1 bug with EUC-KR. */ 610# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 611 if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0) 612 cd = (iconv_t)(-1); 613 else 614# endif 615 cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding); 616 if (cd == (iconv_t)(-1)) 617 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ 618Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ 619and iconv() does not support this conversion."), 620 xgettext_current_file_source_encoding, po_charset_utf8, 621 basename (program_name)); 622 xgettext_current_file_source_iconv = cd; 623#else 624 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ 625Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \ 626This version was built without iconv()."), 627 xgettext_global_source_encoding, po_charset_utf8, 628 basename (program_name)); 629#endif 630 } 631 632 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 633#if HAVE_ICONV 634 xgettext_current_source_iconv = xgettext_current_file_source_iconv; 635#endif 636} 637 638static inline void 639try_to_extract_coding (const char *comment) 640{ 641 const char *p = c_strstr (comment, "coding"); 642 643 if (p != NULL) 644 { 645 p += 6; 646 if (*p == ':' || *p == '=') 647 { 648 p++; 649 while (*p == ' ' || *p == '\t') 650 p++; 651 { 652 const char *encoding_start = p; 653 654 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.') 655 p++; 656 { 657 const char *encoding_end = p; 658 659 if (encoding_end > encoding_start) 660 { 661 /* Extract the encoding string. */ 662 size_t encoding_len = encoding_end - encoding_start; 663 char *encoding = XNMALLOC (encoding_len + 1, char); 664 665 memcpy (encoding, encoding_start, encoding_len); 666 encoding[encoding_len] = '\0'; 667 668 { 669 /* Canonicalize it. */ 670 const char *canon_encoding = po_charset_canonicalize (encoding); 671 if (canon_encoding == NULL) 672 { 673 error_at_line (0, 0, 674 logical_file_name, line_number - 1, _("\ 675Unknown encoding \"%s\". Proceeding with ASCII instead."), 676 encoding); 677 canon_encoding = po_charset_ascii; 678 } 679 680 /* Activate it. */ 681 set_current_file_source_encoding (canon_encoding); 682 } 683 684 free (encoding); 685 } 686 } 687 } 688 } 689 } 690} 691 692/* Tracking whether the current line is a continuation line or contains a 693 non-blank character. */ 694static bool continuation_or_nonblank_line = false; 695 696 697/* Phase 3: Outside strings, replace backslash-newline with nothing and a 698 comment with nothing. */ 699 700static int 701phase3_getc () 702{ 703 int c; 704 705 for (;;) 706 { 707 c = phase2_getc (); 708 if (c == '\\') 709 { 710 c = phase2_getc (); 711 if (c != '\n') 712 { 713 phase2_ungetc (c); 714 /* This shouldn't happen usually, because "A backslash is 715 illegal elsewhere on a line outside a string literal." */ 716 return '\\'; 717 } 718 /* Eat backslash-newline. */ 719 continuation_or_nonblank_line = true; 720 } 721 else if (c == '#') 722 { 723 /* Eat a comment. */ 724 const char *comment; 725 726 last_comment_line = line_number; 727 comment_start (); 728 for (;;) 729 { 730 c = phase2_getc (); 731 if (c == UEOF || c == '\n') 732 break; 733 /* We skip all leading white space, but not EOLs. */ 734 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 735 comment_add (c); 736 } 737 comment = comment_line_end (); 738 if (line_number - 1 <= 2 && !continuation_or_nonblank_line) 739 try_to_extract_coding (comment); 740 continuation_or_nonblank_line = false; 741 return c; 742 } 743 else 744 { 745 if (c == '\n') 746 continuation_or_nonblank_line = false; 747 else if (!(c == ' ' || c == '\t' || c == '\f')) 748 continuation_or_nonblank_line = true; 749 return c; 750 } 751 } 752} 753 754/* Supports only one pushback character. */ 755static void 756phase3_ungetc (int c) 757{ 758 phase2_ungetc (c); 759} 760 761 762/* ========================= Accumulating strings. ======================== */ 763 764/* Return value of phase7_getuc when EOF is reached. */ 765#define P7_EOF (-1) 766#define P7_STRING_END (-2) 767 768/* Convert an UTF-16 or UTF-32 code point to a return value that can be 769 distinguished from a single-byte return value. */ 770#define UNICODE(code) (0x100 + (code)) 771 772/* Test a return value of phase7_getuc whether it designates an UTF-16 or 773 UTF-32 code point. */ 774#define IS_UNICODE(p7_result) ((p7_result) >= 0x100) 775 776/* Extract the UTF-16 or UTF-32 code of a return value that satisfies 777 IS_UNICODE. */ 778#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) 779 780/* A string buffer type that allows appending bytes (in the 781 xgettext_current_source_encoding) or Unicode characters. 782 Returns the entire string in UTF-8 encoding. */ 783 784struct mixed_string_buffer 785{ 786 /* The part of the string that has already been converted to UTF-8. */ 787 char *utf8_buffer; 788 size_t utf8_buflen; 789 size_t utf8_allocated; 790 /* The first half of an UTF-16 surrogate character. */ 791 unsigned short utf16_surr; 792 /* The part of the string that is still in the source encoding. */ 793 char *curr_buffer; 794 size_t curr_buflen; 795 size_t curr_allocated; 796}; 797 798/* Initialize a 'struct mixed_string_buffer' to empty. */ 799static inline void 800init_mixed_string_buffer (struct mixed_string_buffer *bp) 801{ 802 bp->utf8_buffer = NULL; 803 bp->utf8_buflen = 0; 804 bp->utf8_allocated = 0; 805 bp->utf16_surr = 0; 806 bp->curr_buffer = NULL; 807 bp->curr_buflen = 0; 808 bp->curr_allocated = 0; 809} 810 811/* Auxiliary function: Append a byte to bp->curr. */ 812static inline void 813mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c) 814{ 815 if (bp->curr_buflen == bp->curr_allocated) 816 { 817 bp->curr_allocated = 2 * bp->curr_allocated + 10; 818 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); 819 } 820 bp->curr_buffer[bp->curr_buflen++] = c; 821} 822 823/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 824static inline void 825mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count) 826{ 827 if (bp->utf8_buflen + count > bp->utf8_allocated) 828 { 829 size_t new_allocated = 2 * bp->utf8_allocated + 10; 830 if (new_allocated < bp->utf8_buflen + count) 831 new_allocated = bp->utf8_buflen + count; 832 bp->utf8_allocated = new_allocated; 833 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 834 } 835} 836 837/* Auxiliary function: Append a Unicode character to bp->utf8. 838 uc must be < 0x110000. */ 839static inline void 840mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc) 841{ 842 unsigned char utf8buf[6]; 843 int count = u8_uctomb (utf8buf, uc, 6); 844 845 if (count < 0) 846 /* The caller should have ensured that uc is not out-of-range. */ 847 abort (); 848 849 mixed_string_buffer_append_unicode_grow (bp, count); 850 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 851 bp->utf8_buflen += count; 852} 853 854/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ 855static inline void 856mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) 857{ 858 if (bp->utf16_surr != 0) 859 { 860 /* A half surrogate is invalid, therefore use U+FFFD instead. */ 861 mixed_string_buffer_append_unicode (bp, 0xfffd); 862 bp->utf16_surr = 0; 863 } 864} 865 866/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ 867static inline void 868mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno) 869{ 870 if (bp->curr_buflen > 0) 871 { 872 char *curr; 873 size_t count; 874 875 mixed_string_buffer_append_byte (bp, '\0'); 876 877 /* Convert from the source encoding to UTF-8. */ 878 curr = from_current_source_encoding (bp->curr_buffer, 879 logical_file_name, lineno); 880 881 /* Append it to bp->utf8_buffer. */ 882 count = strlen (curr); 883 mixed_string_buffer_append_unicode_grow (bp, count); 884 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); 885 bp->utf8_buflen += count; 886 887 if (curr != bp->curr_buffer) 888 free (curr); 889 bp->curr_buflen = 0; 890 } 891} 892 893/* Append a character or Unicode character to a 'struct mixed_string_buffer'. */ 894static void 895mixed_string_buffer_append (struct mixed_string_buffer *bp, int c) 896{ 897 if (IS_UNICODE (c)) 898 { 899 /* Append a Unicode character. */ 900 901 /* Switch from multibyte character mode to Unicode character mode. */ 902 mixed_string_buffer_flush_curr_buffer (bp, line_number); 903 904 /* Test whether this character and the previous one form a Unicode 905 surrogate character pair. */ 906 if (bp->utf16_surr != 0 907 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) 908 { 909 unsigned short utf16buf[2]; 910 unsigned int uc; 911 912 utf16buf[0] = bp->utf16_surr; 913 utf16buf[1] = UNICODE_VALUE (c); 914 if (u16_mbtouc (&uc, utf16buf, 2) != 2) 915 abort (); 916 917 mixed_string_buffer_append_unicode (bp, uc); 918 bp->utf16_surr = 0; 919 } 920 else 921 { 922 mixed_string_buffer_flush_utf16_surr (bp); 923 924 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) 925 bp->utf16_surr = UNICODE_VALUE (c); 926 else 927 mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c)); 928 } 929 } 930 else 931 { 932 /* Append a single byte. */ 933 934 /* Switch from Unicode character mode to multibyte character mode. */ 935 mixed_string_buffer_flush_utf16_surr (bp); 936 937 /* When a newline is seen, convert the accumulated multibyte sequence. 938 This ensures a correct line number in the error message in case of 939 a conversion error. The "- 1" is to account for the newline. */ 940 if (c == '\n') 941 mixed_string_buffer_flush_curr_buffer (bp, line_number - 1); 942 943 mixed_string_buffer_append_byte (bp, (unsigned char) c); 944 } 945} 946 947/* Return the string buffer's contents. */ 948static char * 949mixed_string_buffer_result (struct mixed_string_buffer *bp) 950{ 951 /* Flush all into bp->utf8_buffer. */ 952 mixed_string_buffer_flush_utf16_surr (bp); 953 mixed_string_buffer_flush_curr_buffer (bp, line_number); 954 /* NUL-terminate it. */ 955 mixed_string_buffer_append_unicode_grow (bp, 1); 956 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 957 /* Return it. */ 958 return bp->utf8_buffer; 959} 960 961/* Free the memory pointed to by a 'struct mixed_string_buffer'. */ 962static inline void 963free_mixed_string_buffer (struct mixed_string_buffer *bp) 964{ 965 free (bp->utf8_buffer); 966 free (bp->curr_buffer); 967} 968 969 970/* ========================== Reading of tokens. ========================== */ 971 972 973enum token_type_ty 974{ 975 token_type_eof, 976 token_type_lparen, /* ( */ 977 token_type_rparen, /* ) */ 978 token_type_comma, /* , */ 979 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */ 980 token_type_symbol, /* symbol, number */ 981 token_type_other /* misc. operator */ 982}; 983typedef enum token_type_ty token_type_ty; 984 985typedef struct token_ty token_ty; 986struct token_ty 987{ 988 token_type_ty type; 989 char *string; /* for token_type_string, token_type_symbol */ 990 refcounted_string_list_ty *comment; /* for token_type_string */ 991 int line_number; 992}; 993 994 995/* There are two different input syntaxes for strings, "abc" and r"abc", 996 and two different input syntaxes for Unicode strings, u"abc" and ur"abc". 997 Which escape sequences are understood, i.e. what is interpreted specially 998 after backslash? 999 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn 1000 r"abc" 1001 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} 1002 ur"abc" \unnnn 1003 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two 1004 \unnnn items. The \ooo and \xnn values are in the current source encoding 1005 for byte strings, and Unicode code points for Unicode strings. 1006 */ 1007 1008static int 1009phase7_getuc (int quote_char, 1010 bool triple, bool interpret_ansic, bool interpret_unicode, 1011 unsigned int *backslash_counter) 1012{ 1013 int c; 1014 1015 for (;;) 1016 { 1017 /* Use phase 2, because phase 3 elides comments. */ 1018 c = phase2_getc (); 1019 1020 if (c == UEOF) 1021 return P7_EOF; 1022 1023 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0)) 1024 { 1025 if (triple) 1026 { 1027 int c1 = phase2_getc (); 1028 if (c1 == quote_char) 1029 { 1030 int c2 = phase2_getc (); 1031 if (c2 == quote_char) 1032 return P7_STRING_END; 1033 phase2_ungetc (c2); 1034 } 1035 phase2_ungetc (c1); 1036 return UNICODE (c); 1037 } 1038 else 1039 return P7_STRING_END; 1040 } 1041 1042 if (c == '\n') 1043 { 1044 if (triple) 1045 { 1046 *backslash_counter = 0; 1047 return UNICODE ('\n'); 1048 } 1049 /* In r"..." and ur"..." strings, newline is only allowed 1050 immediately after an odd number of backslashes (although the 1051 backslashes are not interpreted!). */ 1052 if (!(interpret_ansic || (*backslash_counter & 1) == 0)) 1053 { 1054 *backslash_counter = 0; 1055 return UNICODE ('\n'); 1056 } 1057 phase2_ungetc (c); 1058 error_with_progname = false; 1059 error (0, 0, _("%s:%d: warning: unterminated string"), 1060 logical_file_name, line_number); 1061 error_with_progname = true; 1062 return P7_STRING_END; 1063 } 1064 1065 if (c != '\\') 1066 { 1067 *backslash_counter = 0; 1068 return UNICODE (c); 1069 } 1070 1071 /* Backslash handling. */ 1072 1073 if (!interpret_ansic && !interpret_unicode) 1074 { 1075 ++*backslash_counter; 1076 return UNICODE ('\\'); 1077 } 1078 1079 /* Dispatch according to the character following the backslash. */ 1080 c = phase2_getc (); 1081 if (c == UEOF) 1082 { 1083 ++*backslash_counter; 1084 return UNICODE ('\\'); 1085 } 1086 1087 if (interpret_ansic) 1088 switch (c) 1089 { 1090 case '\n': 1091 continue; 1092 case '\\': 1093 ++*backslash_counter; 1094 return UNICODE (c); 1095 case '\'': case '"': 1096 *backslash_counter = 0; 1097 return UNICODE (c); 1098 case 'a': 1099 *backslash_counter = 0; 1100 return UNICODE ('\a'); 1101 case 'b': 1102 *backslash_counter = 0; 1103 return UNICODE ('\b'); 1104 case 'f': 1105 *backslash_counter = 0; 1106 return UNICODE ('\f'); 1107 case 'n': 1108 *backslash_counter = 0; 1109 return UNICODE ('\n'); 1110 case 'r': 1111 *backslash_counter = 0; 1112 return UNICODE ('\r'); 1113 case 't': 1114 *backslash_counter = 0; 1115 return UNICODE ('\t'); 1116 case 'v': 1117 *backslash_counter = 0; 1118 return UNICODE ('\v'); 1119 case '0': case '1': case '2': case '3': case '4': 1120 case '5': case '6': case '7': 1121 { 1122 int n = c - '0'; 1123 1124 c = phase2_getc (); 1125 if (c != UEOF) 1126 { 1127 if (c >= '0' && c <= '7') 1128 { 1129 n = (n << 3) + (c - '0'); 1130 c = phase2_getc (); 1131 if (c != UEOF) 1132 { 1133 if (c >= '0' && c <= '7') 1134 n = (n << 3) + (c - '0'); 1135 else 1136 phase2_ungetc (c); 1137 } 1138 } 1139 else 1140 phase2_ungetc (c); 1141 } 1142 *backslash_counter = 0; 1143 if (interpret_unicode) 1144 return UNICODE (n); 1145 else 1146 return (unsigned char) n; 1147 } 1148 case 'x': 1149 { 1150 int c1 = phase2_getc (); 1151 int n1; 1152 1153 if (c1 >= '0' && c1 <= '9') 1154 n1 = c1 - '0'; 1155 else if (c1 >= 'A' && c1 <= 'F') 1156 n1 = c1 - 'A' + 10; 1157 else if (c1 >= 'a' && c1 <= 'f') 1158 n1 = c1 - 'a' + 10; 1159 else 1160 n1 = -1; 1161 1162 if (n1 >= 0) 1163 { 1164 int c2 = phase2_getc (); 1165 int n2; 1166 1167 if (c2 >= '0' && c2 <= '9') 1168 n2 = c2 - '0'; 1169 else if (c2 >= 'A' && c2 <= 'F') 1170 n2 = c2 - 'A' + 10; 1171 else if (c2 >= 'a' && c2 <= 'f') 1172 n2 = c2 - 'a' + 10; 1173 else 1174 n2 = -1; 1175 1176 if (n2 >= 0) 1177 { 1178 int n = (n1 << 4) + n2; 1179 *backslash_counter = 0; 1180 if (interpret_unicode) 1181 return UNICODE (n); 1182 else 1183 return (unsigned char) n; 1184 } 1185 1186 phase2_ungetc (c2); 1187 } 1188 phase2_ungetc (c1); 1189 phase2_ungetc (c); 1190 ++*backslash_counter; 1191 return UNICODE ('\\'); 1192 } 1193 } 1194 1195 if (interpret_unicode) 1196 { 1197 if (c == 'u') 1198 { 1199 unsigned char buf[4]; 1200 unsigned int n = 0; 1201 int i; 1202 1203 for (i = 0; i < 4; i++) 1204 { 1205 int c1 = phase2_getc (); 1206 1207 if (c1 >= '0' && c1 <= '9') 1208 n = (n << 4) + (c1 - '0'); 1209 else if (c1 >= 'A' && c1 <= 'F') 1210 n = (n << 4) + (c1 - 'A' + 10); 1211 else if (c1 >= 'a' && c1 <= 'f') 1212 n = (n << 4) + (c1 - 'a' + 10); 1213 else 1214 { 1215 phase2_ungetc (c1); 1216 while (--i >= 0) 1217 phase2_ungetc (buf[i]); 1218 phase2_ungetc (c); 1219 ++*backslash_counter; 1220 return UNICODE ('\\'); 1221 } 1222 1223 buf[i] = c1; 1224 } 1225 *backslash_counter = 0; 1226 return UNICODE (n); 1227 } 1228 1229 if (interpret_ansic) 1230 { 1231 if (c == 'U') 1232 { 1233 unsigned char buf[8]; 1234 unsigned int n = 0; 1235 int i; 1236 1237 for (i = 0; i < 8; i++) 1238 { 1239 int c1 = phase2_getc (); 1240 1241 if (c1 >= '0' && c1 <= '9') 1242 n = (n << 4) + (c1 - '0'); 1243 else if (c1 >= 'A' && c1 <= 'F') 1244 n = (n << 4) + (c1 - 'A' + 10); 1245 else if (c1 >= 'a' && c1 <= 'f') 1246 n = (n << 4) + (c1 - 'a' + 10); 1247 else 1248 { 1249 phase2_ungetc (c1); 1250 while (--i >= 0) 1251 phase2_ungetc (buf[i]); 1252 phase2_ungetc (c); 1253 ++*backslash_counter; 1254 return UNICODE ('\\'); 1255 } 1256 1257 buf[i] = c1; 1258 } 1259 if (n < 0x110000) 1260 { 1261 *backslash_counter = 0; 1262 return UNICODE (n); 1263 } 1264 1265 error_with_progname = false; 1266 error (0, 0, _("%s:%d: warning: invalid Unicode character"), 1267 logical_file_name, line_number); 1268 error_with_progname = true; 1269 1270 while (--i >= 0) 1271 phase2_ungetc (buf[i]); 1272 phase2_ungetc (c); 1273 ++*backslash_counter; 1274 return UNICODE ('\\'); 1275 } 1276 1277 if (c == 'N') 1278 { 1279 int c1 = phase2_getc (); 1280 if (c1 == '{') 1281 { 1282 unsigned char buf[UNINAME_MAX + 1]; 1283 int i; 1284 unsigned int n; 1285 1286 for (i = 0; i < UNINAME_MAX; i++) 1287 { 1288 int c2 = phase2_getc (); 1289 if (!(c2 >= ' ' && c2 <= '~')) 1290 { 1291 phase2_ungetc (c2); 1292 while (--i >= 0) 1293 phase2_ungetc (buf[i]); 1294 phase2_ungetc (c1); 1295 phase2_ungetc (c); 1296 ++*backslash_counter; 1297 return UNICODE ('\\'); 1298 } 1299 if (c2 == '}') 1300 break; 1301 buf[i] = c2; 1302 } 1303 buf[i] = '\0'; 1304 1305 n = unicode_name_character ((char *) buf); 1306 if (n != UNINAME_INVALID) 1307 { 1308 *backslash_counter = 0; 1309 return UNICODE (n); 1310 } 1311 1312 phase2_ungetc ('}'); 1313 while (--i >= 0) 1314 phase2_ungetc (buf[i]); 1315 } 1316 phase2_ungetc (c1); 1317 phase2_ungetc (c); 1318 ++*backslash_counter; 1319 return UNICODE ('\\'); 1320 } 1321 } 1322 } 1323 1324 phase2_ungetc (c); 1325 ++*backslash_counter; 1326 return UNICODE ('\\'); 1327 } 1328} 1329 1330 1331/* Combine characters into tokens. Discard whitespace except newlines at 1332 the end of logical lines. */ 1333 1334/* Number of pending open parentheses/braces/brackets. */ 1335static int open_pbb; 1336 1337static token_ty phase5_pushback[1]; 1338static int phase5_pushback_length; 1339 1340static void 1341phase5_get (token_ty *tp) 1342{ 1343 int c; 1344 1345 if (phase5_pushback_length) 1346 { 1347 *tp = phase5_pushback[--phase5_pushback_length]; 1348 return; 1349 } 1350 1351 for (;;) 1352 { 1353 tp->line_number = line_number; 1354 c = phase3_getc (); 1355 1356 switch (c) 1357 { 1358 case UEOF: 1359 tp->type = token_type_eof; 1360 return; 1361 1362 case ' ': 1363 case '\t': 1364 case '\f': 1365 /* Ignore whitespace and comments. */ 1366 continue; 1367 1368 case '\n': 1369 if (last_non_comment_line > last_comment_line) 1370 savable_comment_reset (); 1371 /* Ignore newline if and only if it is used for implicit line 1372 joining. */ 1373 if (open_pbb > 0) 1374 continue; 1375 tp->type = token_type_other; 1376 return; 1377 } 1378 1379 last_non_comment_line = tp->line_number; 1380 1381 switch (c) 1382 { 1383 case '.': 1384 { 1385 int c1 = phase3_getc (); 1386 phase3_ungetc (c1); 1387 if (!(c1 >= '0' && c1 <= '9')) 1388 { 1389 1390 tp->type = token_type_other; 1391 return; 1392 } 1393 } 1394 /* FALLTHROUGH */ 1395 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1396 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1397 case 'M': case 'N': case 'O': case 'P': case 'Q': 1398 case 'S': case 'T': case 'V': case 'W': case 'X': 1399 case 'Y': case 'Z': 1400 case '_': 1401 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1402 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1403 case 'm': case 'n': case 'o': case 'p': case 'q': 1404 case 's': case 't': case 'v': case 'w': case 'x': 1405 case 'y': case 'z': 1406 case '0': case '1': case '2': case '3': case '4': 1407 case '5': case '6': case '7': case '8': case '9': 1408 symbol: 1409 /* Symbol, or part of a number. */ 1410 { 1411 static char *buffer; 1412 static int bufmax; 1413 int bufpos; 1414 1415 bufpos = 0; 1416 for (;;) 1417 { 1418 if (bufpos >= bufmax) 1419 { 1420 bufmax = 2 * bufmax + 10; 1421 buffer = xrealloc (buffer, bufmax); 1422 } 1423 buffer[bufpos++] = c; 1424 c = phase3_getc (); 1425 switch (c) 1426 { 1427 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1428 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1429 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1430 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1431 case 'Y': case 'Z': 1432 case '_': 1433 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1434 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1435 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1436 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1437 case 'y': case 'z': 1438 case '0': case '1': case '2': case '3': case '4': 1439 case '5': case '6': case '7': case '8': case '9': 1440 continue; 1441 default: 1442 phase3_ungetc (c); 1443 break; 1444 } 1445 break; 1446 } 1447 if (bufpos >= bufmax) 1448 { 1449 bufmax = 2 * bufmax + 10; 1450 buffer = xrealloc (buffer, bufmax); 1451 } 1452 buffer[bufpos] = '\0'; 1453 tp->string = xstrdup (buffer); 1454 tp->type = token_type_symbol; 1455 return; 1456 } 1457 1458 /* Strings. */ 1459 { 1460 struct mixed_string_buffer literal; 1461 int quote_char; 1462 bool interpret_ansic; 1463 bool interpret_unicode; 1464 bool triple; 1465 unsigned int backslash_counter; 1466 1467 case 'R': case 'r': 1468 { 1469 int c1 = phase2_getc (); 1470 if (c1 == '"' || c1 == '\'') 1471 { 1472 quote_char = c1; 1473 interpret_ansic = false; 1474 interpret_unicode = false; 1475 goto string; 1476 } 1477 phase2_ungetc (c1); 1478 goto symbol; 1479 } 1480 1481 case 'U': case 'u': 1482 { 1483 int c1 = phase2_getc (); 1484 if (c1 == '"' || c1 == '\'') 1485 { 1486 quote_char = c1; 1487 interpret_ansic = true; 1488 interpret_unicode = true; 1489 goto string; 1490 } 1491 if (c1 == 'R' || c1 == 'r') 1492 { 1493 int c2 = phase2_getc (); 1494 if (c2 == '"' || c2 == '\'') 1495 { 1496 quote_char = c2; 1497 interpret_ansic = false; 1498 interpret_unicode = true; 1499 goto string; 1500 } 1501 phase2_ungetc (c2); 1502 } 1503 phase2_ungetc (c1); 1504 goto symbol; 1505 } 1506 1507 case '"': case '\'': 1508 quote_char = c; 1509 interpret_ansic = true; 1510 interpret_unicode = false; 1511 string: 1512 triple = false; 1513 { 1514 int c1 = phase2_getc (); 1515 if (c1 == quote_char) 1516 { 1517 int c2 = phase2_getc (); 1518 if (c2 == quote_char) 1519 triple = true; 1520 else 1521 { 1522 phase2_ungetc (c2); 1523 phase2_ungetc (c1); 1524 } 1525 } 1526 else 1527 phase2_ungetc (c1); 1528 } 1529 backslash_counter = 0; 1530 /* Start accumulating the string. */ 1531 init_mixed_string_buffer (&literal); 1532 for (;;) 1533 { 1534 int uc = phase7_getuc (quote_char, triple, interpret_ansic, 1535 interpret_unicode, &backslash_counter); 1536 1537 if (uc == P7_EOF || uc == P7_STRING_END) 1538 break; 1539 1540 if (IS_UNICODE (uc)) 1541 assert (UNICODE_VALUE (uc) >= 0 1542 && UNICODE_VALUE (uc) < 0x110000); 1543 1544 mixed_string_buffer_append (&literal, uc); 1545 } 1546 tp->string = xstrdup (mixed_string_buffer_result (&literal)); 1547 free_mixed_string_buffer (&literal); 1548 tp->comment = add_reference (savable_comment); 1549 tp->type = token_type_string; 1550 return; 1551 } 1552 1553 case '(': 1554 open_pbb++; 1555 tp->type = token_type_lparen; 1556 return; 1557 1558 case ')': 1559 if (open_pbb > 0) 1560 open_pbb--; 1561 tp->type = token_type_rparen; 1562 return; 1563 1564 case ',': 1565 tp->type = token_type_comma; 1566 return; 1567 1568 case '[': case '{': 1569 open_pbb++; 1570 tp->type = token_type_other; 1571 return; 1572 1573 case ']': case '}': 1574 if (open_pbb > 0) 1575 open_pbb--; 1576 tp->type = token_type_other; 1577 return; 1578 1579 default: 1580 /* We could carefully recognize each of the 2 and 3 character 1581 operators, but it is not necessary, as we only need to recognize 1582 gettext invocations. Don't bother. */ 1583 tp->type = token_type_other; 1584 return; 1585 } 1586 } 1587} 1588 1589/* Supports only one pushback token. */ 1590static void 1591phase5_unget (token_ty *tp) 1592{ 1593 if (tp->type != token_type_eof) 1594 { 1595 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1596 abort (); 1597 phase5_pushback[phase5_pushback_length++] = *tp; 1598 } 1599} 1600 1601 1602/* Combine adjacent strings to form a single string. Note that the end 1603 of a logical line appears as a token of its own, therefore strings that 1604 belong to different logical lines will not be concatenated. */ 1605 1606static void 1607x_python_lex (token_ty *tp) 1608{ 1609 phase5_get (tp); 1610 if (tp->type != token_type_string) 1611 return; 1612 for (;;) 1613 { 1614 token_ty tmp; 1615 size_t len; 1616 1617 phase5_get (&tmp); 1618 if (tmp.type != token_type_string) 1619 { 1620 phase5_unget (&tmp); 1621 return; 1622 } 1623 len = strlen (tp->string); 1624 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); 1625 strcpy (tp->string + len, tmp.string); 1626 free (tmp.string); 1627 } 1628} 1629 1630 1631/* ========================= Extracting strings. ========================== */ 1632 1633 1634/* Context lookup table. */ 1635static flag_context_list_table_ty *flag_context_list_table; 1636 1637 1638/* The file is broken into tokens. Scan the token stream, looking for 1639 a keyword, followed by a left paren, followed by a string. When we 1640 see this sequence, we have something to remember. We assume we are 1641 looking at a valid C or C++ program, and leave the complaints about 1642 the grammar to the compiler. 1643 1644 Normal handling: Look for 1645 keyword ( ... msgid ... ) 1646 Plural handling: Look for 1647 keyword ( ... msgid ... msgid_plural ... ) 1648 1649 We use recursion because the arguments before msgid or between msgid 1650 and msgid_plural can contain subexpressions of the same form. */ 1651 1652 1653/* Extract messages until the next balanced closing parenthesis. 1654 Extracted messages are added to MLP. 1655 Return true upon eof, false upon closing parenthesis. */ 1656static bool 1657extract_parenthesized (message_list_ty *mlp, 1658 flag_context_ty outer_context, 1659 flag_context_list_iterator_ty context_iter, 1660 struct arglist_parser *argparser) 1661{ 1662 /* Current argument number. */ 1663 int arg = 1; 1664 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1665 int state; 1666 /* Parameters of the keyword just seen. Defined only in state 1. */ 1667 const struct callshapes *next_shapes = NULL; 1668 /* Context iterator that will be used if the next token is a '('. */ 1669 flag_context_list_iterator_ty next_context_iter = 1670 passthrough_context_list_iterator; 1671 /* Current context. */ 1672 flag_context_ty inner_context = 1673 inherited_context (outer_context, 1674 flag_context_list_iterator_advance (&context_iter)); 1675 1676 /* Start state is 0. */ 1677 state = 0; 1678 1679 for (;;) 1680 { 1681 token_ty token; 1682 1683 x_python_lex (&token); 1684 switch (token.type) 1685 { 1686 case token_type_symbol: 1687 { 1688 void *keyword_value; 1689 1690 if (hash_find_entry (&keywords, token.string, strlen (token.string), 1691 &keyword_value) 1692 == 0) 1693 { 1694 next_shapes = (const struct callshapes *) keyword_value; 1695 state = 1; 1696 } 1697 else 1698 state = 0; 1699 } 1700 next_context_iter = 1701 flag_context_list_iterator ( 1702 flag_context_list_table_lookup ( 1703 flag_context_list_table, 1704 token.string, strlen (token.string))); 1705 free (token.string); 1706 continue; 1707 1708 case token_type_lparen: 1709 if (extract_parenthesized (mlp, inner_context, next_context_iter, 1710 arglist_parser_alloc (mlp, 1711 state ? next_shapes : NULL))) 1712 { 1713 xgettext_current_source_encoding = po_charset_utf8; 1714 arglist_parser_done (argparser, arg); 1715 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1716 return true; 1717 } 1718 next_context_iter = null_context_list_iterator; 1719 state = 0; 1720 continue; 1721 1722 case token_type_rparen: 1723 xgettext_current_source_encoding = po_charset_utf8; 1724 arglist_parser_done (argparser, arg); 1725 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1726 return false; 1727 1728 case token_type_comma: 1729 arg++; 1730 inner_context = 1731 inherited_context (outer_context, 1732 flag_context_list_iterator_advance ( 1733 &context_iter)); 1734 next_context_iter = passthrough_context_list_iterator; 1735 state = 0; 1736 continue; 1737 1738 case token_type_string: 1739 { 1740 lex_pos_ty pos; 1741 pos.file_name = logical_file_name; 1742 pos.line_number = token.line_number; 1743 1744 xgettext_current_source_encoding = po_charset_utf8; 1745 if (extract_all) 1746 remember_a_message (mlp, NULL, token.string, inner_context, 1747 &pos, token.comment); 1748 else 1749 arglist_parser_remember (argparser, arg, token.string, 1750 inner_context, 1751 pos.file_name, pos.line_number, 1752 token.comment); 1753 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1754 } 1755 drop_reference (token.comment); 1756 next_context_iter = null_context_list_iterator; 1757 state = 0; 1758 continue; 1759 1760 case token_type_eof: 1761 xgettext_current_source_encoding = po_charset_utf8; 1762 arglist_parser_done (argparser, arg); 1763 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1764 return true; 1765 1766 case token_type_other: 1767 next_context_iter = null_context_list_iterator; 1768 state = 0; 1769 continue; 1770 1771 default: 1772 abort (); 1773 } 1774 } 1775} 1776 1777 1778void 1779extract_python (FILE *f, 1780 const char *real_filename, const char *logical_filename, 1781 flag_context_list_table_ty *flag_table, 1782 msgdomain_list_ty *mdlp) 1783{ 1784 message_list_ty *mlp = mdlp->item[0]->messages; 1785 1786 fp = f; 1787 real_file_name = real_filename; 1788 logical_file_name = xstrdup (logical_filename); 1789 line_number = 1; 1790 1791 last_comment_line = -1; 1792 last_non_comment_line = -1; 1793 1794 xgettext_current_file_source_encoding = xgettext_global_source_encoding; 1795#if HAVE_ICONV 1796 xgettext_current_file_source_iconv = xgettext_global_source_iconv; 1797#endif 1798 1799 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1800#if HAVE_ICONV 1801 xgettext_current_source_iconv = xgettext_current_file_source_iconv; 1802#endif 1803 1804 continuation_or_nonblank_line = false; 1805 1806 open_pbb = 0; 1807 1808 flag_context_list_table = flag_table; 1809 1810 init_keywords (); 1811 1812 /* Eat tokens until eof is seen. When extract_parenthesized returns 1813 due to an unbalanced closing parenthesis, just restart it. */ 1814 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator, 1815 arglist_parser_alloc (mlp, NULL))) 1816 ; 1817 1818 fp = NULL; 1819 real_file_name = NULL; 1820 logical_file_name = NULL; 1821 line_number = 0; 1822} 1823