1/* xgettext Python backend. 2 Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20#ifdef HAVE_CONFIG_H 21# include "config.h" 22#endif 23 24#include <assert.h> 25#include <errno.h> 26#include <stdbool.h> 27#include <stdio.h> 28#include <stdlib.h> 29#include <string.h> 30 31#include "message.h" 32#include "xgettext.h" 33#include "x-python.h" 34#include "error.h" 35#include "error-progname.h" 36#include "progname.h" 37#include "basename.h" 38#include "xerror.h" 39#include "xvasprintf.h" 40#include "xalloc.h" 41#include "exit.h" 42#include "c-strstr.h" 43#include "c-ctype.h" 44#include "po-charset.h" 45#include "uniname.h" 46#include "utf16-ucs4.h" 47#include "utf8-ucs4.h" 48#include "ucs4-utf8.h" 49#include "gettext.h" 50 51#define _(s) gettext(s) 52 53#define max(a,b) ((a) > (b) ? (a) : (b)) 54 55#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 56 57 58/* The Python syntax is defined in the Python Reference Manual 59 /usr/share/doc/packages/python/html/ref/index.html. 60 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, 61 Python-2.0/Objects/unicodeobject.c. */ 62 63 64/* ====================== Keyword set customization. ====================== */ 65 66/* If true extract all strings. */ 67static bool extract_all = false; 68 69static hash_table keywords; 70static bool default_keywords = true; 71 72 73void 74x_python_extract_all () 75{ 76 extract_all = true; 77} 78 79 80void 81x_python_keyword (const char *name) 82{ 83 if (name == NULL) 84 default_keywords = false; 85 else 86 { 87 const char *end; 88 struct callshape shape; 89 const char *colon; 90 91 if (keywords.table == NULL) 92 hash_init (&keywords, 100); 93 94 split_keywordspec (name, &end, &shape); 95 96 /* The characters between name and end should form a valid C identifier. 97 A colon means an invalid parse in split_keywordspec(). */ 98 colon = strchr (name, ':'); 99 if (colon == NULL || colon >= end) 100 insert_keyword_callshape (&keywords, name, end - name, &shape); 101 } 102} 103 104/* Finish initializing the keywords hash table. 105 Called after argument processing, before each file is processed. */ 106static void 107init_keywords () 108{ 109 if (default_keywords) 110 { 111 /* When adding new keywords here, also update the documentation in 112 xgettext.texi! */ 113 x_python_keyword ("gettext"); 114 x_python_keyword ("ugettext"); 115 x_python_keyword ("dgettext:2"); 116 x_python_keyword ("ngettext:1,2"); 117 x_python_keyword ("ungettext:1,2"); 118 x_python_keyword ("dngettext:2,3"); 119 x_python_keyword ("_"); 120 default_keywords = false; 121 } 122} 123 124void 125init_flag_table_python () 126{ 127 xgettext_record_flag ("gettext:1:pass-python-format"); 128 xgettext_record_flag ("ugettext:1:pass-python-format"); 129 xgettext_record_flag ("dgettext:2:pass-python-format"); 130 xgettext_record_flag ("ngettext:1:pass-python-format"); 131 xgettext_record_flag ("ngettext:2:pass-python-format"); 132 xgettext_record_flag ("ungettext:1:pass-python-format"); 133 xgettext_record_flag ("ungettext:2:pass-python-format"); 134 xgettext_record_flag ("dngettext:2:pass-python-format"); 135 xgettext_record_flag ("dngettext:3:pass-python-format"); 136 xgettext_record_flag ("_:1:pass-python-format"); 137 /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */ 138} 139 140 141/* ======================== Reading of characters. ======================== */ 142 143/* Real filename, used in error messages about the input file. */ 144static const char *real_file_name; 145 146/* Logical filename and line number, used to label the extracted messages. */ 147static char *logical_file_name; 148static int line_number; 149 150/* The input file stream. */ 151static FILE *fp; 152 153 154/* 1. line_number handling. */ 155 156/* Maximum used, roughly a safer MB_LEN_MAX. */ 157#define MAX_PHASE1_PUSHBACK 16 158static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; 159static int phase1_pushback_length; 160 161/* Read the next single byte from the input file. */ 162static int 163phase1_getc () 164{ 165 int c; 166 167 if (phase1_pushback_length) 168 c = phase1_pushback[--phase1_pushback_length]; 169 else 170 { 171 c = getc (fp); 172 173 if (c == EOF) 174 { 175 if (ferror (fp)) 176 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 177 real_file_name); 178 return EOF; 179 } 180 } 181 182 if (c == '\n') 183 ++line_number; 184 185 return c; 186} 187 188/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ 189static void 190phase1_ungetc (int c) 191{ 192 if (c != EOF) 193 { 194 if (c == '\n') 195 --line_number; 196 197 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 198 abort (); 199 phase1_pushback[phase1_pushback_length++] = c; 200 } 201} 202 203 204/* Phase 2: Conversion to Unicode. 205 This is done early because PEP 0263 specifies that conversion to Unicode 206 conceptually occurs before tokenization. A test case where it matters 207 is with encodings like BIG5: when a double-byte character ending in 0x5C 208 is followed by '\' or 'u0021', the tokenizer must not treat the second 209 half of the double-byte character as a backslash. */ 210 211/* End-of-file indicator for functions returning an UCS-4 character. */ 212#define UEOF -1 213 214static int phase2_pushback[max (9, UNINAME_MAX + 3)]; 215static int phase2_pushback_length; 216 217/* Read the next Unicode UCS-4 character from the input file. */ 218static int 219phase2_getc () 220{ 221 if (phase2_pushback_length) 222 return phase2_pushback[--phase2_pushback_length]; 223 224 if (xgettext_current_source_encoding == po_charset_ascii) 225 { 226 int c = phase1_getc (); 227 if (c == EOF) 228 return UEOF; 229 if (!c_isascii (c)) 230 { 231 char buffer[21]; 232 sprintf (buffer, ":%ld", (long) line_number); 233 multiline_error (xstrdup (""), 234 xasprintf (_("\ 235Non-ASCII string at %s%s.\n\ 236Please specify the source encoding through --from-code or through a comment\n\ 237as specified in http://www.python.org/peps/pep-0263.html.\n"), 238 real_file_name, buffer)); 239 exit (EXIT_FAILURE); 240 } 241 return c; 242 } 243 else if (xgettext_current_source_encoding != po_charset_utf8) 244 { 245#if HAVE_ICONV 246 /* Use iconv on an increasing number of bytes. Read only as many bytes 247 through phase1_getc as needed. This is needed to give reasonable 248 interactive behaviour when fp is connected to an interactive tty. */ 249 unsigned char buf[MAX_PHASE1_PUSHBACK]; 250 size_t bufcount; 251 int c = phase1_getc (); 252 if (c == EOF) 253 return UEOF; 254 buf[0] = (unsigned char) c; 255 bufcount = 1; 256 257 for (;;) 258 { 259 unsigned char scratchbuf[6]; 260 const char *inptr = (const char *) &buf[0]; 261 size_t insize = bufcount; 262 char *outptr = (char *) &scratchbuf[0]; 263 size_t outsize = sizeof (scratchbuf); 264 265 size_t res = iconv (xgettext_current_source_iconv, 266 (ICONV_CONST char **) &inptr, &insize, 267 &outptr, &outsize); 268 /* We expect that a character has been produced if and only if 269 some input bytes have been consumed. */ 270 if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) 271 abort (); 272 if (outsize == sizeof (scratchbuf)) 273 { 274 /* No character has been produced. Must be an error. */ 275 if (res != (size_t)(-1)) 276 abort (); 277 278 if (errno == EILSEQ) 279 { 280 /* An invalid multibyte sequence was encountered. */ 281 multiline_error (xstrdup (""), 282 xasprintf (_("\ 283%s:%d: Invalid multibyte sequence.\n\ 284Please specify the correct source encoding through --from-code or through a\n\ 285comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 286 real_file_name, line_number)); 287 exit (EXIT_FAILURE); 288 } 289 else if (errno == EINVAL) 290 { 291 /* An incomplete multibyte character. */ 292 int c; 293 294 if (bufcount == MAX_PHASE1_PUSHBACK) 295 { 296 /* An overlong incomplete multibyte sequence was 297 encountered. */ 298 multiline_error (xstrdup (""), 299 xasprintf (_("\ 300%s:%d: Long incomplete multibyte sequence.\n\ 301Please specify the correct source encoding through --from-code or through a\n\ 302comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 303 real_file_name, line_number)); 304 exit (EXIT_FAILURE); 305 } 306 307 /* Read one more byte and retry iconv. */ 308 c = phase1_getc (); 309 if (c == EOF) 310 { 311 multiline_error (xstrdup (""), 312 xasprintf (_("\ 313%s:%d: Incomplete multibyte sequence at end of file.\n\ 314Please specify the correct source encoding through --from-code or through a\n\ 315comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 316 real_file_name, line_number)); 317 exit (EXIT_FAILURE); 318 } 319 if (c == '\n') 320 { 321 multiline_error (xstrdup (""), 322 xasprintf (_("\ 323%s:%d: Incomplete multibyte sequence at end of line.\n\ 324Please specify the correct source encoding through --from-code or through a\n\ 325comment as specified in http://www.python.org/peps/pep-0263.html.\n"), 326 real_file_name, line_number - 1)); 327 exit (EXIT_FAILURE); 328 } 329 buf[bufcount++] = (unsigned char) c; 330 } 331 else 332 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), 333 real_file_name, line_number); 334 } 335 else 336 { 337 size_t outbytes = sizeof (scratchbuf) - outsize; 338 size_t bytes = bufcount - insize; 339 unsigned int uc; 340 341 /* We expect that one character has been produced. */ 342 if (bytes == 0) 343 abort (); 344 if (outbytes == 0) 345 abort (); 346 /* Push back the unused bytes. */ 347 while (insize > 0) 348 phase1_ungetc (buf[--insize]); 349 /* Convert the character from UTF-8 to UCS-4. */ 350 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes) 351 { 352 /* scratchbuf contains an out-of-range Unicode character 353 (> 0x10ffff). */ 354 multiline_error (xstrdup (""), 355 xasprintf (_("\ 356%s:%d: Invalid multibyte sequence.\n\ 357Please specify the source encoding through --from-code or through a comment\n\ 358as specified in http://www.python.org/peps/pep-0263.html.\n"), 359 real_file_name, line_number)); 360 exit (EXIT_FAILURE); 361 } 362 return uc; 363 } 364 } 365#else 366 /* If we don't have iconv(), the only supported values for 367 xgettext_global_source_encoding and thus also for 368 xgettext_current_source_encoding are ASCII and UTF-8. */ 369 abort (); 370#endif 371 } 372 else 373 { 374 /* Read an UTF-8 encoded character. */ 375 unsigned char buf[6]; 376 unsigned int count; 377 int c; 378 unsigned int uc; 379 380 c = phase1_getc (); 381 if (c == EOF) 382 return UEOF; 383 buf[0] = c; 384 count = 1; 385 386 if (buf[0] >= 0xc0) 387 { 388 c = phase1_getc (); 389 if (c == EOF) 390 return UEOF; 391 buf[1] = c; 392 count = 2; 393 } 394 395 if (buf[0] >= 0xe0 396 && ((buf[1] ^ 0x80) < 0x40)) 397 { 398 c = phase1_getc (); 399 if (c == EOF) 400 return UEOF; 401 buf[2] = c; 402 count = 3; 403 } 404 405 if (buf[0] >= 0xf0 406 && ((buf[1] ^ 0x80) < 0x40) 407 && ((buf[2] ^ 0x80) < 0x40)) 408 { 409 c = phase1_getc (); 410 if (c == EOF) 411 return UEOF; 412 buf[3] = c; 413 count = 4; 414 } 415 416 if (buf[0] >= 0xf8 417 && ((buf[1] ^ 0x80) < 0x40) 418 && ((buf[2] ^ 0x80) < 0x40) 419 && ((buf[3] ^ 0x80) < 0x40)) 420 { 421 c = phase1_getc (); 422 if (c == EOF) 423 return UEOF; 424 buf[4] = c; 425 count = 5; 426 } 427 428 if (buf[0] >= 0xfc 429 && ((buf[1] ^ 0x80) < 0x40) 430 && ((buf[2] ^ 0x80) < 0x40) 431 && ((buf[3] ^ 0x80) < 0x40) 432 && ((buf[4] ^ 0x80) < 0x40)) 433 { 434 c = phase1_getc (); 435 if (c == EOF) 436 return UEOF; 437 buf[5] = c; 438 count = 6; 439 } 440 441 u8_mbtouc (&uc, buf, count); 442 return uc; 443 } 444} 445 446/* Supports max (9, UNINAME_MAX + 3) pushback characters. */ 447static void 448phase2_ungetc (int c) 449{ 450 if (c != UEOF) 451 { 452 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 453 abort (); 454 phase2_pushback[phase2_pushback_length++] = c; 455 } 456} 457 458 459/* ========================= Accumulating strings. ======================== */ 460 461/* A string buffer type that allows appending Unicode characters. 462 Returns the entire string in UTF-8 encoding. */ 463 464struct unicode_string_buffer 465{ 466 /* The part of the string that has already been converted to UTF-8. */ 467 char *utf8_buffer; 468 size_t utf8_buflen; 469 size_t utf8_allocated; 470}; 471 472/* Initialize a 'struct unicode_string_buffer' to empty. */ 473static inline void 474init_unicode_string_buffer (struct unicode_string_buffer *bp) 475{ 476 bp->utf8_buffer = NULL; 477 bp->utf8_buflen = 0; 478 bp->utf8_allocated = 0; 479} 480 481/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 482static inline void 483unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp, 484 size_t count) 485{ 486 if (bp->utf8_buflen + count > bp->utf8_allocated) 487 { 488 size_t new_allocated = 2 * bp->utf8_allocated + 10; 489 if (new_allocated < bp->utf8_buflen + count) 490 new_allocated = bp->utf8_buflen + count; 491 bp->utf8_allocated = new_allocated; 492 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 493 } 494} 495 496/* Auxiliary function: Append a Unicode character to bp->utf8. 497 uc must be < 0x110000. */ 498static inline void 499unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp, 500 unsigned int uc) 501{ 502 unsigned char utf8buf[6]; 503 int count = u8_uctomb (utf8buf, uc, 6); 504 505 if (count < 0) 506 /* The caller should have ensured that uc is not out-of-range. */ 507 abort (); 508 509 unicode_string_buffer_append_unicode_grow (bp, count); 510 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 511 bp->utf8_buflen += count; 512} 513 514/* Return the string buffer's contents. */ 515static char * 516unicode_string_buffer_result (struct unicode_string_buffer *bp) 517{ 518 /* NUL-terminate it. */ 519 unicode_string_buffer_append_unicode_grow (bp, 1); 520 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 521 /* Return it. */ 522 return bp->utf8_buffer; 523} 524 525/* Free the memory pointed to by a 'struct unicode_string_buffer'. */ 526static inline void 527free_unicode_string_buffer (struct unicode_string_buffer *bp) 528{ 529 free (bp->utf8_buffer); 530} 531 532 533/* ======================== Accumulating comments. ======================== */ 534 535 536/* Accumulating a single comment line. */ 537 538static struct unicode_string_buffer comment_buffer; 539 540static inline void 541comment_start () 542{ 543 comment_buffer.utf8_buflen = 0; 544} 545 546static inline bool 547comment_at_start () 548{ 549 return (comment_buffer.utf8_buflen == 0); 550} 551 552static inline void 553comment_add (int c) 554{ 555 unicode_string_buffer_append_unicode (&comment_buffer, c); 556} 557 558static inline const char * 559comment_line_end () 560{ 561 char *buffer = unicode_string_buffer_result (&comment_buffer); 562 size_t buflen = strlen (buffer); 563 564 while (buflen >= 1 565 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 566 --buflen; 567 buffer[buflen] = '\0'; 568 savable_comment_add (buffer); 569 return buffer; 570} 571 572 573/* These are for tracking whether comments count as immediately before 574 keyword. */ 575static int last_comment_line; 576static int last_non_comment_line; 577 578 579/* ======================== Recognizing comments. ======================== */ 580 581 582/* Recognizing the "coding" comment. 583 As specified in PEP 0263, it takes the form 584 "coding" [":"|"="] {alphanumeric or "-" or "_" or "*"}* 585 and is located in a comment in a line that 586 - is either the first or second line, 587 - is not a continuation line, 588 - contains no other tokens except this comment. */ 589 590/* Canonicalized encoding name for the current input file. */ 591static const char *xgettext_current_file_source_encoding; 592 593#if HAVE_ICONV 594/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from 595 ASCII or UTF-8, when this conversion is a no-op). */ 596static iconv_t xgettext_current_file_source_iconv; 597#endif 598 599static inline void 600set_current_file_source_encoding (const char *canon_encoding) 601{ 602 xgettext_current_file_source_encoding = canon_encoding; 603 604 if (xgettext_current_file_source_encoding != po_charset_ascii 605 && xgettext_current_file_source_encoding != po_charset_utf8) 606 { 607#if HAVE_ICONV 608 iconv_t cd; 609 610 /* Avoid glibc-2.1 bug with EUC-KR. */ 611# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 612 if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0) 613 cd = (iconv_t)(-1); 614 else 615# endif 616 cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding); 617 if (cd == (iconv_t)(-1)) 618 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ 619Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ 620and iconv() does not support this conversion."), 621 xgettext_current_file_source_encoding, po_charset_utf8, 622 basename (program_name)); 623 xgettext_current_file_source_iconv = cd; 624#else 625 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ 626Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \ 627This version was built without iconv()."), 628 xgettext_global_source_encoding, po_charset_utf8, 629 basename (program_name)); 630#endif 631 } 632 633 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 634#if HAVE_ICONV 635 xgettext_current_source_iconv = xgettext_current_file_source_iconv; 636#endif 637} 638 639static inline void 640try_to_extract_coding (const char *comment) 641{ 642 const char *p = c_strstr (comment, "coding"); 643 644 if (p != NULL) 645 { 646 p += 6; 647 if (*p == ':' || *p == '=') 648 { 649 p++; 650 while (*p == ' ' || *p == '\t') 651 p++; 652 { 653 const char *encoding_start = p; 654 655 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.') 656 p++; 657 { 658 const char *encoding_end = p; 659 660 if (encoding_end > encoding_start) 661 { 662 /* Extract the encoding string. */ 663 size_t encoding_len = encoding_end - encoding_start; 664 char *encoding = (char *) xmalloc (encoding_len + 1); 665 666 memcpy (encoding, encoding_start, encoding_len); 667 encoding[encoding_len] = '\0'; 668 669 { 670 /* Canonicalize it. */ 671 const char *canon_encoding = po_charset_canonicalize (encoding); 672 if (canon_encoding == NULL) 673 { 674 error_at_line (0, 0, 675 logical_file_name, line_number - 1, _("\ 676Unknown encoding \"%s\". Proceeding with ASCII instead."), 677 encoding); 678 canon_encoding = po_charset_ascii; 679 } 680 681 /* Activate it. */ 682 set_current_file_source_encoding (canon_encoding); 683 } 684 685 free (encoding); 686 } 687 } 688 } 689 } 690 } 691} 692 693/* Tracking whether the current line is a continuation line or contains a 694 non-blank character. */ 695static bool continuation_or_nonblank_line = false; 696 697 698/* Phase 3: Outside strings, replace backslash-newline with nothing and a 699 comment with nothing. */ 700 701static int 702phase3_getc () 703{ 704 int c; 705 706 for (;;) 707 { 708 c = phase2_getc (); 709 if (c == '\\') 710 { 711 c = phase2_getc (); 712 if (c != '\n') 713 { 714 phase2_ungetc (c); 715 /* This shouldn't happen usually, because "A backslash is 716 illegal elsewhere on a line outside a string literal." */ 717 return '\\'; 718 } 719 /* Eat backslash-newline. */ 720 continuation_or_nonblank_line = true; 721 } 722 else if (c == '#') 723 { 724 /* Eat a comment. */ 725 const char *comment; 726 727 last_comment_line = line_number; 728 comment_start (); 729 for (;;) 730 { 731 c = phase2_getc (); 732 if (c == UEOF || c == '\n') 733 break; 734 /* We skip all leading white space, but not EOLs. */ 735 if (!(comment_at_start () && (c == ' ' || c == '\t'))) 736 comment_add (c); 737 } 738 comment = comment_line_end (); 739 if (line_number - 1 <= 2 && !continuation_or_nonblank_line) 740 try_to_extract_coding (comment); 741 continuation_or_nonblank_line = false; 742 return c; 743 } 744 else 745 { 746 if (c == '\n') 747 continuation_or_nonblank_line = false; 748 else if (!(c == ' ' || c == '\t' || c == '\f')) 749 continuation_or_nonblank_line = true; 750 return c; 751 } 752 } 753} 754 755/* Supports only one pushback character. */ 756static void 757phase3_ungetc (int c) 758{ 759 phase2_ungetc (c); 760} 761 762 763/* ========================= Accumulating strings. ======================== */ 764 765/* Return value of phase7_getuc when EOF is reached. */ 766#define P7_EOF (-1) 767#define P7_STRING_END (-2) 768 769/* Convert an UTF-16 or UTF-32 code point to a return value that can be 770 distinguished from a single-byte return value. */ 771#define UNICODE(code) (0x100 + (code)) 772 773/* Test a return value of phase7_getuc whether it designates an UTF-16 or 774 UTF-32 code point. */ 775#define IS_UNICODE(p7_result) ((p7_result) >= 0x100) 776 777/* Extract the UTF-16 or UTF-32 code of a return value that satisfies 778 IS_UNICODE. */ 779#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) 780 781/* A string buffer type that allows appending bytes (in the 782 xgettext_current_source_encoding) or Unicode characters. 783 Returns the entire string in UTF-8 encoding. */ 784 785struct mixed_string_buffer 786{ 787 /* The part of the string that has already been converted to UTF-8. */ 788 char *utf8_buffer; 789 size_t utf8_buflen; 790 size_t utf8_allocated; 791 /* The first half of an UTF-16 surrogate character. */ 792 unsigned short utf16_surr; 793 /* The part of the string that is still in the source encoding. */ 794 char *curr_buffer; 795 size_t curr_buflen; 796 size_t curr_allocated; 797}; 798 799/* Initialize a 'struct mixed_string_buffer' to empty. */ 800static inline void 801init_mixed_string_buffer (struct mixed_string_buffer *bp) 802{ 803 bp->utf8_buffer = NULL; 804 bp->utf8_buflen = 0; 805 bp->utf8_allocated = 0; 806 bp->utf16_surr = 0; 807 bp->curr_buffer = NULL; 808 bp->curr_buflen = 0; 809 bp->curr_allocated = 0; 810} 811 812/* Auxiliary function: Append a byte to bp->curr. */ 813static inline void 814mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c) 815{ 816 if (bp->curr_buflen == bp->curr_allocated) 817 { 818 bp->curr_allocated = 2 * bp->curr_allocated + 10; 819 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated); 820 } 821 bp->curr_buffer[bp->curr_buflen++] = c; 822} 823 824/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ 825static inline void 826mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count) 827{ 828 if (bp->utf8_buflen + count > bp->utf8_allocated) 829 { 830 size_t new_allocated = 2 * bp->utf8_allocated + 10; 831 if (new_allocated < bp->utf8_buflen + count) 832 new_allocated = bp->utf8_buflen + count; 833 bp->utf8_allocated = new_allocated; 834 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); 835 } 836} 837 838/* Auxiliary function: Append a Unicode character to bp->utf8. 839 uc must be < 0x110000. */ 840static inline void 841mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc) 842{ 843 unsigned char utf8buf[6]; 844 int count = u8_uctomb (utf8buf, uc, 6); 845 846 if (count < 0) 847 /* The caller should have ensured that uc is not out-of-range. */ 848 abort (); 849 850 mixed_string_buffer_append_unicode_grow (bp, count); 851 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); 852 bp->utf8_buflen += count; 853} 854 855/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */ 856static inline void 857mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp) 858{ 859 if (bp->utf16_surr != 0) 860 { 861 /* A half surrogate is invalid, therefore use U+FFFD instead. */ 862 mixed_string_buffer_append_unicode (bp, 0xfffd); 863 bp->utf16_surr = 0; 864 } 865} 866 867/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */ 868static inline void 869mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno) 870{ 871 if (bp->curr_buflen > 0) 872 { 873 char *curr; 874 size_t count; 875 876 mixed_string_buffer_append_byte (bp, '\0'); 877 878 /* Convert from the source encoding to UTF-8. */ 879 curr = from_current_source_encoding (bp->curr_buffer, 880 logical_file_name, lineno); 881 882 /* Append it to bp->utf8_buffer. */ 883 count = strlen (curr); 884 mixed_string_buffer_append_unicode_grow (bp, count); 885 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count); 886 bp->utf8_buflen += count; 887 888 if (curr != bp->curr_buffer) 889 free (curr); 890 bp->curr_buflen = 0; 891 } 892} 893 894/* Append a character or Unicode character to a 'struct mixed_string_buffer'. */ 895static void 896mixed_string_buffer_append (struct mixed_string_buffer *bp, int c) 897{ 898 if (IS_UNICODE (c)) 899 { 900 /* Append a Unicode character. */ 901 902 /* Switch from multibyte character mode to Unicode character mode. */ 903 mixed_string_buffer_flush_curr_buffer (bp, line_number); 904 905 /* Test whether this character and the previous one form a Unicode 906 surrogate character pair. */ 907 if (bp->utf16_surr != 0 908 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))) 909 { 910 unsigned short utf16buf[2]; 911 unsigned int uc; 912 913 utf16buf[0] = bp->utf16_surr; 914 utf16buf[1] = UNICODE_VALUE (c); 915 if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2) 916 abort (); 917 918 mixed_string_buffer_append_unicode (bp, uc); 919 bp->utf16_surr = 0; 920 } 921 else 922 { 923 mixed_string_buffer_flush_utf16_surr (bp); 924 925 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00)) 926 bp->utf16_surr = UNICODE_VALUE (c); 927 else 928 mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c)); 929 } 930 } 931 else 932 { 933 /* Append a single byte. */ 934 935 /* Switch from Unicode character mode to multibyte character mode. */ 936 mixed_string_buffer_flush_utf16_surr (bp); 937 938 /* When a newline is seen, convert the accumulated multibyte sequence. 939 This ensures a correct line number in the error message in case of 940 a conversion error. The "- 1" is to account for the newline. */ 941 if (c == '\n') 942 mixed_string_buffer_flush_curr_buffer (bp, line_number - 1); 943 944 mixed_string_buffer_append_byte (bp, (unsigned char) c); 945 } 946} 947 948/* Return the string buffer's contents. */ 949static char * 950mixed_string_buffer_result (struct mixed_string_buffer *bp) 951{ 952 /* Flush all into bp->utf8_buffer. */ 953 mixed_string_buffer_flush_utf16_surr (bp); 954 mixed_string_buffer_flush_curr_buffer (bp, line_number); 955 /* NUL-terminate it. */ 956 mixed_string_buffer_append_unicode_grow (bp, 1); 957 bp->utf8_buffer[bp->utf8_buflen] = '\0'; 958 /* Return it. */ 959 return bp->utf8_buffer; 960} 961 962/* Free the memory pointed to by a 'struct mixed_string_buffer'. */ 963static inline void 964free_mixed_string_buffer (struct mixed_string_buffer *bp) 965{ 966 free (bp->utf8_buffer); 967 free (bp->curr_buffer); 968} 969 970 971/* ========================== Reading of tokens. ========================== */ 972 973 974enum token_type_ty 975{ 976 token_type_eof, 977 token_type_lparen, /* ( */ 978 token_type_rparen, /* ) */ 979 token_type_comma, /* , */ 980 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */ 981 token_type_symbol, /* symbol, number */ 982 token_type_other /* misc. operator */ 983}; 984typedef enum token_type_ty token_type_ty; 985 986typedef struct token_ty token_ty; 987struct token_ty 988{ 989 token_type_ty type; 990 char *string; /* for token_type_string, token_type_symbol */ 991 refcounted_string_list_ty *comment; /* for token_type_string */ 992 int line_number; 993}; 994 995 996/* There are two different input syntaxes for strings, "abc" and r"abc", 997 and two different input syntaxes for Unicode strings, u"abc" and ur"abc". 998 Which escape sequences are understood, i.e. what is interpreted specially 999 after backslash? 1000 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn 1001 r"abc" 1002 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} 1003 ur"abc" \unnnn 1004 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two 1005 \unnnn items. The \ooo and \xnn values are in the current source encoding. 1006 */ 1007 1008static int 1009phase7_getuc (int quote_char, 1010 bool triple, bool interpret_ansic, bool interpret_unicode, 1011 unsigned int *backslash_counter) 1012{ 1013 int c; 1014 1015 for (;;) 1016 { 1017 /* Use phase 2, because phase 3 elides comments. */ 1018 c = phase2_getc (); 1019 1020 if (c == UEOF) 1021 return P7_EOF; 1022 1023 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0)) 1024 { 1025 if (triple) 1026 { 1027 int c1 = phase2_getc (); 1028 if (c1 == quote_char) 1029 { 1030 int c2 = phase2_getc (); 1031 if (c2 == quote_char) 1032 return P7_STRING_END; 1033 phase2_ungetc (c2); 1034 } 1035 phase2_ungetc (c1); 1036 return UNICODE (c); 1037 } 1038 else 1039 return P7_STRING_END; 1040 } 1041 1042 if (c == '\n') 1043 { 1044 if (triple) 1045 { 1046 *backslash_counter = 0; 1047 return UNICODE ('\n'); 1048 } 1049 /* In r"..." and ur"..." strings, newline is only allowed 1050 immediately after an odd number of backslashes (although the 1051 backslashes are not interpreted!). */ 1052 if (!(interpret_ansic || (*backslash_counter & 1) == 0)) 1053 { 1054 *backslash_counter = 0; 1055 return UNICODE ('\n'); 1056 } 1057 phase2_ungetc (c); 1058 error_with_progname = false; 1059 error (0, 0, _("%s:%d: warning: unterminated string"), 1060 logical_file_name, line_number); 1061 error_with_progname = true; 1062 return P7_STRING_END; 1063 } 1064 1065 if (c != '\\') 1066 { 1067 *backslash_counter = 0; 1068 return UNICODE (c); 1069 } 1070 1071 /* Backslash handling. */ 1072 1073 if (!interpret_ansic && !interpret_unicode) 1074 { 1075 ++*backslash_counter; 1076 return UNICODE ('\\'); 1077 } 1078 1079 /* Dispatch according to the character following the backslash. */ 1080 c = phase2_getc (); 1081 if (c == UEOF) 1082 { 1083 ++*backslash_counter; 1084 return UNICODE ('\\'); 1085 } 1086 1087 if (interpret_ansic) 1088 switch (c) 1089 { 1090 case '\n': 1091 continue; 1092 case '\\': 1093 ++*backslash_counter; 1094 return UNICODE (c); 1095 case '\'': case '"': 1096 *backslash_counter = 0; 1097 return UNICODE (c); 1098 case 'a': 1099 *backslash_counter = 0; 1100 return UNICODE ('\a'); 1101 case 'b': 1102 *backslash_counter = 0; 1103 return UNICODE ('\b'); 1104 case 'f': 1105 *backslash_counter = 0; 1106 return UNICODE ('\f'); 1107 case 'n': 1108 *backslash_counter = 0; 1109 return UNICODE ('\n'); 1110 case 'r': 1111 *backslash_counter = 0; 1112 return UNICODE ('\r'); 1113 case 't': 1114 *backslash_counter = 0; 1115 return UNICODE ('\t'); 1116 case 'v': 1117 *backslash_counter = 0; 1118 return UNICODE ('\v'); 1119 case '0': case '1': case '2': case '3': case '4': 1120 case '5': case '6': case '7': 1121 { 1122 int n = c - '0'; 1123 1124 c = phase2_getc (); 1125 if (c != UEOF) 1126 { 1127 if (c >= '0' && c <= '7') 1128 { 1129 n = (n << 3) + (c - '0'); 1130 c = phase2_getc (); 1131 if (c != UEOF) 1132 { 1133 if (c >= '0' && c <= '7') 1134 n = (n << 3) + (c - '0'); 1135 else 1136 phase2_ungetc (c); 1137 } 1138 } 1139 else 1140 phase2_ungetc (c); 1141 } 1142 *backslash_counter = 0; 1143 return (unsigned char) n; 1144 } 1145 case 'x': 1146 { 1147 int c1 = phase2_getc (); 1148 int n1; 1149 1150 if (c1 >= '0' && c1 <= '9') 1151 n1 = c1 - '0'; 1152 else if (c1 >= 'A' && c1 <= 'F') 1153 n1 = c1 - 'A' + 10; 1154 else if (c1 >= 'a' && c1 <= 'f') 1155 n1 = c1 - 'a' + 10; 1156 else 1157 n1 = -1; 1158 1159 if (n1 >= 0) 1160 { 1161 int c2 = phase2_getc (); 1162 int n2; 1163 1164 if (c2 >= '0' && c2 <= '9') 1165 n2 = c2 - '0'; 1166 else if (c2 >= 'A' && c2 <= 'F') 1167 n2 = c2 - 'A' + 10; 1168 else if (c2 >= 'a' && c2 <= 'f') 1169 n2 = c2 - 'a' + 10; 1170 else 1171 n2 = -1; 1172 1173 if (n2 >= 0) 1174 { 1175 *backslash_counter = 0; 1176 return (unsigned char) ((n1 << 4) + n2); 1177 } 1178 1179 phase2_ungetc (c2); 1180 } 1181 phase2_ungetc (c1); 1182 phase2_ungetc (c); 1183 ++*backslash_counter; 1184 return UNICODE ('\\'); 1185 } 1186 } 1187 1188 if (interpret_unicode) 1189 { 1190 if (c == 'u') 1191 { 1192 unsigned char buf[4]; 1193 unsigned int n = 0; 1194 int i; 1195 1196 for (i = 0; i < 4; i++) 1197 { 1198 int c1 = phase2_getc (); 1199 1200 if (c1 >= '0' && c1 <= '9') 1201 n = (n << 4) + (c1 - '0'); 1202 else if (c1 >= 'A' && c1 <= 'F') 1203 n = (n << 4) + (c1 - 'A' + 10); 1204 else if (c1 >= 'a' && c1 <= 'f') 1205 n = (n << 4) + (c1 - 'a' + 10); 1206 else 1207 { 1208 phase2_ungetc (c1); 1209 while (--i >= 0) 1210 phase2_ungetc (buf[i]); 1211 phase2_ungetc (c); 1212 ++*backslash_counter; 1213 return UNICODE ('\\'); 1214 } 1215 1216 buf[i] = c1; 1217 } 1218 *backslash_counter = 0; 1219 return UNICODE (n); 1220 } 1221 1222 if (interpret_ansic) 1223 { 1224 if (c == 'U') 1225 { 1226 unsigned char buf[8]; 1227 unsigned int n = 0; 1228 int i; 1229 1230 for (i = 0; i < 8; i++) 1231 { 1232 int c1 = phase2_getc (); 1233 1234 if (c1 >= '0' && c1 <= '9') 1235 n = (n << 4) + (c1 - '0'); 1236 else if (c1 >= 'A' && c1 <= 'F') 1237 n = (n << 4) + (c1 - 'A' + 10); 1238 else if (c1 >= 'a' && c1 <= 'f') 1239 n = (n << 4) + (c1 - 'a' + 10); 1240 else 1241 { 1242 phase2_ungetc (c1); 1243 while (--i >= 0) 1244 phase2_ungetc (buf[i]); 1245 phase2_ungetc (c); 1246 ++*backslash_counter; 1247 return UNICODE ('\\'); 1248 } 1249 1250 buf[i] = c1; 1251 } 1252 if (n < 0x110000) 1253 { 1254 *backslash_counter = 0; 1255 return UNICODE (n); 1256 } 1257 1258 error_with_progname = false; 1259 error (0, 0, _("%s:%d: warning: invalid Unicode character"), 1260 logical_file_name, line_number); 1261 error_with_progname = true; 1262 1263 while (--i >= 0) 1264 phase2_ungetc (buf[i]); 1265 phase2_ungetc (c); 1266 ++*backslash_counter; 1267 return UNICODE ('\\'); 1268 } 1269 1270 if (c == 'N') 1271 { 1272 int c1 = phase2_getc (); 1273 if (c1 == '{') 1274 { 1275 unsigned char buf[UNINAME_MAX + 1]; 1276 int i; 1277 unsigned int n; 1278 1279 for (i = 0; i < UNINAME_MAX; i++) 1280 { 1281 int c2 = phase2_getc (); 1282 if (!(c2 >= ' ' && c2 <= '~')) 1283 { 1284 phase2_ungetc (c2); 1285 while (--i >= 0) 1286 phase2_ungetc (buf[i]); 1287 phase2_ungetc (c1); 1288 phase2_ungetc (c); 1289 ++*backslash_counter; 1290 return UNICODE ('\\'); 1291 } 1292 if (c2 == '}') 1293 break; 1294 buf[i] = c2; 1295 } 1296 buf[i] = '\0'; 1297 1298 n = unicode_name_character ((char *) buf); 1299 if (n != UNINAME_INVALID) 1300 { 1301 *backslash_counter = 0; 1302 return UNICODE (n); 1303 } 1304 1305 phase2_ungetc ('}'); 1306 while (--i >= 0) 1307 phase2_ungetc (buf[i]); 1308 } 1309 phase2_ungetc (c1); 1310 phase2_ungetc (c); 1311 ++*backslash_counter; 1312 return UNICODE ('\\'); 1313 } 1314 } 1315 } 1316 1317 phase2_ungetc (c); 1318 ++*backslash_counter; 1319 return UNICODE ('\\'); 1320 } 1321} 1322 1323 1324/* Combine characters into tokens. Discard whitespace except newlines at 1325 the end of logical lines. */ 1326 1327/* Number of pending open parentheses/braces/brackets. */ 1328static int open_pbb; 1329 1330static token_ty phase5_pushback[1]; 1331static int phase5_pushback_length; 1332 1333static void 1334phase5_get (token_ty *tp) 1335{ 1336 int c; 1337 1338 if (phase5_pushback_length) 1339 { 1340 *tp = phase5_pushback[--phase5_pushback_length]; 1341 return; 1342 } 1343 1344 for (;;) 1345 { 1346 tp->line_number = line_number; 1347 c = phase3_getc (); 1348 1349 switch (c) 1350 { 1351 case UEOF: 1352 tp->type = token_type_eof; 1353 return; 1354 1355 case ' ': 1356 case '\t': 1357 case '\f': 1358 /* Ignore whitespace and comments. */ 1359 continue; 1360 1361 case '\n': 1362 if (last_non_comment_line > last_comment_line) 1363 savable_comment_reset (); 1364 /* Ignore newline if and only if it is used for implicit line 1365 joining. */ 1366 if (open_pbb > 0) 1367 continue; 1368 tp->type = token_type_other; 1369 return; 1370 } 1371 1372 last_non_comment_line = tp->line_number; 1373 1374 switch (c) 1375 { 1376 case '.': 1377 { 1378 int c1 = phase3_getc (); 1379 phase3_ungetc (c1); 1380 if (!(c1 >= '0' && c1 <= '9')) 1381 { 1382 1383 tp->type = token_type_other; 1384 return; 1385 } 1386 } 1387 /* FALLTHROUGH */ 1388 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1389 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1390 case 'M': case 'N': case 'O': case 'P': case 'Q': 1391 case 'S': case 'T': case 'V': case 'W': case 'X': 1392 case 'Y': case 'Z': 1393 case '_': 1394 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1395 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1396 case 'm': case 'n': case 'o': case 'p': case 'q': 1397 case 's': case 't': case 'v': case 'w': case 'x': 1398 case 'y': case 'z': 1399 case '0': case '1': case '2': case '3': case '4': 1400 case '5': case '6': case '7': case '8': case '9': 1401 symbol: 1402 /* Symbol, or part of a number. */ 1403 { 1404 static char *buffer; 1405 static int bufmax; 1406 int bufpos; 1407 1408 bufpos = 0; 1409 for (;;) 1410 { 1411 if (bufpos >= bufmax) 1412 { 1413 bufmax = 2 * bufmax + 10; 1414 buffer = xrealloc (buffer, bufmax); 1415 } 1416 buffer[bufpos++] = c; 1417 c = phase3_getc (); 1418 switch (c) 1419 { 1420 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1421 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1422 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1423 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1424 case 'Y': case 'Z': 1425 case '_': 1426 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1427 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1428 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1429 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1430 case 'y': case 'z': 1431 case '0': case '1': case '2': case '3': case '4': 1432 case '5': case '6': case '7': case '8': case '9': 1433 continue; 1434 default: 1435 phase3_ungetc (c); 1436 break; 1437 } 1438 break; 1439 } 1440 if (bufpos >= bufmax) 1441 { 1442 bufmax = 2 * bufmax + 10; 1443 buffer = xrealloc (buffer, bufmax); 1444 } 1445 buffer[bufpos] = '\0'; 1446 tp->string = xstrdup (buffer); 1447 tp->type = token_type_symbol; 1448 return; 1449 } 1450 1451 /* Strings. */ 1452 { 1453 struct mixed_string_buffer literal; 1454 int quote_char; 1455 bool interpret_ansic; 1456 bool interpret_unicode; 1457 bool triple; 1458 unsigned int backslash_counter; 1459 1460 case 'R': case 'r': 1461 { 1462 int c1 = phase2_getc (); 1463 if (c1 == '"' || c1 == '\'') 1464 { 1465 quote_char = c1; 1466 interpret_ansic = false; 1467 interpret_unicode = false; 1468 goto string; 1469 } 1470 phase2_ungetc (c1); 1471 goto symbol; 1472 } 1473 1474 case 'U': case 'u': 1475 { 1476 int c1 = phase2_getc (); 1477 if (c1 == '"' || c1 == '\'') 1478 { 1479 quote_char = c1; 1480 interpret_ansic = true; 1481 interpret_unicode = true; 1482 goto string; 1483 } 1484 if (c1 == 'R' || c1 == 'r') 1485 { 1486 int c2 = phase2_getc (); 1487 if (c2 == '"' || c2 == '\'') 1488 { 1489 quote_char = c2; 1490 interpret_ansic = false; 1491 interpret_unicode = true; 1492 goto string; 1493 } 1494 phase2_ungetc (c2); 1495 } 1496 phase2_ungetc (c1); 1497 goto symbol; 1498 } 1499 1500 case '"': case '\'': 1501 quote_char = c; 1502 interpret_ansic = true; 1503 interpret_unicode = false; 1504 string: 1505 triple = false; 1506 { 1507 int c1 = phase2_getc (); 1508 if (c1 == quote_char) 1509 { 1510 int c2 = phase2_getc (); 1511 if (c2 == quote_char) 1512 triple = true; 1513 else 1514 { 1515 phase2_ungetc (c2); 1516 phase2_ungetc (c1); 1517 } 1518 } 1519 else 1520 phase2_ungetc (c1); 1521 } 1522 backslash_counter = 0; 1523 /* Start accumulating the string. */ 1524 init_mixed_string_buffer (&literal); 1525 for (;;) 1526 { 1527 int uc = phase7_getuc (quote_char, triple, interpret_ansic, 1528 interpret_unicode, &backslash_counter); 1529 1530 if (uc == P7_EOF || uc == P7_STRING_END) 1531 break; 1532 1533 if (IS_UNICODE (uc)) 1534 assert (UNICODE_VALUE (uc) >= 0 1535 && UNICODE_VALUE (uc) < 0x110000); 1536 1537 mixed_string_buffer_append (&literal, uc); 1538 } 1539 tp->string = xstrdup (mixed_string_buffer_result (&literal)); 1540 free_mixed_string_buffer (&literal); 1541 tp->comment = add_reference (savable_comment); 1542 tp->type = token_type_string; 1543 return; 1544 } 1545 1546 case '(': 1547 open_pbb++; 1548 tp->type = token_type_lparen; 1549 return; 1550 1551 case ')': 1552 if (open_pbb > 0) 1553 open_pbb--; 1554 tp->type = token_type_rparen; 1555 return; 1556 1557 case ',': 1558 tp->type = token_type_comma; 1559 return; 1560 1561 case '[': case '{': 1562 open_pbb++; 1563 tp->type = token_type_other; 1564 return; 1565 1566 case ']': case '}': 1567 if (open_pbb > 0) 1568 open_pbb--; 1569 tp->type = token_type_other; 1570 return; 1571 1572 default: 1573 /* We could carefully recognize each of the 2 and 3 character 1574 operators, but it is not necessary, as we only need to recognize 1575 gettext invocations. Don't bother. */ 1576 tp->type = token_type_other; 1577 return; 1578 } 1579 } 1580} 1581 1582/* Supports only one pushback token. */ 1583static void 1584phase5_unget (token_ty *tp) 1585{ 1586 if (tp->type != token_type_eof) 1587 { 1588 if (phase5_pushback_length == SIZEOF (phase5_pushback)) 1589 abort (); 1590 phase5_pushback[phase5_pushback_length++] = *tp; 1591 } 1592} 1593 1594 1595/* Combine adjacent strings to form a single string. Note that the end 1596 of a logical line appears as a token of its own, therefore strings that 1597 belong to different logical lines will not be concatenated. */ 1598 1599static void 1600x_python_lex (token_ty *tp) 1601{ 1602 phase5_get (tp); 1603 if (tp->type != token_type_string) 1604 return; 1605 for (;;) 1606 { 1607 token_ty tmp; 1608 size_t len; 1609 1610 phase5_get (&tmp); 1611 if (tmp.type != token_type_string) 1612 { 1613 phase5_unget (&tmp); 1614 return; 1615 } 1616 len = strlen (tp->string); 1617 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); 1618 strcpy (tp->string + len, tmp.string); 1619 free (tmp.string); 1620 } 1621} 1622 1623 1624/* ========================= Extracting strings. ========================== */ 1625 1626 1627/* Context lookup table. */ 1628static flag_context_list_table_ty *flag_context_list_table; 1629 1630 1631/* The file is broken into tokens. Scan the token stream, looking for 1632 a keyword, followed by a left paren, followed by a string. When we 1633 see this sequence, we have something to remember. We assume we are 1634 looking at a valid C or C++ program, and leave the complaints about 1635 the grammar to the compiler. 1636 1637 Normal handling: Look for 1638 keyword ( ... msgid ... ) 1639 Plural handling: Look for 1640 keyword ( ... msgid ... msgid_plural ... ) 1641 1642 We use recursion because the arguments before msgid or between msgid 1643 and msgid_plural can contain subexpressions of the same form. */ 1644 1645 1646/* Extract messages until the next balanced closing parenthesis. 1647 Extracted messages are added to MLP. 1648 Return true upon eof, false upon closing parenthesis. */ 1649static bool 1650extract_parenthesized (message_list_ty *mlp, 1651 flag_context_ty outer_context, 1652 flag_context_list_iterator_ty context_iter, 1653 struct arglist_parser *argparser) 1654{ 1655 /* Current argument number. */ 1656 int arg = 1; 1657 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1658 int state; 1659 /* Parameters of the keyword just seen. Defined only in state 1. */ 1660 const struct callshapes *next_shapes = NULL; 1661 /* Context iterator that will be used if the next token is a '('. */ 1662 flag_context_list_iterator_ty next_context_iter = 1663 passthrough_context_list_iterator; 1664 /* Current context. */ 1665 flag_context_ty inner_context = 1666 inherited_context (outer_context, 1667 flag_context_list_iterator_advance (&context_iter)); 1668 1669 /* Start state is 0. */ 1670 state = 0; 1671 1672 for (;;) 1673 { 1674 token_ty token; 1675 1676 x_python_lex (&token); 1677 switch (token.type) 1678 { 1679 case token_type_symbol: 1680 { 1681 void *keyword_value; 1682 1683 if (hash_find_entry (&keywords, token.string, strlen (token.string), 1684 &keyword_value) 1685 == 0) 1686 { 1687 next_shapes = (const struct callshapes *) keyword_value; 1688 state = 1; 1689 } 1690 else 1691 state = 0; 1692 } 1693 next_context_iter = 1694 flag_context_list_iterator ( 1695 flag_context_list_table_lookup ( 1696 flag_context_list_table, 1697 token.string, strlen (token.string))); 1698 free (token.string); 1699 continue; 1700 1701 case token_type_lparen: 1702 if (extract_parenthesized (mlp, inner_context, next_context_iter, 1703 arglist_parser_alloc (mlp, 1704 state ? next_shapes : NULL))) 1705 { 1706 xgettext_current_source_encoding = po_charset_utf8; 1707 arglist_parser_done (argparser, arg); 1708 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1709 return true; 1710 } 1711 next_context_iter = null_context_list_iterator; 1712 state = 0; 1713 continue; 1714 1715 case token_type_rparen: 1716 xgettext_current_source_encoding = po_charset_utf8; 1717 arglist_parser_done (argparser, arg); 1718 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1719 return false; 1720 1721 case token_type_comma: 1722 arg++; 1723 inner_context = 1724 inherited_context (outer_context, 1725 flag_context_list_iterator_advance ( 1726 &context_iter)); 1727 next_context_iter = passthrough_context_list_iterator; 1728 state = 0; 1729 continue; 1730 1731 case token_type_string: 1732 { 1733 lex_pos_ty pos; 1734 pos.file_name = logical_file_name; 1735 pos.line_number = token.line_number; 1736 1737 xgettext_current_source_encoding = po_charset_utf8; 1738 if (extract_all) 1739 remember_a_message (mlp, NULL, token.string, inner_context, 1740 &pos, token.comment); 1741 else 1742 arglist_parser_remember (argparser, arg, token.string, 1743 inner_context, 1744 pos.file_name, pos.line_number, 1745 token.comment); 1746 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1747 } 1748 drop_reference (token.comment); 1749 next_context_iter = null_context_list_iterator; 1750 state = 0; 1751 continue; 1752 1753 case token_type_eof: 1754 xgettext_current_source_encoding = po_charset_utf8; 1755 arglist_parser_done (argparser, arg); 1756 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1757 return true; 1758 1759 case token_type_other: 1760 next_context_iter = null_context_list_iterator; 1761 state = 0; 1762 continue; 1763 1764 default: 1765 abort (); 1766 } 1767 } 1768} 1769 1770 1771void 1772extract_python (FILE *f, 1773 const char *real_filename, const char *logical_filename, 1774 flag_context_list_table_ty *flag_table, 1775 msgdomain_list_ty *mdlp) 1776{ 1777 message_list_ty *mlp = mdlp->item[0]->messages; 1778 1779 fp = f; 1780 real_file_name = real_filename; 1781 logical_file_name = xstrdup (logical_filename); 1782 line_number = 1; 1783 1784 last_comment_line = -1; 1785 last_non_comment_line = -1; 1786 1787 xgettext_current_file_source_encoding = xgettext_global_source_encoding; 1788#if HAVE_ICONV 1789 xgettext_current_file_source_iconv = xgettext_global_source_iconv; 1790#endif 1791 1792 xgettext_current_source_encoding = xgettext_current_file_source_encoding; 1793#if HAVE_ICONV 1794 xgettext_current_source_iconv = xgettext_current_file_source_iconv; 1795#endif 1796 1797 continuation_or_nonblank_line = false; 1798 1799 open_pbb = 0; 1800 1801 flag_context_list_table = flag_table; 1802 1803 init_keywords (); 1804 1805 /* Eat tokens until eof is seen. When extract_parenthesized returns 1806 due to an unbalanced closing parenthesis, just restart it. */ 1807 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator, 1808 arglist_parser_alloc (mlp, NULL))) 1809 ; 1810 1811 fp = NULL; 1812 real_file_name = NULL; 1813 logical_file_name = NULL; 1814 line_number = 0; 1815} 1816