1/* GNU gettext - internationalization aids 2 Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc. 3 4 This file was written by Peter Miller <millerp@canb.auug.org.au>. 5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program; if not, write to the Free Software Foundation, 19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20 21 22#ifdef HAVE_CONFIG_H 23# include "config.h" 24#endif 25 26/* Specification. */ 27#include "po-lex.h" 28 29#include <errno.h> 30#include <limits.h> 31#include <stdio.h> 32#include <stdlib.h> 33#include <string.h> 34#include <stdarg.h> 35 36#if HAVE_ICONV 37# include <iconv.h> 38#endif 39 40#include "c-ctype.h" 41#include "linebreak.h" 42#include "vasprintf.h" 43#include "gettext.h" 44#include "po-charset.h" 45#include "xalloc.h" 46#include "exit.h" 47#include "error.h" 48#include "error-progname.h" 49#include "xvasprintf.h" 50#include "po-error.h" 51#include "po-xerror.h" 52#include "pos.h" 53#include "message.h" 54#include "str-list.h" 55#include "po-gram-gen2.h" 56 57#define _(str) gettext(str) 58 59#if HAVE_ICONV 60# include "utf8-ucs4.h" 61#endif 62 63#if HAVE_DECL_GETC_UNLOCKED 64# undef getc 65# define getc getc_unlocked 66#endif 67 68 69/* Current position within the PO file. */ 70lex_pos_ty gram_pos; 71int gram_pos_column; 72 73 74/* Error handling during the parsing of a PO file. 75 These functions can access gram_pos and gram_pos_column. */ 76 77/* VARARGS1 */ 78void 79po_gram_error (const char *fmt, ...) 80{ 81 va_list ap; 82 char *buffer; 83 84 va_start (ap, fmt); 85 if (vasprintf (&buffer, fmt, ap) < 0) 86 error (EXIT_FAILURE, 0, _("memory exhausted")); 87 va_end (ap); 88 po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number, 89 gram_pos_column + 1, false, buffer); 90 free (buffer); 91 92 if (error_message_count >= gram_max_allowed_errors) 93 po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); 94} 95 96/* VARARGS2 */ 97void 98po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...) 99{ 100 va_list ap; 101 char *buffer; 102 103 va_start (ap, fmt); 104 if (vasprintf (&buffer, fmt, ap) < 0) 105 error (EXIT_FAILURE, 0, _("memory exhausted")); 106 va_end (ap); 107 po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number, 108 (size_t)(-1), false, buffer); 109 free (buffer); 110 111 if (error_message_count >= gram_max_allowed_errors) 112 po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); 113} 114 115 116/* The lowest level of PO file parsing converts bytes to multibyte characters. 117 This is needed 118 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first 119 translation phase maps bytes to characters. 120 2. to keep track of the current column, for the sake of precise error 121 location. Emacs compile.el interprets the column in error messages 122 by default as a screen column number, not as character number. 123 3. to avoid skipping backslash-newline in the midst of a multibyte 124 character. If XY is a multibyte character, X \ newline Y is invalid. 125 */ 126 127/* Multibyte character data type. */ 128/* Note this depends on po_lex_charset and po_lex_iconv, which get set 129 while the file is being parsed. */ 130 131#define MBCHAR_BUF_SIZE 24 132 133struct mbchar 134{ 135 size_t bytes; /* number of bytes of current character, > 0 */ 136#if HAVE_ICONV 137 bool uc_valid; /* true if uc is a valid Unicode character */ 138 unsigned int uc; /* if uc_valid: the current character */ 139#endif 140 char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */ 141}; 142 143/* We want to pass multibyte characters by reference automatically, 144 therefore we use an array type. */ 145typedef struct mbchar mbchar_t[1]; 146 147/* A version of memcpy optimized for the case n <= 1. */ 148static inline void 149memcpy_small (void *dst, const void *src, size_t n) 150{ 151 if (n > 0) 152 { 153 char *q = (char *) dst; 154 const char *p = (const char *) src; 155 156 *q = *p; 157 if (--n > 0) 158 do *++q = *++p; while (--n > 0); 159 } 160} 161 162/* EOF (not a real character) is represented with bytes = 0 and 163 uc_valid = false. */ 164static inline bool 165mb_iseof (const mbchar_t mbc) 166{ 167 return (mbc->bytes == 0); 168} 169 170/* Access the current character. */ 171static inline const char * 172mb_ptr (const mbchar_t mbc) 173{ 174 return mbc->buf; 175} 176static inline size_t 177mb_len (const mbchar_t mbc) 178{ 179 return mbc->bytes; 180} 181 182/* Comparison of characters. */ 183 184static inline bool 185mb_iseq (const mbchar_t mbc, char sc) 186{ 187 /* Note: It is wrong to compare only mbc->uc, because when the encoding is 188 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we 189 want to treat it as an escape character, although it looks like a Yen 190 sign. */ 191#if HAVE_ICONV && 0 192 if (mbc->uc_valid) 193 return (mbc->uc == sc); /* wrong! */ 194 else 195#endif 196 return (mbc->bytes == 1 && mbc->buf[0] == sc); 197} 198 199static inline bool 200mb_isnul (const mbchar_t mbc) 201{ 202#if HAVE_ICONV 203 if (mbc->uc_valid) 204 return (mbc->uc == 0); 205 else 206#endif 207 return (mbc->bytes == 1 && mbc->buf[0] == 0); 208} 209 210static inline int 211mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2) 212{ 213#if HAVE_ICONV 214 if (mbc1->uc_valid && mbc2->uc_valid) 215 return (int) mbc1->uc - (int) mbc2->uc; 216 else 217#endif 218 return (mbc1->bytes == mbc2->bytes 219 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) 220 : mbc1->bytes < mbc2->bytes 221 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1) 222 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1)); 223} 224 225static inline bool 226mb_equal (const mbchar_t mbc1, const mbchar_t mbc2) 227{ 228#if HAVE_ICONV 229 if (mbc1->uc_valid && mbc2->uc_valid) 230 return mbc1->uc == mbc2->uc; 231 else 232#endif 233 return (mbc1->bytes == mbc2->bytes 234 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0); 235} 236 237/* <ctype.h>, <wctype.h> classification. */ 238 239static inline bool 240mb_isascii (const mbchar_t mbc) 241{ 242#if HAVE_ICONV 243 if (mbc->uc_valid) 244 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F); 245 else 246#endif 247 return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0; 248} 249 250/* Extra <wchar.h> function. */ 251 252/* Unprintable characters appear as a small box of width 1. */ 253#define MB_UNPRINTABLE_WIDTH 1 254 255static int 256mb_width (const mbchar_t mbc) 257{ 258#if HAVE_ICONV 259 if (mbc->uc_valid) 260 { 261 unsigned int uc = mbc->uc; 262 const char *encoding = 263 (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : ""); 264 int w = uc_width (uc, encoding); 265 /* For unprintable characters, arbitrarily return 0 for control 266 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */ 267 if (w >= 0) 268 return w; 269 if (uc >= 0x0000 && uc <= 0x001F) 270 { 271 if (uc == 0x0009) 272 return 8 - (gram_pos_column & 7); 273 return 0; 274 } 275 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029)) 276 return 0; 277 return MB_UNPRINTABLE_WIDTH; 278 } 279 else 280#endif 281 { 282 if (mbc->bytes == 1) 283 { 284 if ( 285#if CHAR_MIN < 0x00 /* to avoid gcc warning */ 286 mbc->buf[0] >= 0x00 && 287#endif 288 mbc->buf[0] <= 0x1F) 289 { 290 if (mbc->buf[0] == 0x09) 291 return 8 - (gram_pos_column & 7); 292 return 0; 293 } 294 if (mbc->buf[0] == 0x7F) 295 return 0; 296 } 297 return MB_UNPRINTABLE_WIDTH; 298 } 299} 300 301/* Output. */ 302static inline void 303mb_putc (const mbchar_t mbc, FILE *stream) 304{ 305 fwrite (mbc->buf, 1, mbc->bytes, stream); 306} 307 308/* Assignment. */ 309static inline void 310mb_setascii (mbchar_t mbc, char sc) 311{ 312 mbc->bytes = 1; 313#if HAVE_ICONV 314 mbc->uc_valid = 1; 315 mbc->uc = sc; 316#endif 317 mbc->buf[0] = sc; 318} 319 320/* Copying a character. */ 321static inline void 322mb_copy (mbchar_t new, const mbchar_t old) 323{ 324 memcpy_small (&new->buf[0], &old->buf[0], old->bytes); 325 new->bytes = old->bytes; 326#if HAVE_ICONV 327 if ((new->uc_valid = old->uc_valid)) 328 new->uc = old->uc; 329#endif 330} 331 332 333/* Multibyte character input. */ 334 335/* Number of characters that can be pushed back. 336 We need 1 for lex_getc, plus 1 for lex_ungetc. */ 337#define NPUSHBACK 2 338 339/* Data type of a multibyte character input stream. */ 340struct mbfile 341{ 342 FILE *fp; 343 bool eof_seen; 344 int have_pushback; 345 unsigned int bufcount; 346 char buf[MBCHAR_BUF_SIZE]; 347 struct mbchar pushback[NPUSHBACK]; 348}; 349 350/* We want to pass multibyte streams by reference automatically, 351 therefore we use an array type. */ 352typedef struct mbfile mbfile_t[1]; 353 354/* Whether invalid multibyte sequences in the input shall be signalled 355 or silently tolerated. */ 356static bool signal_eilseq; 357 358static inline void 359mbfile_init (mbfile_t mbf, FILE *stream) 360{ 361 mbf->fp = stream; 362 mbf->eof_seen = false; 363 mbf->have_pushback = 0; 364 mbf->bufcount = 0; 365} 366 367/* Read the next multibyte character from mbf and put it into mbc. 368 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */ 369static void 370mbfile_getc (mbchar_t mbc, mbfile_t mbf) 371{ 372 size_t bytes; 373 374 /* If EOF has already been seen, don't use getc. This matters if 375 mbf->fp is connected to an interactive tty. */ 376 if (mbf->eof_seen) 377 goto eof; 378 379 /* Return character pushed back, if there is one. */ 380 if (mbf->have_pushback > 0) 381 { 382 mbf->have_pushback--; 383 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]); 384 return; 385 } 386 387 /* Before using iconv, we need at least one byte. */ 388 if (mbf->bufcount == 0) 389 { 390 int c = getc (mbf->fp); 391 if (c == EOF) 392 { 393 mbf->eof_seen = true; 394 goto eof; 395 } 396 mbf->buf[0] = (unsigned char) c; 397 mbf->bufcount++; 398 } 399 400#if HAVE_ICONV 401 if (po_lex_iconv != (iconv_t)(-1)) 402 { 403 /* Use iconv on an increasing number of bytes. Read only as many 404 bytes from mbf->fp as needed. This is needed to give reasonable 405 interactive behaviour when mbf->fp is connected to an interactive 406 tty. */ 407 for (;;) 408 { 409 unsigned char scratchbuf[64]; 410 const char *inptr = &mbf->buf[0]; 411 size_t insize = mbf->bufcount; 412 char *outptr = (char *) &scratchbuf[0]; 413 size_t outsize = sizeof (scratchbuf); 414 415 size_t res = iconv (po_lex_iconv, 416 (ICONV_CONST char **) &inptr, &insize, 417 &outptr, &outsize); 418 /* We expect that a character has been produced if and only if 419 some input bytes have been consumed. */ 420 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf))) 421 abort (); 422 if (outsize == sizeof (scratchbuf)) 423 { 424 /* No character has been produced. Must be an error. */ 425 if (res != (size_t)(-1)) 426 abort (); 427 428 if (errno == EILSEQ) 429 { 430 /* An invalid multibyte sequence was encountered. */ 431 /* Return a single byte. */ 432 if (signal_eilseq) 433 po_gram_error (_("invalid multibyte sequence")); 434 bytes = 1; 435 mbc->uc_valid = false; 436 break; 437 } 438 else if (errno == EINVAL) 439 { 440 /* An incomplete multibyte character. */ 441 int c; 442 443 if (mbf->bufcount == MBCHAR_BUF_SIZE) 444 { 445 /* An overlong incomplete multibyte sequence was 446 encountered. */ 447 /* Return a single byte. */ 448 bytes = 1; 449 mbc->uc_valid = false; 450 break; 451 } 452 453 /* Read one more byte and retry iconv. */ 454 c = getc (mbf->fp); 455 if (c == EOF) 456 { 457 mbf->eof_seen = true; 458 if (ferror (mbf->fp)) 459 goto eof; 460 if (signal_eilseq) 461 po_gram_error (_("\ 462incomplete multibyte sequence at end of file")); 463 bytes = mbf->bufcount; 464 mbc->uc_valid = false; 465 break; 466 } 467 mbf->buf[mbf->bufcount++] = (unsigned char) c; 468 if (c == '\n') 469 { 470 if (signal_eilseq) 471 po_gram_error (_("\ 472incomplete multibyte sequence at end of line")); 473 bytes = mbf->bufcount - 1; 474 mbc->uc_valid = false; 475 break; 476 } 477 } 478 else 479 { 480 const char *errno_description = strerror (errno); 481 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, 482 xasprintf ("%s: %s", 483 _("iconv failure"), 484 errno_description)); 485 } 486 } 487 else 488 { 489 size_t outbytes = sizeof (scratchbuf) - outsize; 490 bytes = mbf->bufcount - insize; 491 492 /* We expect that one character has been produced. */ 493 if (bytes == 0) 494 abort (); 495 if (outbytes == 0) 496 abort (); 497 /* Convert it from UTF-8 to UCS-4. */ 498 if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes) 499 { 500 /* scratchbuf contains an out-of-range Unicode character 501 (> 0x10ffff). */ 502 if (signal_eilseq) 503 po_gram_error (_("invalid multibyte sequence")); 504 mbc->uc_valid = false; 505 break; 506 } 507 mbc->uc_valid = true; 508 break; 509 } 510 } 511 } 512 else 513#endif 514 { 515 if (po_lex_weird_cjk 516 /* Special handling of encodings with CJK structure. */ 517 && (unsigned char) mbf->buf[0] >= 0x80) 518 { 519 if (mbf->bufcount == 1) 520 { 521 /* Read one more byte. */ 522 int c = getc (mbf->fp); 523 if (c == EOF) 524 { 525 if (ferror (mbf->fp)) 526 { 527 mbf->eof_seen = true; 528 goto eof; 529 } 530 } 531 else 532 { 533 mbf->buf[1] = (unsigned char) c; 534 mbf->bufcount++; 535 } 536 } 537 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30) 538 /* Return a double byte. */ 539 bytes = 2; 540 else 541 /* Return a single byte. */ 542 bytes = 1; 543 } 544 else 545 { 546 /* Return a single byte. */ 547 bytes = 1; 548 } 549#if HAVE_ICONV 550 mbc->uc_valid = false; 551#endif 552 } 553 554 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ 555 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes); 556 mbc->bytes = bytes; 557 558 mbf->bufcount -= bytes; 559 if (mbf->bufcount > 0) 560 { 561 /* It's not worth calling memmove() for so few bytes. */ 562 unsigned int count = mbf->bufcount; 563 char *p = &mbf->buf[0]; 564 565 do 566 { 567 *p = *(p + bytes); 568 p++; 569 } 570 while (--count > 0); 571 } 572 return; 573 574eof: 575 /* An mbchar_t with bytes == 0 is used to indicate EOF. */ 576 mbc->bytes = 0; 577#if HAVE_ICONV 578 mbc->uc_valid = false; 579#endif 580 return; 581} 582 583static void 584mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf) 585{ 586 if (mbf->have_pushback >= NPUSHBACK) 587 abort (); 588 mb_copy (&mbf->pushback[mbf->have_pushback], mbc); 589 mbf->have_pushback++; 590} 591 592 593/* Lexer variables. */ 594 595static mbfile_t mbf; 596unsigned int gram_max_allowed_errors = 20; 597static bool po_lex_obsolete; 598static bool po_lex_previous; 599static bool pass_comments = false; 600bool pass_obsolete_entries = false; 601 602 603/* Prepare lexical analysis. */ 604void 605lex_start (FILE *fp, const char *real_filename, const char *logical_filename) 606{ 607 /* Ignore the logical_filename, because PO file entries already have 608 their file names attached. But use real_filename for error messages. */ 609 gram_pos.file_name = xstrdup (real_filename); 610 611 mbfile_init (mbf, fp); 612 613 gram_pos.line_number = 1; 614 gram_pos_column = 0; 615 signal_eilseq = true; 616 po_lex_obsolete = false; 617 po_lex_previous = false; 618 po_lex_charset_init (); 619} 620 621/* Terminate lexical analysis. */ 622void 623lex_end () 624{ 625 mbf->fp = NULL; 626 gram_pos.file_name = NULL; 627 gram_pos.line_number = 0; 628 gram_pos_column = 0; 629 signal_eilseq = false; 630 po_lex_obsolete = false; 631 po_lex_previous = false; 632 po_lex_charset_close (); 633} 634 635 636/* Read a single character, dealing with backslash-newline. 637 Also keep track of the current line number and column number. */ 638static void 639lex_getc (mbchar_t mbc) 640{ 641 for (;;) 642 { 643 mbfile_getc (mbc, mbf); 644 645 if (mb_iseof (mbc)) 646 { 647 if (ferror (mbf->fp)) 648 bomb: 649 { 650 const char *errno_description = strerror (errno); 651 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, 652 xasprintf ("%s: %s", 653 xasprintf (_("error while reading \"%s\""), 654 gram_pos.file_name), 655 errno_description)); 656 } 657 break; 658 } 659 660 if (mb_iseq (mbc, '\n')) 661 { 662 gram_pos.line_number++; 663 gram_pos_column = 0; 664 break; 665 } 666 667 gram_pos_column += mb_width (mbc); 668 669 if (mb_iseq (mbc, '\\')) 670 { 671 mbchar_t mbc2; 672 673 mbfile_getc (mbc2, mbf); 674 675 if (mb_iseof (mbc2)) 676 { 677 if (ferror (mbf->fp)) 678 goto bomb; 679 break; 680 } 681 682 if (!mb_iseq (mbc2, '\n')) 683 { 684 mbfile_ungetc (mbc2, mbf); 685 break; 686 } 687 688 gram_pos.line_number++; 689 gram_pos_column = 0; 690 } 691 else 692 break; 693 } 694} 695 696 697static void 698lex_ungetc (const mbchar_t mbc) 699{ 700 if (!mb_iseof (mbc)) 701 { 702 if (mb_iseq (mbc, '\n')) 703 /* Decrement the line number, but don't care about the column. */ 704 gram_pos.line_number--; 705 else 706 /* Decrement the column number. Also works well enough for tabs. */ 707 gram_pos_column -= mb_width (mbc); 708 709 mbfile_ungetc (mbc, mbf); 710 } 711} 712 713 714static int 715keyword_p (const char *s) 716{ 717 if (!po_lex_previous) 718 { 719 if (!strcmp (s, "domain")) 720 return DOMAIN; 721 if (!strcmp (s, "msgid")) 722 return MSGID; 723 if (!strcmp (s, "msgid_plural")) 724 return MSGID_PLURAL; 725 if (!strcmp (s, "msgstr")) 726 return MSGSTR; 727 if (!strcmp (s, "msgctxt")) 728 return MSGCTXT; 729 } 730 else 731 { 732 /* Inside a "#|" context, the keywords have a different meaning. */ 733 if (!strcmp (s, "msgid")) 734 return PREV_MSGID; 735 if (!strcmp (s, "msgid_plural")) 736 return PREV_MSGID_PLURAL; 737 if (!strcmp (s, "msgctxt")) 738 return PREV_MSGCTXT; 739 } 740 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s); 741 return NAME; 742} 743 744 745static int 746control_sequence () 747{ 748 mbchar_t mbc; 749 int val; 750 int max; 751 752 lex_getc (mbc); 753 if (mb_len (mbc) == 1) 754 switch (mb_ptr (mbc) [0]) 755 { 756 case 'n': 757 return '\n'; 758 759 case 't': 760 return '\t'; 761 762 case 'b': 763 return '\b'; 764 765 case 'r': 766 return '\r'; 767 768 case 'f': 769 return '\f'; 770 771 case 'v': 772 return '\v'; 773 774 case 'a': 775 return '\a'; 776 777 case '\\': 778 case '"': 779 return mb_ptr (mbc) [0]; 780 781 case '0': case '1': case '2': case '3': 782 case '4': case '5': case '6': case '7': 783 val = 0; 784 max = 0; 785 for (;;) 786 { 787 char c = mb_ptr (mbc) [0]; 788 /* Warning: not portable, can't depend on '0'..'7' ordering. */ 789 val = val * 8 + (c - '0'); 790 if (++max == 3) 791 break; 792 lex_getc (mbc); 793 if (mb_len (mbc) == 1) 794 switch (mb_ptr (mbc) [0]) 795 { 796 case '0': case '1': case '2': case '3': 797 case '4': case '5': case '6': case '7': 798 continue; 799 800 default: 801 break; 802 } 803 lex_ungetc (mbc); 804 break; 805 } 806 return val; 807 808 case 'x': 809 lex_getc (mbc); 810 if (mb_iseof (mbc) || mb_len (mbc) != 1 811 || !c_isxdigit (mb_ptr (mbc) [0])) 812 break; 813 814 val = 0; 815 for (;;) 816 { 817 char c = mb_ptr (mbc) [0]; 818 val *= 16; 819 if (c_isdigit (c)) 820 /* Warning: not portable, can't depend on '0'..'9' ordering */ 821 val += c - '0'; 822 else if (c_isupper (c)) 823 /* Warning: not portable, can't depend on 'A'..'F' ordering */ 824 val += c - 'A' + 10; 825 else 826 /* Warning: not portable, can't depend on 'a'..'f' ordering */ 827 val += c - 'a' + 10; 828 829 lex_getc (mbc); 830 if (mb_len (mbc) == 1) 831 switch (mb_ptr (mbc) [0]) 832 { 833 case '0': case '1': case '2': case '3': case '4': 834 case '5': case '6': case '7': case '8': case '9': 835 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 836 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 837 continue; 838 839 default: 840 break; 841 } 842 lex_ungetc (mbc); 843 break; 844 } 845 return val; 846 847 /* FIXME: \u and \U are not handled. */ 848 } 849 lex_ungetc (mbc); 850 po_gram_error (_("invalid control sequence")); 851 return ' '; 852} 853 854 855/* Return the next token in the PO file. The return codes are defined 856 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */ 857int 858po_gram_lex () 859{ 860 static char *buf; 861 static size_t bufmax; 862 mbchar_t mbc; 863 size_t bufpos; 864 865 for (;;) 866 { 867 lex_getc (mbc); 868 869 if (mb_iseof (mbc)) 870 /* Yacc want this for end of file. */ 871 return 0; 872 873 if (mb_len (mbc) == 1) 874 switch (mb_ptr (mbc) [0]) 875 { 876 case '\n': 877 po_lex_obsolete = false; 878 po_lex_previous = false; 879 /* Ignore whitespace, not relevant for the grammar. */ 880 break; 881 882 case ' ': 883 case '\t': 884 case '\r': 885 case '\f': 886 case '\v': 887 /* Ignore whitespace, not relevant for the grammar. */ 888 break; 889 890 case '#': 891 lex_getc (mbc); 892 if (mb_iseq (mbc, '~')) 893 /* A pseudo-comment beginning with #~ is found. This is 894 not a comment. It is the format for obsolete entries. 895 We simply discard the "#~" prefix. The following 896 characters are expected to be well formed. */ 897 { 898 po_lex_obsolete = true; 899 /* A pseudo-comment beginning with #~| denotes a previous 900 untranslated string in an obsolete entry. This does not 901 make much sense semantically, and is implemented here 902 for completeness only. */ 903 lex_getc (mbc); 904 if (mb_iseq (mbc, '|')) 905 po_lex_previous = true; 906 else 907 lex_ungetc (mbc); 908 break; 909 } 910 if (mb_iseq (mbc, '|')) 911 /* A pseudo-comment beginning with #| is found. This is 912 the previous untranslated string. We discard the "#|" 913 prefix, but change the keywords and string returns 914 accordingly. */ 915 { 916 po_lex_previous = true; 917 break; 918 } 919 920 /* Accumulate comments into a buffer. If we have been asked 921 to pass comments, generate a COMMENT token, otherwise 922 discard it. */ 923 signal_eilseq = false; 924 if (pass_comments) 925 { 926 bufpos = 0; 927 for (;;) 928 { 929 while (bufpos + mb_len (mbc) >= bufmax) 930 { 931 bufmax += 100; 932 buf = xrealloc (buf, bufmax); 933 } 934 if (mb_iseof (mbc) || mb_iseq (mbc, '\n')) 935 break; 936 937 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); 938 bufpos += mb_len (mbc); 939 940 lex_getc (mbc); 941 } 942 buf[bufpos] = '\0'; 943 944 po_gram_lval.string.string = buf; 945 po_gram_lval.string.pos = gram_pos; 946 po_gram_lval.string.obsolete = po_lex_obsolete; 947 po_lex_obsolete = false; 948 signal_eilseq = true; 949 return COMMENT; 950 } 951 else 952 { 953 /* We do this in separate loop because collecting large 954 comments while they get not passed to the upper layers 955 is not very efficient. */ 956 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n')) 957 lex_getc (mbc); 958 po_lex_obsolete = false; 959 signal_eilseq = true; 960 } 961 break; 962 963 case '"': 964 /* Accumulate a string. */ 965 bufpos = 0; 966 for (;;) 967 { 968 lex_getc (mbc); 969 while (bufpos + mb_len (mbc) >= bufmax) 970 { 971 bufmax += 100; 972 buf = xrealloc (buf, bufmax); 973 } 974 if (mb_iseof (mbc)) 975 { 976 po_gram_error_at_line (&gram_pos, 977 _("end-of-file within string")); 978 break; 979 } 980 if (mb_iseq (mbc, '\n')) 981 { 982 po_gram_error_at_line (&gram_pos, 983 _("end-of-line within string")); 984 break; 985 } 986 if (mb_iseq (mbc, '"')) 987 break; 988 if (mb_iseq (mbc, '\\')) 989 { 990 buf[bufpos++] = control_sequence (); 991 continue; 992 } 993 994 /* Add mbc to the accumulator. */ 995 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); 996 bufpos += mb_len (mbc); 997 } 998 buf[bufpos] = '\0'; 999 1000 /* Strings cannot contain the msgctxt separator, because it cannot 1001 be faithfully represented in the msgid of a .mo file. */ 1002 if (strchr (buf, MSGCTXT_SEPARATOR) != NULL) 1003 po_gram_error_at_line (&gram_pos, 1004 _("context separator <EOT> within string")); 1005 1006 /* FIXME: Treatment of embedded \000 chars is incorrect. */ 1007 po_gram_lval.string.string = xstrdup (buf); 1008 po_gram_lval.string.pos = gram_pos; 1009 po_gram_lval.string.obsolete = po_lex_obsolete; 1010 return (po_lex_previous ? PREV_STRING : STRING); 1011 1012 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1013 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1014 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1015 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1016 case 'y': case 'z': 1017 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1018 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1019 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1020 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1021 case 'Y': case 'Z': 1022 case '_': case '$': 1023 bufpos = 0; 1024 for (;;) 1025 { 1026 char c = mb_ptr (mbc) [0]; 1027 if (bufpos + 1 >= bufmax) 1028 { 1029 bufmax += 100; 1030 buf = xrealloc (buf, bufmax); 1031 } 1032 buf[bufpos++] = c; 1033 lex_getc (mbc); 1034 if (mb_len (mbc) == 1) 1035 switch (mb_ptr (mbc) [0]) 1036 { 1037 default: 1038 break; 1039 case 'a': case 'b': case 'c': case 'd': case 'e': 1040 case 'f': case 'g': case 'h': case 'i': case 'j': 1041 case 'k': case 'l': case 'm': case 'n': case 'o': 1042 case 'p': case 'q': case 'r': case 's': case 't': 1043 case 'u': case 'v': case 'w': case 'x': case 'y': 1044 case 'z': 1045 case 'A': case 'B': case 'C': case 'D': case 'E': 1046 case 'F': case 'G': case 'H': case 'I': case 'J': 1047 case 'K': case 'L': case 'M': case 'N': case 'O': 1048 case 'P': case 'Q': case 'R': case 'S': case 'T': 1049 case 'U': case 'V': case 'W': case 'X': case 'Y': 1050 case 'Z': 1051 case '_': case '$': 1052 case '0': case '1': case '2': case '3': case '4': 1053 case '5': case '6': case '7': case '8': case '9': 1054 continue; 1055 } 1056 break; 1057 } 1058 lex_ungetc (mbc); 1059 1060 buf[bufpos] = '\0'; 1061 1062 { 1063 int k = keyword_p (buf); 1064 if (k == NAME) 1065 { 1066 po_gram_lval.string.string = xstrdup (buf); 1067 po_gram_lval.string.pos = gram_pos; 1068 po_gram_lval.string.obsolete = po_lex_obsolete; 1069 } 1070 else 1071 { 1072 po_gram_lval.pos.pos = gram_pos; 1073 po_gram_lval.pos.obsolete = po_lex_obsolete; 1074 } 1075 return k; 1076 } 1077 1078 case '0': case '1': case '2': case '3': case '4': 1079 case '5': case '6': case '7': case '8': case '9': 1080 bufpos = 0; 1081 for (;;) 1082 { 1083 char c = mb_ptr (mbc) [0]; 1084 if (bufpos + 1 >= bufmax) 1085 { 1086 bufmax += 100; 1087 buf = xrealloc (buf, bufmax + 1); 1088 } 1089 buf[bufpos++] = c; 1090 lex_getc (mbc); 1091 if (mb_len (mbc) == 1) 1092 switch (mb_ptr (mbc) [0]) 1093 { 1094 default: 1095 break; 1096 1097 case '0': case '1': case '2': case '3': case '4': 1098 case '5': case '6': case '7': case '8': case '9': 1099 continue; 1100 } 1101 break; 1102 } 1103 lex_ungetc (mbc); 1104 1105 buf[bufpos] = '\0'; 1106 1107 po_gram_lval.number.number = atol (buf); 1108 po_gram_lval.number.pos = gram_pos; 1109 po_gram_lval.number.obsolete = po_lex_obsolete; 1110 return NUMBER; 1111 1112 case '[': 1113 po_gram_lval.pos.pos = gram_pos; 1114 po_gram_lval.pos.obsolete = po_lex_obsolete; 1115 return '['; 1116 1117 case ']': 1118 po_gram_lval.pos.pos = gram_pos; 1119 po_gram_lval.pos.obsolete = po_lex_obsolete; 1120 return ']'; 1121 1122 default: 1123 /* This will cause a syntax error. */ 1124 return JUNK; 1125 } 1126 else 1127 /* This will cause a syntax error. */ 1128 return JUNK; 1129 } 1130} 1131 1132 1133/* po_gram_lex() can return comments as COMMENT. Switch this on or off. */ 1134void 1135po_lex_pass_comments (bool flag) 1136{ 1137 pass_comments = flag; 1138} 1139 1140 1141/* po_gram_lex() can return obsolete entries as if they were normal entries. 1142 Switch this on or off. */ 1143void 1144po_lex_pass_obsolete_entries (bool flag) 1145{ 1146 pass_obsolete_entries = flag; 1147} 1148