1/* GNU gettext - internationalization aids 2 Copyright (C) 1995-1999, 2000-2007 Free Software Foundation, Inc. 3 4 This file was written by Peter Miller <millerp@canb.auug.org.au>. 5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>. 6 7 This program is free software: you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3 of the License, or 10 (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 19 20 21#ifdef HAVE_CONFIG_H 22# include "config.h" 23#endif 24 25/* Specification. */ 26#include "po-lex.h" 27 28#include <errno.h> 29#include <limits.h> 30#include <stdio.h> 31#include <stdlib.h> 32#include <string.h> 33#include <stdarg.h> 34 35#if HAVE_ICONV 36# include <iconv.h> 37#endif 38 39#include "c-ctype.h" 40#include "linebreak.h" 41#include "uniwidth.h" 42#include "gettext.h" 43#include "po-charset.h" 44#include "xalloc.h" 45#include "error.h" 46#include "error-progname.h" 47#include "xvasprintf.h" 48#include "po-error.h" 49#include "po-xerror.h" 50#include "pos.h" 51#include "message.h" 52#include "str-list.h" 53#include "po-gram-gen2.h" 54 55#define _(str) gettext(str) 56 57#if HAVE_ICONV 58# include "unistr.h" 59#endif 60 61#if HAVE_DECL_GETC_UNLOCKED 62# undef getc 63# define getc getc_unlocked 64#endif 65 66 67/* Current position within the PO file. */ 68lex_pos_ty gram_pos; 69int gram_pos_column; 70 71 72/* Error handling during the parsing of a PO file. 73 These functions can access gram_pos and gram_pos_column. 
*/ 74 75/* VARARGS1 */ 76void 77po_gram_error (const char *fmt, ...) 78{ 79 va_list ap; 80 char *buffer; 81 82 va_start (ap, fmt); 83 if (vasprintf (&buffer, fmt, ap) < 0) 84 error (EXIT_FAILURE, 0, _("memory exhausted")); 85 va_end (ap); 86 po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number, 87 gram_pos_column + 1, false, buffer); 88 free (buffer); 89 90 if (error_message_count >= gram_max_allowed_errors) 91 po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); 92} 93 94/* VARARGS2 */ 95void 96po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...) 97{ 98 va_list ap; 99 char *buffer; 100 101 va_start (ap, fmt); 102 if (vasprintf (&buffer, fmt, ap) < 0) 103 error (EXIT_FAILURE, 0, _("memory exhausted")); 104 va_end (ap); 105 po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number, 106 (size_t)(-1), false, buffer); 107 free (buffer); 108 109 if (error_message_count >= gram_max_allowed_errors) 110 po_error (EXIT_FAILURE, 0, _("too many errors, aborting")); 111} 112 113 114/* The lowest level of PO file parsing converts bytes to multibyte characters. 115 This is needed 116 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first 117 translation phase maps bytes to characters. 118 2. to keep track of the current column, for the sake of precise error 119 location. Emacs compile.el interprets the column in error messages 120 by default as a screen column number, not as character number. 121 3. to avoid skipping backslash-newline in the midst of a multibyte 122 character. If XY is a multibyte character, X \ newline Y is invalid. 123 */ 124 125/* Multibyte character data type. */ 126/* Note this depends on po_lex_charset and po_lex_iconv, which get set 127 while the file is being parsed. 
*/ 128 129#define MBCHAR_BUF_SIZE 24 130 131struct mbchar 132{ 133 size_t bytes; /* number of bytes of current character, > 0 */ 134#if HAVE_ICONV 135 bool uc_valid; /* true if uc is a valid Unicode character */ 136 unsigned int uc; /* if uc_valid: the current character */ 137#endif 138 char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */ 139}; 140 141/* We want to pass multibyte characters by reference automatically, 142 therefore we use an array type. */ 143typedef struct mbchar mbchar_t[1]; 144 145/* A version of memcpy optimized for the case n <= 1. */ 146static inline void 147memcpy_small (void *dst, const void *src, size_t n) 148{ 149 if (n > 0) 150 { 151 char *q = (char *) dst; 152 const char *p = (const char *) src; 153 154 *q = *p; 155 if (--n > 0) 156 do *++q = *++p; while (--n > 0); 157 } 158} 159 160/* EOF (not a real character) is represented with bytes = 0 and 161 uc_valid = false. */ 162static inline bool 163mb_iseof (const mbchar_t mbc) 164{ 165 return (mbc->bytes == 0); 166} 167 168/* Access the current character. */ 169static inline const char * 170mb_ptr (const mbchar_t mbc) 171{ 172 return mbc->buf; 173} 174static inline size_t 175mb_len (const mbchar_t mbc) 176{ 177 return mbc->bytes; 178} 179 180/* Comparison of characters. */ 181 182static inline bool 183mb_iseq (const mbchar_t mbc, char sc) 184{ 185 /* Note: It is wrong to compare only mbc->uc, because when the encoding is 186 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we 187 want to treat it as an escape character, although it looks like a Yen 188 sign. */ 189#if HAVE_ICONV && 0 190 if (mbc->uc_valid) 191 return (mbc->uc == sc); /* wrong! 
*/ 192 else 193#endif 194 return (mbc->bytes == 1 && mbc->buf[0] == sc); 195} 196 197static inline bool 198mb_isnul (const mbchar_t mbc) 199{ 200#if HAVE_ICONV 201 if (mbc->uc_valid) 202 return (mbc->uc == 0); 203 else 204#endif 205 return (mbc->bytes == 1 && mbc->buf[0] == 0); 206} 207 208static inline int 209mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2) 210{ 211#if HAVE_ICONV 212 if (mbc1->uc_valid && mbc2->uc_valid) 213 return (int) mbc1->uc - (int) mbc2->uc; 214 else 215#endif 216 return (mbc1->bytes == mbc2->bytes 217 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) 218 : mbc1->bytes < mbc2->bytes 219 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1) 220 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1)); 221} 222 223static inline bool 224mb_equal (const mbchar_t mbc1, const mbchar_t mbc2) 225{ 226#if HAVE_ICONV 227 if (mbc1->uc_valid && mbc2->uc_valid) 228 return mbc1->uc == mbc2->uc; 229 else 230#endif 231 return (mbc1->bytes == mbc2->bytes 232 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0); 233} 234 235/* <ctype.h>, <wctype.h> classification. */ 236 237static inline bool 238mb_isascii (const mbchar_t mbc) 239{ 240#if HAVE_ICONV 241 if (mbc->uc_valid) 242 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F); 243 else 244#endif 245 return (mbc->bytes == 1 246#if CHAR_MIN < 0x00 /* to avoid gcc warning */ 247 && mbc->buf[0] >= 0x00 248#endif 249#if CHAR_MAX > 0x7F /* to avoid gcc warning */ 250 && mbc->buf[0] <= 0x7F 251#endif 252 ); 253} 254 255/* Extra <wchar.h> function. */ 256 257/* Unprintable characters appear as a small box of width 1. */ 258#define MB_UNPRINTABLE_WIDTH 1 259 260static int 261mb_width (const mbchar_t mbc) 262{ 263#if HAVE_ICONV 264 if (mbc->uc_valid) 265 { 266 unsigned int uc = mbc->uc; 267 const char *encoding = 268 (po_lex_iconv != (iconv_t)(-1) ? 
po_lex_charset : ""); 269 int w = uc_width (uc, encoding); 270 /* For unprintable characters, arbitrarily return 0 for control 271 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */ 272 if (w >= 0) 273 return w; 274 if (uc >= 0x0000 && uc <= 0x001F) 275 { 276 if (uc == 0x0009) 277 return 8 - (gram_pos_column & 7); 278 return 0; 279 } 280 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029)) 281 return 0; 282 return MB_UNPRINTABLE_WIDTH; 283 } 284 else 285#endif 286 { 287 if (mbc->bytes == 1) 288 { 289 if ( 290#if CHAR_MIN < 0x00 /* to avoid gcc warning */ 291 mbc->buf[0] >= 0x00 && 292#endif 293 mbc->buf[0] <= 0x1F) 294 { 295 if (mbc->buf[0] == 0x09) 296 return 8 - (gram_pos_column & 7); 297 return 0; 298 } 299 if (mbc->buf[0] == 0x7F) 300 return 0; 301 } 302 return MB_UNPRINTABLE_WIDTH; 303 } 304} 305 306/* Output. */ 307static inline void 308mb_putc (const mbchar_t mbc, FILE *stream) 309{ 310 fwrite (mbc->buf, 1, mbc->bytes, stream); 311} 312 313/* Assignment. */ 314static inline void 315mb_setascii (mbchar_t mbc, char sc) 316{ 317 mbc->bytes = 1; 318#if HAVE_ICONV 319 mbc->uc_valid = 1; 320 mbc->uc = sc; 321#endif 322 mbc->buf[0] = sc; 323} 324 325/* Copying a character. */ 326static inline void 327mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc) 328{ 329 memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes); 330 new_mbc->bytes = old_mbc->bytes; 331#if HAVE_ICONV 332 if ((new_mbc->uc_valid = old_mbc->uc_valid)) 333 new_mbc->uc = old_mbc->uc; 334#endif 335} 336 337 338/* Multibyte character input. */ 339 340/* Number of characters that can be pushed back. 341 We need 1 for lex_getc, plus 1 for lex_ungetc. */ 342#define NPUSHBACK 2 343 344/* Data type of a multibyte character input stream. 
*/ 345struct mbfile 346{ 347 FILE *fp; 348 bool eof_seen; 349 int have_pushback; 350 unsigned int bufcount; 351 char buf[MBCHAR_BUF_SIZE]; 352 struct mbchar pushback[NPUSHBACK]; 353}; 354 355/* We want to pass multibyte streams by reference automatically, 356 therefore we use an array type. */ 357typedef struct mbfile mbfile_t[1]; 358 359/* Whether invalid multibyte sequences in the input shall be signalled 360 or silently tolerated. */ 361static bool signal_eilseq; 362 363static inline void 364mbfile_init (mbfile_t mbf, FILE *stream) 365{ 366 mbf->fp = stream; 367 mbf->eof_seen = false; 368 mbf->have_pushback = 0; 369 mbf->bufcount = 0; 370} 371 372/* Read the next multibyte character from mbf and put it into mbc. 373 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */ 374static void 375mbfile_getc (mbchar_t mbc, mbfile_t mbf) 376{ 377 size_t bytes; 378 379 /* If EOF has already been seen, don't use getc. This matters if 380 mbf->fp is connected to an interactive tty. */ 381 if (mbf->eof_seen) 382 goto eof; 383 384 /* Return character pushed back, if there is one. */ 385 if (mbf->have_pushback > 0) 386 { 387 mbf->have_pushback--; 388 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]); 389 return; 390 } 391 392 /* Before using iconv, we need at least one byte. */ 393 if (mbf->bufcount == 0) 394 { 395 int c = getc (mbf->fp); 396 if (c == EOF) 397 { 398 mbf->eof_seen = true; 399 goto eof; 400 } 401 mbf->buf[0] = (unsigned char) c; 402 mbf->bufcount++; 403 } 404 405#if HAVE_ICONV 406 if (po_lex_iconv != (iconv_t)(-1)) 407 { 408 /* Use iconv on an increasing number of bytes. Read only as many 409 bytes from mbf->fp as needed. This is needed to give reasonable 410 interactive behaviour when mbf->fp is connected to an interactive 411 tty. 
*/ 412 for (;;) 413 { 414 unsigned char scratchbuf[64]; 415 const char *inptr = &mbf->buf[0]; 416 size_t insize = mbf->bufcount; 417 char *outptr = (char *) &scratchbuf[0]; 418 size_t outsize = sizeof (scratchbuf); 419 420 size_t res = iconv (po_lex_iconv, 421 (ICONV_CONST char **) &inptr, &insize, 422 &outptr, &outsize); 423 /* We expect that a character has been produced if and only if 424 some input bytes have been consumed. */ 425 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf))) 426 abort (); 427 if (outsize == sizeof (scratchbuf)) 428 { 429 /* No character has been produced. Must be an error. */ 430 if (res != (size_t)(-1)) 431 abort (); 432 433 if (errno == EILSEQ) 434 { 435 /* An invalid multibyte sequence was encountered. */ 436 /* Return a single byte. */ 437 if (signal_eilseq) 438 po_gram_error (_("invalid multibyte sequence")); 439 bytes = 1; 440 mbc->uc_valid = false; 441 break; 442 } 443 else if (errno == EINVAL) 444 { 445 /* An incomplete multibyte character. */ 446 int c; 447 448 if (mbf->bufcount == MBCHAR_BUF_SIZE) 449 { 450 /* An overlong incomplete multibyte sequence was 451 encountered. */ 452 /* Return a single byte. */ 453 bytes = 1; 454 mbc->uc_valid = false; 455 break; 456 } 457 458 /* Read one more byte and retry iconv. 
*/ 459 c = getc (mbf->fp); 460 if (c == EOF) 461 { 462 mbf->eof_seen = true; 463 if (ferror (mbf->fp)) 464 goto eof; 465 if (signal_eilseq) 466 po_gram_error (_("\ 467incomplete multibyte sequence at end of file")); 468 bytes = mbf->bufcount; 469 mbc->uc_valid = false; 470 break; 471 } 472 mbf->buf[mbf->bufcount++] = (unsigned char) c; 473 if (c == '\n') 474 { 475 if (signal_eilseq) 476 po_gram_error (_("\ 477incomplete multibyte sequence at end of line")); 478 bytes = mbf->bufcount - 1; 479 mbc->uc_valid = false; 480 break; 481 } 482 } 483 else 484 { 485 const char *errno_description = strerror (errno); 486 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, 487 xasprintf ("%s: %s", 488 _("iconv failure"), 489 errno_description)); 490 } 491 } 492 else 493 { 494 size_t outbytes = sizeof (scratchbuf) - outsize; 495 bytes = mbf->bufcount - insize; 496 497 /* We expect that one character has been produced. */ 498 if (bytes == 0) 499 abort (); 500 if (outbytes == 0) 501 abort (); 502 /* Convert it from UTF-8 to UCS-4. */ 503 if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes) 504 { 505 /* scratchbuf contains an out-of-range Unicode character 506 (> 0x10ffff). */ 507 if (signal_eilseq) 508 po_gram_error (_("invalid multibyte sequence")); 509 mbc->uc_valid = false; 510 break; 511 } 512 mbc->uc_valid = true; 513 break; 514 } 515 } 516 } 517 else 518#endif 519 { 520 if (po_lex_weird_cjk 521 /* Special handling of encodings with CJK structure. */ 522 && (unsigned char) mbf->buf[0] >= 0x80) 523 { 524 if (mbf->bufcount == 1) 525 { 526 /* Read one more byte. */ 527 int c = getc (mbf->fp); 528 if (c == EOF) 529 { 530 if (ferror (mbf->fp)) 531 { 532 mbf->eof_seen = true; 533 goto eof; 534 } 535 } 536 else 537 { 538 mbf->buf[1] = (unsigned char) c; 539 mbf->bufcount++; 540 } 541 } 542 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30) 543 /* Return a double byte. */ 544 bytes = 2; 545 else 546 /* Return a single byte. 
*/ 547 bytes = 1; 548 } 549 else 550 { 551 /* Return a single byte. */ 552 bytes = 1; 553 } 554#if HAVE_ICONV 555 mbc->uc_valid = false; 556#endif 557 } 558 559 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ 560 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes); 561 mbc->bytes = bytes; 562 563 mbf->bufcount -= bytes; 564 if (mbf->bufcount > 0) 565 { 566 /* It's not worth calling memmove() for so few bytes. */ 567 unsigned int count = mbf->bufcount; 568 char *p = &mbf->buf[0]; 569 570 do 571 { 572 *p = *(p + bytes); 573 p++; 574 } 575 while (--count > 0); 576 } 577 return; 578 579eof: 580 /* An mbchar_t with bytes == 0 is used to indicate EOF. */ 581 mbc->bytes = 0; 582#if HAVE_ICONV 583 mbc->uc_valid = false; 584#endif 585 return; 586} 587 588static void 589mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf) 590{ 591 if (mbf->have_pushback >= NPUSHBACK) 592 abort (); 593 mb_copy (&mbf->pushback[mbf->have_pushback], mbc); 594 mbf->have_pushback++; 595} 596 597 598/* Lexer variables. */ 599 600static mbfile_t mbf; 601unsigned int gram_max_allowed_errors = 20; 602static bool po_lex_obsolete; 603static bool po_lex_previous; 604static bool pass_comments = false; 605bool pass_obsolete_entries = false; 606 607 608/* Prepare lexical analysis. */ 609void 610lex_start (FILE *fp, const char *real_filename, const char *logical_filename) 611{ 612 /* Ignore the logical_filename, because PO file entries already have 613 their file names attached. But use real_filename for error messages. */ 614 gram_pos.file_name = xstrdup (real_filename); 615 616 mbfile_init (mbf, fp); 617 618 gram_pos.line_number = 1; 619 gram_pos_column = 0; 620 signal_eilseq = true; 621 po_lex_obsolete = false; 622 po_lex_previous = false; 623 po_lex_charset_init (); 624} 625 626/* Terminate lexical analysis. 
*/ 627void 628lex_end () 629{ 630 mbf->fp = NULL; 631 gram_pos.file_name = NULL; 632 gram_pos.line_number = 0; 633 gram_pos_column = 0; 634 signal_eilseq = false; 635 po_lex_obsolete = false; 636 po_lex_previous = false; 637 po_lex_charset_close (); 638} 639 640 641/* Read a single character, dealing with backslash-newline. 642 Also keep track of the current line number and column number. */ 643static void 644lex_getc (mbchar_t mbc) 645{ 646 for (;;) 647 { 648 mbfile_getc (mbc, mbf); 649 650 if (mb_iseof (mbc)) 651 { 652 if (ferror (mbf->fp)) 653 bomb: 654 { 655 const char *errno_description = strerror (errno); 656 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, 657 xasprintf ("%s: %s", 658 xasprintf (_("error while reading \"%s\""), 659 gram_pos.file_name), 660 errno_description)); 661 } 662 break; 663 } 664 665 if (mb_iseq (mbc, '\n')) 666 { 667 gram_pos.line_number++; 668 gram_pos_column = 0; 669 break; 670 } 671 672 gram_pos_column += mb_width (mbc); 673 674 if (mb_iseq (mbc, '\\')) 675 { 676 mbchar_t mbc2; 677 678 mbfile_getc (mbc2, mbf); 679 680 if (mb_iseof (mbc2)) 681 { 682 if (ferror (mbf->fp)) 683 goto bomb; 684 break; 685 } 686 687 if (!mb_iseq (mbc2, '\n')) 688 { 689 mbfile_ungetc (mbc2, mbf); 690 break; 691 } 692 693 gram_pos.line_number++; 694 gram_pos_column = 0; 695 } 696 else 697 break; 698 } 699} 700 701 702static void 703lex_ungetc (const mbchar_t mbc) 704{ 705 if (!mb_iseof (mbc)) 706 { 707 if (mb_iseq (mbc, '\n')) 708 /* Decrement the line number, but don't care about the column. */ 709 gram_pos.line_number--; 710 else 711 /* Decrement the column number. Also works well enough for tabs. 
*/ 712 gram_pos_column -= mb_width (mbc); 713 714 mbfile_ungetc (mbc, mbf); 715 } 716} 717 718 719static int 720keyword_p (const char *s) 721{ 722 if (!po_lex_previous) 723 { 724 if (!strcmp (s, "domain")) 725 return DOMAIN; 726 if (!strcmp (s, "msgid")) 727 return MSGID; 728 if (!strcmp (s, "msgid_plural")) 729 return MSGID_PLURAL; 730 if (!strcmp (s, "msgstr")) 731 return MSGSTR; 732 if (!strcmp (s, "msgctxt")) 733 return MSGCTXT; 734 } 735 else 736 { 737 /* Inside a "#|" context, the keywords have a different meaning. */ 738 if (!strcmp (s, "msgid")) 739 return PREV_MSGID; 740 if (!strcmp (s, "msgid_plural")) 741 return PREV_MSGID_PLURAL; 742 if (!strcmp (s, "msgctxt")) 743 return PREV_MSGCTXT; 744 } 745 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s); 746 return NAME; 747} 748 749 750static int 751control_sequence () 752{ 753 mbchar_t mbc; 754 int val; 755 int max; 756 757 lex_getc (mbc); 758 if (mb_len (mbc) == 1) 759 switch (mb_ptr (mbc) [0]) 760 { 761 case 'n': 762 return '\n'; 763 764 case 't': 765 return '\t'; 766 767 case 'b': 768 return '\b'; 769 770 case 'r': 771 return '\r'; 772 773 case 'f': 774 return '\f'; 775 776 case 'v': 777 return '\v'; 778 779 case 'a': 780 return '\a'; 781 782 case '\\': 783 case '"': 784 return mb_ptr (mbc) [0]; 785 786 case '0': case '1': case '2': case '3': 787 case '4': case '5': case '6': case '7': 788 val = 0; 789 max = 0; 790 for (;;) 791 { 792 char c = mb_ptr (mbc) [0]; 793 /* Warning: not portable, can't depend on '0'..'7' ordering. 
*/ 794 val = val * 8 + (c - '0'); 795 if (++max == 3) 796 break; 797 lex_getc (mbc); 798 if (mb_len (mbc) == 1) 799 switch (mb_ptr (mbc) [0]) 800 { 801 case '0': case '1': case '2': case '3': 802 case '4': case '5': case '6': case '7': 803 continue; 804 805 default: 806 break; 807 } 808 lex_ungetc (mbc); 809 break; 810 } 811 return val; 812 813 case 'x': 814 lex_getc (mbc); 815 if (mb_iseof (mbc) || mb_len (mbc) != 1 816 || !c_isxdigit (mb_ptr (mbc) [0])) 817 break; 818 819 val = 0; 820 for (;;) 821 { 822 char c = mb_ptr (mbc) [0]; 823 val *= 16; 824 if (c_isdigit (c)) 825 /* Warning: not portable, can't depend on '0'..'9' ordering */ 826 val += c - '0'; 827 else if (c_isupper (c)) 828 /* Warning: not portable, can't depend on 'A'..'F' ordering */ 829 val += c - 'A' + 10; 830 else 831 /* Warning: not portable, can't depend on 'a'..'f' ordering */ 832 val += c - 'a' + 10; 833 834 lex_getc (mbc); 835 if (mb_len (mbc) == 1) 836 switch (mb_ptr (mbc) [0]) 837 { 838 case '0': case '1': case '2': case '3': case '4': 839 case '5': case '6': case '7': case '8': case '9': 840 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 841 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 842 continue; 843 844 default: 845 break; 846 } 847 lex_ungetc (mbc); 848 break; 849 } 850 return val; 851 852 /* FIXME: \u and \U are not handled. */ 853 } 854 lex_ungetc (mbc); 855 po_gram_error (_("invalid control sequence")); 856 return ' '; 857} 858 859 860/* Return the next token in the PO file. The return codes are defined 861 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */ 862int 863po_gram_lex () 864{ 865 static char *buf; 866 static size_t bufmax; 867 mbchar_t mbc; 868 size_t bufpos; 869 870 for (;;) 871 { 872 lex_getc (mbc); 873 874 if (mb_iseof (mbc)) 875 /* Yacc want this for end of file. 
*/ 876 return 0; 877 878 if (mb_len (mbc) == 1) 879 switch (mb_ptr (mbc) [0]) 880 { 881 case '\n': 882 po_lex_obsolete = false; 883 po_lex_previous = false; 884 /* Ignore whitespace, not relevant for the grammar. */ 885 break; 886 887 case ' ': 888 case '\t': 889 case '\r': 890 case '\f': 891 case '\v': 892 /* Ignore whitespace, not relevant for the grammar. */ 893 break; 894 895 case '#': 896 lex_getc (mbc); 897 if (mb_iseq (mbc, '~')) 898 /* A pseudo-comment beginning with #~ is found. This is 899 not a comment. It is the format for obsolete entries. 900 We simply discard the "#~" prefix. The following 901 characters are expected to be well formed. */ 902 { 903 po_lex_obsolete = true; 904 /* A pseudo-comment beginning with #~| denotes a previous 905 untranslated string in an obsolete entry. This does not 906 make much sense semantically, and is implemented here 907 for completeness only. */ 908 lex_getc (mbc); 909 if (mb_iseq (mbc, '|')) 910 po_lex_previous = true; 911 else 912 lex_ungetc (mbc); 913 break; 914 } 915 if (mb_iseq (mbc, '|')) 916 /* A pseudo-comment beginning with #| is found. This is 917 the previous untranslated string. We discard the "#|" 918 prefix, but change the keywords and string returns 919 accordingly. */ 920 { 921 po_lex_previous = true; 922 break; 923 } 924 925 /* Accumulate comments into a buffer. If we have been asked 926 to pass comments, generate a COMMENT token, otherwise 927 discard it. 
*/ 928 signal_eilseq = false; 929 if (pass_comments) 930 { 931 bufpos = 0; 932 for (;;) 933 { 934 while (bufpos + mb_len (mbc) >= bufmax) 935 { 936 bufmax += 100; 937 buf = xrealloc (buf, bufmax); 938 } 939 if (mb_iseof (mbc) || mb_iseq (mbc, '\n')) 940 break; 941 942 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); 943 bufpos += mb_len (mbc); 944 945 lex_getc (mbc); 946 } 947 buf[bufpos] = '\0'; 948 949 po_gram_lval.string.string = buf; 950 po_gram_lval.string.pos = gram_pos; 951 po_gram_lval.string.obsolete = po_lex_obsolete; 952 po_lex_obsolete = false; 953 signal_eilseq = true; 954 return COMMENT; 955 } 956 else 957 { 958 /* We do this in separate loop because collecting large 959 comments while they get not passed to the upper layers 960 is not very efficient. */ 961 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n')) 962 lex_getc (mbc); 963 po_lex_obsolete = false; 964 signal_eilseq = true; 965 } 966 break; 967 968 case '"': 969 /* Accumulate a string. */ 970 bufpos = 0; 971 for (;;) 972 { 973 lex_getc (mbc); 974 while (bufpos + mb_len (mbc) >= bufmax) 975 { 976 bufmax += 100; 977 buf = xrealloc (buf, bufmax); 978 } 979 if (mb_iseof (mbc)) 980 { 981 po_gram_error_at_line (&gram_pos, 982 _("end-of-file within string")); 983 break; 984 } 985 if (mb_iseq (mbc, '\n')) 986 { 987 po_gram_error_at_line (&gram_pos, 988 _("end-of-line within string")); 989 break; 990 } 991 if (mb_iseq (mbc, '"')) 992 break; 993 if (mb_iseq (mbc, '\\')) 994 { 995 buf[bufpos++] = control_sequence (); 996 continue; 997 } 998 999 /* Add mbc to the accumulator. */ 1000 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); 1001 bufpos += mb_len (mbc); 1002 } 1003 buf[bufpos] = '\0'; 1004 1005 /* Strings cannot contain the msgctxt separator, because it cannot 1006 be faithfully represented in the msgid of a .mo file. 
*/ 1007 if (strchr (buf, MSGCTXT_SEPARATOR) != NULL) 1008 po_gram_error_at_line (&gram_pos, 1009 _("context separator <EOT> within string")); 1010 1011 /* FIXME: Treatment of embedded \000 chars is incorrect. */ 1012 po_gram_lval.string.string = xstrdup (buf); 1013 po_gram_lval.string.pos = gram_pos; 1014 po_gram_lval.string.obsolete = po_lex_obsolete; 1015 return (po_lex_previous ? PREV_STRING : STRING); 1016 1017 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1018 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1019 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1020 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1021 case 'y': case 'z': 1022 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1023 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1024 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1025 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1026 case 'Y': case 'Z': 1027 case '_': case '$': 1028 bufpos = 0; 1029 for (;;) 1030 { 1031 char c = mb_ptr (mbc) [0]; 1032 if (bufpos + 1 >= bufmax) 1033 { 1034 bufmax += 100; 1035 buf = xrealloc (buf, bufmax); 1036 } 1037 buf[bufpos++] = c; 1038 lex_getc (mbc); 1039 if (mb_len (mbc) == 1) 1040 switch (mb_ptr (mbc) [0]) 1041 { 1042 default: 1043 break; 1044 case 'a': case 'b': case 'c': case 'd': case 'e': 1045 case 'f': case 'g': case 'h': case 'i': case 'j': 1046 case 'k': case 'l': case 'm': case 'n': case 'o': 1047 case 'p': case 'q': case 'r': case 's': case 't': 1048 case 'u': case 'v': case 'w': case 'x': case 'y': 1049 case 'z': 1050 case 'A': case 'B': case 'C': case 'D': case 'E': 1051 case 'F': case 'G': case 'H': case 'I': case 'J': 1052 case 'K': case 'L': case 'M': case 'N': case 'O': 1053 case 'P': case 'Q': case 'R': case 'S': case 'T': 1054 case 'U': case 'V': case 'W': case 'X': case 'Y': 1055 case 'Z': 1056 case '_': case '$': 1057 case '0': case '1': case '2': case '3': case '4': 1058 case '5': case 
'6': case '7': case '8': case '9': 1059 continue; 1060 } 1061 break; 1062 } 1063 lex_ungetc (mbc); 1064 1065 buf[bufpos] = '\0'; 1066 1067 { 1068 int k = keyword_p (buf); 1069 if (k == NAME) 1070 { 1071 po_gram_lval.string.string = xstrdup (buf); 1072 po_gram_lval.string.pos = gram_pos; 1073 po_gram_lval.string.obsolete = po_lex_obsolete; 1074 } 1075 else 1076 { 1077 po_gram_lval.pos.pos = gram_pos; 1078 po_gram_lval.pos.obsolete = po_lex_obsolete; 1079 } 1080 return k; 1081 } 1082 1083 case '0': case '1': case '2': case '3': case '4': 1084 case '5': case '6': case '7': case '8': case '9': 1085 bufpos = 0; 1086 for (;;) 1087 { 1088 char c = mb_ptr (mbc) [0]; 1089 if (bufpos + 1 >= bufmax) 1090 { 1091 bufmax += 100; 1092 buf = xrealloc (buf, bufmax + 1); 1093 } 1094 buf[bufpos++] = c; 1095 lex_getc (mbc); 1096 if (mb_len (mbc) == 1) 1097 switch (mb_ptr (mbc) [0]) 1098 { 1099 default: 1100 break; 1101 1102 case '0': case '1': case '2': case '3': case '4': 1103 case '5': case '6': case '7': case '8': case '9': 1104 continue; 1105 } 1106 break; 1107 } 1108 lex_ungetc (mbc); 1109 1110 buf[bufpos] = '\0'; 1111 1112 po_gram_lval.number.number = atol (buf); 1113 po_gram_lval.number.pos = gram_pos; 1114 po_gram_lval.number.obsolete = po_lex_obsolete; 1115 return NUMBER; 1116 1117 case '[': 1118 po_gram_lval.pos.pos = gram_pos; 1119 po_gram_lval.pos.obsolete = po_lex_obsolete; 1120 return '['; 1121 1122 case ']': 1123 po_gram_lval.pos.pos = gram_pos; 1124 po_gram_lval.pos.obsolete = po_lex_obsolete; 1125 return ']'; 1126 1127 default: 1128 /* This will cause a syntax error. */ 1129 return JUNK; 1130 } 1131 else 1132 /* This will cause a syntax error. */ 1133 return JUNK; 1134 } 1135} 1136 1137 1138/* po_gram_lex() can return comments as COMMENT. Switch this on or off. */ 1139void 1140po_lex_pass_comments (bool flag) 1141{ 1142 pass_comments = flag; 1143} 1144 1145 1146/* po_gram_lex() can return obsolete entries as if they were normal entries. 1147 Switch this on or off. 
*/
void
po_lex_pass_obsolete_entries (bool flag)
{
  /* Only the file-level switch is recorded here.  */
  pass_obsolete_entries = flag;
}