1/* GNU gettext - internationalization aids
2   Copyright (C) 1995-1999, 2000-2006 Free Software Foundation, Inc.
3
4   This file was written by Peter Miller <millerp@canb.auug.org.au>.
5   Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6
7   This program is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 2, or (at your option)
10   any later version.
11
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with this program; if not, write to the Free Software Foundation,
19   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20
21
22#ifdef HAVE_CONFIG_H
23# include "config.h"
24#endif
25
26/* Specification.  */
27#include "po-lex.h"
28
29#include <errno.h>
30#include <limits.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <stdarg.h>
35
36#if HAVE_ICONV
37# include <iconv.h>
38#endif
39
40#include "c-ctype.h"
41#include "linebreak.h"
42#include "vasprintf.h"
43#include "gettext.h"
44#include "po-charset.h"
45#include "xalloc.h"
46#include "exit.h"
47#include "error.h"
48#include "error-progname.h"
49#include "xvasprintf.h"
50#include "po-error.h"
51#include "po-xerror.h"
52#include "pos.h"
53#include "message.h"
54#include "str-list.h"
55#include "po-gram-gen2.h"
56
57#define _(str) gettext(str)
58
59#if HAVE_ICONV
60# include "utf8-ucs4.h"
61#endif
62
63#if HAVE_DECL_GETC_UNLOCKED
64# undef getc
65# define getc getc_unlocked
66#endif
67
68
/* Current position within the PO file.
   gram_pos carries the file name and the line number that the lexer has
   reached; gram_pos_column is the zero-based screen column on that line.
   Both are updated by lex_getc and lex_ungetc and are read by the error
   reporting functions below (which print the column as
   gram_pos_column + 1).  */
lex_pos_ty gram_pos;
int gram_pos_column;
72
73
74/* Error handling during the parsing of a PO file.
75   These functions can access gram_pos and gram_pos_column.  */
76
77/* VARARGS1 */
78void
79po_gram_error (const char *fmt, ...)
80{
81  va_list ap;
82  char *buffer;
83
84  va_start (ap, fmt);
85  if (vasprintf (&buffer, fmt, ap) < 0)
86    error (EXIT_FAILURE, 0, _("memory exhausted"));
87  va_end (ap);
88  po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
89	     gram_pos_column + 1, false, buffer);
90  free (buffer);
91
92  if (error_message_count >= gram_max_allowed_errors)
93    po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
94}
95
96/* VARARGS2 */
97void
98po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
99{
100  va_list ap;
101  char *buffer;
102
103  va_start (ap, fmt);
104  if (vasprintf (&buffer, fmt, ap) < 0)
105    error (EXIT_FAILURE, 0, _("memory exhausted"));
106  va_end (ap);
107  po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
108	     (size_t)(-1), false, buffer);
109  free (buffer);
110
111  if (error_message_count >= gram_max_allowed_errors)
112    po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
113}
114
115
116/* The lowest level of PO file parsing converts bytes to multibyte characters.
117   This is needed
118   1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
119      translation phase maps bytes to characters.
120   2. to keep track of the current column, for the sake of precise error
121      location. Emacs compile.el interprets the column in error messages
122      by default as a screen column number, not as character number.
123   3. to avoid skipping backslash-newline in the midst of a multibyte
124      character. If XY is a multibyte character,  X \ newline Y  is invalid.
125 */
126
/* Multibyte character data type.
   One value of this type holds the raw bytes of a single character as
   they appeared in the input and, when iconv is available, its Unicode
   code point.  */
/* Note this depends on po_lex_charset and po_lex_iconv, which get set
   while the file is being parsed.  */

/* Capacity of the byte buffer of a multibyte character; also the size of
   the read-ahead buffer of struct mbfile.  */
#define MBCHAR_BUF_SIZE 24

struct mbchar
{
  size_t bytes;		/* number of bytes of current character; > 0 for
			   a real character, 0 is reserved for EOF */
#if HAVE_ICONV
  bool uc_valid;	/* true if uc is a valid Unicode character */
  unsigned int uc;	/* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes; only the first 'bytes'
				elements are meaningful */
};

/* We want to pass multibyte characters by reference automatically,
   therefore we use an array type.  A parameter of type mbchar_t is thus
   really a pointer to struct mbchar.  */
typedef struct mbchar mbchar_t[1];
146
/* Copy N bytes from SRC to DST, one byte at a time in ascending order.
   Meant for the tiny copies the lexer does (typically N <= 1), where the
   overhead of a memcpy call is not worthwhile.  Nothing is touched when
   N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;
  size_t i;

  for (i = 0; i < n; i++)
    out[i] = in[i];
}
161
162/* EOF (not a real character) is represented with bytes = 0 and
163   uc_valid = false.  */
164static inline bool
165mb_iseof (const mbchar_t mbc)
166{
167  return (mbc->bytes == 0);
168}
169
170/* Access the current character.  */
171static inline const char *
172mb_ptr (const mbchar_t mbc)
173{
174  return mbc->buf;
175}
176static inline size_t
177mb_len (const mbchar_t mbc)
178{
179  return mbc->bytes;
180}
181
182/* Comparison of characters.  */
183
184static inline bool
185mb_iseq (const mbchar_t mbc, char sc)
186{
187  /* Note: It is wrong to compare only mbc->uc, because when the encoding is
188     SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
189     want to treat it as an escape character, although it looks like a Yen
190     sign.  */
191#if HAVE_ICONV && 0
192  if (mbc->uc_valid)
193    return (mbc->uc == sc); /* wrong! */
194  else
195#endif
196    return (mbc->bytes == 1 && mbc->buf[0] == sc);
197}
198
199static inline bool
200mb_isnul (const mbchar_t mbc)
201{
202#if HAVE_ICONV
203  if (mbc->uc_valid)
204    return (mbc->uc == 0);
205  else
206#endif
207    return (mbc->bytes == 1 && mbc->buf[0] == 0);
208}
209
210static inline int
211mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
212{
213#if HAVE_ICONV
214  if (mbc1->uc_valid && mbc2->uc_valid)
215    return (int) mbc1->uc - (int) mbc2->uc;
216  else
217#endif
218    return (mbc1->bytes == mbc2->bytes
219	    ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
220	    : mbc1->bytes < mbc2->bytes
221	      ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
222	      : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
223}
224
225static inline bool
226mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
227{
228#if HAVE_ICONV
229  if (mbc1->uc_valid && mbc2->uc_valid)
230    return mbc1->uc == mbc2->uc;
231  else
232#endif
233    return (mbc1->bytes == mbc2->bytes
234	    && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
235}
236
237/* <ctype.h>, <wctype.h> classification.  */
238
239static inline bool
240mb_isascii (const mbchar_t mbc)
241{
242#if HAVE_ICONV
243  if (mbc->uc_valid)
244    return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
245  else
246#endif
247    return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0;
248}
249
250/* Extra <wchar.h> function.  */
251
252/* Unprintable characters appear as a small box of width 1.  */
253#define MB_UNPRINTABLE_WIDTH 1
254
255static int
256mb_width (const mbchar_t mbc)
257{
258#if HAVE_ICONV
259  if (mbc->uc_valid)
260    {
261      unsigned int uc = mbc->uc;
262      const char *encoding =
263	(po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
264      int w = uc_width (uc, encoding);
265      /* For unprintable characters, arbitrarily return 0 for control
266	 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
267      if (w >= 0)
268	return w;
269      if (uc >= 0x0000 && uc <= 0x001F)
270	{
271	  if (uc == 0x0009)
272	    return 8 - (gram_pos_column & 7);
273	  return 0;
274	}
275      if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
276	return 0;
277      return MB_UNPRINTABLE_WIDTH;
278    }
279  else
280#endif
281    {
282      if (mbc->bytes == 1)
283	{
284	  if (
285#if CHAR_MIN < 0x00 /* to avoid gcc warning */
286	      mbc->buf[0] >= 0x00 &&
287#endif
288	      mbc->buf[0] <= 0x1F)
289	    {
290	      if (mbc->buf[0] == 0x09)
291		return 8 - (gram_pos_column & 7);
292	      return 0;
293	    }
294	  if (mbc->buf[0] == 0x7F)
295	    return 0;
296	}
297      return MB_UNPRINTABLE_WIDTH;
298    }
299}
300
301/* Output.  */
302static inline void
303mb_putc (const mbchar_t mbc, FILE *stream)
304{
305  fwrite (mbc->buf, 1, mbc->bytes, stream);
306}
307
308/* Assignment.  */
309static inline void
310mb_setascii (mbchar_t mbc, char sc)
311{
312  mbc->bytes = 1;
313#if HAVE_ICONV
314  mbc->uc_valid = 1;
315  mbc->uc = sc;
316#endif
317  mbc->buf[0] = sc;
318}
319
320/* Copying a character.  */
321static inline void
322mb_copy (mbchar_t new, const mbchar_t old)
323{
324  memcpy_small (&new->buf[0], &old->buf[0], old->bytes);
325  new->bytes = old->bytes;
326#if HAVE_ICONV
327  if ((new->uc_valid = old->uc_valid))
328    new->uc = old->uc;
329#endif
330}
331
332
333/* Multibyte character input.  */
334
/* Number of characters that can be pushed back.
   We need 1 for lex_getc, plus 1 for lex_ungetc.  */
#define NPUSHBACK 2

/* Data type of a multibyte character input stream.  */
struct mbfile
{
  FILE *fp;			/* the underlying byte stream */
  bool eof_seen;		/* true once getc on fp has returned EOF */
  int have_pushback;		/* number of valid entries in pushback[] */
  unsigned int bufcount;	/* number of bytes read from fp that are
				   waiting in buf[] */
  char buf[MBCHAR_BUF_SIZE];	/* bytes read ahead but not yet returned */
  struct mbchar pushback[NPUSHBACK]; /* pushed back characters; the one at
					index have_pushback - 1 is returned
					first */
};

/* We want to pass multibyte streams by reference automatically,
   therefore we use an array type.  */
typedef struct mbfile mbfile_t[1];

/* Whether invalid multibyte sequences in the input shall be signalled
   or silently tolerated.  It is switched off while a comment is being
   read and is on otherwise between lex_start and lex_end.  */
static bool signal_eilseq;
357
358static inline void
359mbfile_init (mbfile_t mbf, FILE *stream)
360{
361  mbf->fp = stream;
362  mbf->eof_seen = false;
363  mbf->have_pushback = 0;
364  mbf->bufcount = 0;
365}
366
367/* Read the next multibyte character from mbf and put it into mbc.
368   If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
369static void
370mbfile_getc (mbchar_t mbc, mbfile_t mbf)
371{
372  size_t bytes;
373
374  /* If EOF has already been seen, don't use getc.  This matters if
375     mbf->fp is connected to an interactive tty.  */
376  if (mbf->eof_seen)
377    goto eof;
378
379  /* Return character pushed back, if there is one.  */
380  if (mbf->have_pushback > 0)
381    {
382      mbf->have_pushback--;
383      mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
384      return;
385    }
386
387  /* Before using iconv, we need at least one byte.  */
388  if (mbf->bufcount == 0)
389    {
390      int c = getc (mbf->fp);
391      if (c == EOF)
392	{
393	  mbf->eof_seen = true;
394	  goto eof;
395	}
396      mbf->buf[0] = (unsigned char) c;
397      mbf->bufcount++;
398    }
399
400#if HAVE_ICONV
401  if (po_lex_iconv != (iconv_t)(-1))
402    {
403      /* Use iconv on an increasing number of bytes.  Read only as many
404	 bytes from mbf->fp as needed.  This is needed to give reasonable
405	 interactive behaviour when mbf->fp is connected to an interactive
406	 tty.  */
407      for (;;)
408	{
409	  unsigned char scratchbuf[64];
410	  const char *inptr = &mbf->buf[0];
411	  size_t insize = mbf->bufcount;
412	  char *outptr = (char *) &scratchbuf[0];
413	  size_t outsize = sizeof (scratchbuf);
414
415	  size_t res = iconv (po_lex_iconv,
416			      (ICONV_CONST char **) &inptr, &insize,
417			      &outptr, &outsize);
418	  /* We expect that a character has been produced if and only if
419	     some input bytes have been consumed.  */
420	  if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
421	    abort ();
422	  if (outsize == sizeof (scratchbuf))
423	    {
424	      /* No character has been produced.  Must be an error.  */
425	      if (res != (size_t)(-1))
426		abort ();
427
428	      if (errno == EILSEQ)
429		{
430		  /* An invalid multibyte sequence was encountered.  */
431		  /* Return a single byte.  */
432		  if (signal_eilseq)
433		    po_gram_error (_("invalid multibyte sequence"));
434		  bytes = 1;
435		  mbc->uc_valid = false;
436		  break;
437		}
438	      else if (errno == EINVAL)
439		{
440		  /* An incomplete multibyte character.  */
441		  int c;
442
443		  if (mbf->bufcount == MBCHAR_BUF_SIZE)
444		    {
445		      /* An overlong incomplete multibyte sequence was
446			 encountered.  */
447		      /* Return a single byte.  */
448		      bytes = 1;
449		      mbc->uc_valid = false;
450		      break;
451		    }
452
453		  /* Read one more byte and retry iconv.  */
454		  c = getc (mbf->fp);
455		  if (c == EOF)
456		    {
457		      mbf->eof_seen = true;
458		      if (ferror (mbf->fp))
459			goto eof;
460		      if (signal_eilseq)
461			po_gram_error (_("\
462incomplete multibyte sequence at end of file"));
463		      bytes = mbf->bufcount;
464		      mbc->uc_valid = false;
465		      break;
466		    }
467		  mbf->buf[mbf->bufcount++] = (unsigned char) c;
468		  if (c == '\n')
469		    {
470		      if (signal_eilseq)
471			po_gram_error (_("\
472incomplete multibyte sequence at end of line"));
473		      bytes = mbf->bufcount - 1;
474		      mbc->uc_valid = false;
475		      break;
476		    }
477		}
478	      else
479		{
480		  const char *errno_description = strerror (errno);
481		  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
482			     xasprintf ("%s: %s",
483					_("iconv failure"),
484					errno_description));
485		}
486	    }
487	  else
488	    {
489	      size_t outbytes = sizeof (scratchbuf) - outsize;
490	      bytes = mbf->bufcount - insize;
491
492	      /* We expect that one character has been produced.  */
493	      if (bytes == 0)
494		abort ();
495	      if (outbytes == 0)
496		abort ();
497	      /* Convert it from UTF-8 to UCS-4.  */
498	      if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
499		{
500		  /* scratchbuf contains an out-of-range Unicode character
501		     (> 0x10ffff).  */
502		  if (signal_eilseq)
503		    po_gram_error (_("invalid multibyte sequence"));
504		  mbc->uc_valid = false;
505		  break;
506		}
507	      mbc->uc_valid = true;
508	      break;
509	    }
510	}
511    }
512  else
513#endif
514    {
515      if (po_lex_weird_cjk
516	  /* Special handling of encodings with CJK structure.  */
517	  && (unsigned char) mbf->buf[0] >= 0x80)
518	{
519	  if (mbf->bufcount == 1)
520	    {
521	      /* Read one more byte.  */
522	      int c = getc (mbf->fp);
523	      if (c == EOF)
524		{
525		  if (ferror (mbf->fp))
526		    {
527		      mbf->eof_seen = true;
528		      goto eof;
529		    }
530		}
531	      else
532		{
533		  mbf->buf[1] = (unsigned char) c;
534		  mbf->bufcount++;
535		}
536	    }
537	  if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
538	    /* Return a double byte.  */
539	    bytes = 2;
540	  else
541	    /* Return a single byte.  */
542	    bytes = 1;
543	}
544      else
545	{
546	  /* Return a single byte.  */
547	  bytes = 1;
548	}
549#if HAVE_ICONV
550      mbc->uc_valid = false;
551#endif
552    }
553
554  /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
555  memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
556  mbc->bytes = bytes;
557
558  mbf->bufcount -= bytes;
559  if (mbf->bufcount > 0)
560    {
561      /* It's not worth calling memmove() for so few bytes.  */
562      unsigned int count = mbf->bufcount;
563      char *p = &mbf->buf[0];
564
565      do
566	{
567	  *p = *(p + bytes);
568	  p++;
569	}
570      while (--count > 0);
571    }
572  return;
573
574eof:
575  /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
576  mbc->bytes = 0;
577#if HAVE_ICONV
578  mbc->uc_valid = false;
579#endif
580  return;
581}
582
583static void
584mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
585{
586  if (mbf->have_pushback >= NPUSHBACK)
587    abort ();
588  mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
589  mbf->have_pushback++;
590}
591
592
/* Lexer variables.  */

/* The multibyte input stream that is currently being lexed.  */
static mbfile_t mbf;
/* Number of reported errors at which po_gram_error and
   po_gram_error_at_line give up.  */
unsigned int gram_max_allowed_errors = 20;
/* True while the current line started with "#~" (obsolete entry).  */
static bool po_lex_obsolete;
/* True while the current line started with "#|" or "#~|" (previous
   untranslated string); it changes the tokens returned for keywords and
   strings.  */
static bool po_lex_previous;
/* Whether po_gram_lex returns comments as COMMENT tokens instead of
   discarding them.  */
static bool pass_comments = false;
/* Set through po_lex_pass_obsolete_entries.  Not read by the lexer code
   visible here; presumably consulted elsewhere -- TODO confirm.  */
bool pass_obsolete_entries = false;
601
602
603/* Prepare lexical analysis.  */
604void
605lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
606{
607  /* Ignore the logical_filename, because PO file entries already have
608     their file names attached.  But use real_filename for error messages.  */
609  gram_pos.file_name = xstrdup (real_filename);
610
611  mbfile_init (mbf, fp);
612
613  gram_pos.line_number = 1;
614  gram_pos_column = 0;
615  signal_eilseq = true;
616  po_lex_obsolete = false;
617  po_lex_previous = false;
618  po_lex_charset_init ();
619}
620
621/* Terminate lexical analysis.  */
622void
623lex_end ()
624{
625  mbf->fp = NULL;
626  gram_pos.file_name = NULL;
627  gram_pos.line_number = 0;
628  gram_pos_column = 0;
629  signal_eilseq = false;
630  po_lex_obsolete = false;
631  po_lex_previous = false;
632  po_lex_charset_close ();
633}
634
635
636/* Read a single character, dealing with backslash-newline.
637   Also keep track of the current line number and column number.  */
638static void
639lex_getc (mbchar_t mbc)
640{
641  for (;;)
642    {
643      mbfile_getc (mbc, mbf);
644
645      if (mb_iseof (mbc))
646	{
647	  if (ferror (mbf->fp))
648	   bomb:
649	    {
650	      const char *errno_description = strerror (errno);
651	      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
652			 xasprintf ("%s: %s",
653				    xasprintf (_("error while reading \"%s\""),
654					       gram_pos.file_name),
655				    errno_description));
656	    }
657	  break;
658	}
659
660      if (mb_iseq (mbc, '\n'))
661	{
662	  gram_pos.line_number++;
663	  gram_pos_column = 0;
664	  break;
665	}
666
667      gram_pos_column += mb_width (mbc);
668
669      if (mb_iseq (mbc, '\\'))
670	{
671	  mbchar_t mbc2;
672
673	  mbfile_getc (mbc2, mbf);
674
675	  if (mb_iseof (mbc2))
676	    {
677	      if (ferror (mbf->fp))
678		goto bomb;
679	      break;
680	    }
681
682	  if (!mb_iseq (mbc2, '\n'))
683	    {
684	      mbfile_ungetc (mbc2, mbf);
685	      break;
686	    }
687
688	  gram_pos.line_number++;
689	  gram_pos_column = 0;
690	}
691      else
692	break;
693    }
694}
695
696
697static void
698lex_ungetc (const mbchar_t mbc)
699{
700  if (!mb_iseof (mbc))
701    {
702      if (mb_iseq (mbc, '\n'))
703	/* Decrement the line number, but don't care about the column.  */
704	gram_pos.line_number--;
705      else
706	/* Decrement the column number.  Also works well enough for tabs.  */
707	gram_pos_column -= mb_width (mbc);
708
709      mbfile_ungetc (mbc, mbf);
710    }
711}
712
713
714static int
715keyword_p (const char *s)
716{
717  if (!po_lex_previous)
718    {
719      if (!strcmp (s, "domain"))
720	return DOMAIN;
721      if (!strcmp (s, "msgid"))
722	return MSGID;
723      if (!strcmp (s, "msgid_plural"))
724	return MSGID_PLURAL;
725      if (!strcmp (s, "msgstr"))
726	return MSGSTR;
727      if (!strcmp (s, "msgctxt"))
728	return MSGCTXT;
729    }
730  else
731    {
732      /* Inside a "#|" context, the keywords have a different meaning.  */
733      if (!strcmp (s, "msgid"))
734	return PREV_MSGID;
735      if (!strcmp (s, "msgid_plural"))
736	return PREV_MSGID_PLURAL;
737      if (!strcmp (s, "msgctxt"))
738	return PREV_MSGCTXT;
739    }
740  po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
741  return NAME;
742}
743
744
745static int
746control_sequence ()
747{
748  mbchar_t mbc;
749  int val;
750  int max;
751
752  lex_getc (mbc);
753  if (mb_len (mbc) == 1)
754    switch (mb_ptr (mbc) [0])
755      {
756      case 'n':
757	return '\n';
758
759      case 't':
760	return '\t';
761
762      case 'b':
763	return '\b';
764
765      case 'r':
766	return '\r';
767
768      case 'f':
769	return '\f';
770
771      case 'v':
772	return '\v';
773
774      case 'a':
775	return '\a';
776
777      case '\\':
778      case '"':
779	return mb_ptr (mbc) [0];
780
781      case '0': case '1': case '2': case '3':
782      case '4': case '5': case '6': case '7':
783	val = 0;
784	max = 0;
785	for (;;)
786	  {
787	    char c = mb_ptr (mbc) [0];
788	    /* Warning: not portable, can't depend on '0'..'7' ordering.  */
789	    val = val * 8 + (c - '0');
790	    if (++max == 3)
791	      break;
792	    lex_getc (mbc);
793	    if (mb_len (mbc) == 1)
794	      switch (mb_ptr (mbc) [0])
795		{
796		case '0': case '1': case '2': case '3':
797		case '4': case '5': case '6': case '7':
798		  continue;
799
800		default:
801		  break;
802		}
803	    lex_ungetc (mbc);
804	    break;
805	  }
806	return val;
807
808      case 'x':
809	lex_getc (mbc);
810	if (mb_iseof (mbc) || mb_len (mbc) != 1
811	    || !c_isxdigit (mb_ptr (mbc) [0]))
812	  break;
813
814	val = 0;
815	for (;;)
816	  {
817	    char c = mb_ptr (mbc) [0];
818	    val *= 16;
819	    if (c_isdigit (c))
820	      /* Warning: not portable, can't depend on '0'..'9' ordering */
821	      val += c - '0';
822	    else if (c_isupper (c))
823	      /* Warning: not portable, can't depend on 'A'..'F' ordering */
824	      val += c - 'A' + 10;
825	    else
826	      /* Warning: not portable, can't depend on 'a'..'f' ordering */
827	      val += c - 'a' + 10;
828
829	    lex_getc (mbc);
830	    if (mb_len (mbc) == 1)
831	      switch (mb_ptr (mbc) [0])
832		{
833		case '0': case '1': case '2': case '3': case '4':
834		case '5': case '6': case '7': case '8': case '9':
835		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
836		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
837		  continue;
838
839		default:
840		  break;
841		}
842	    lex_ungetc (mbc);
843	    break;
844	  }
845	return val;
846
847      /* FIXME: \u and \U are not handled.  */
848      }
849  lex_ungetc (mbc);
850  po_gram_error (_("invalid control sequence"));
851  return ' ';
852}
853
854
855/* Return the next token in the PO file.  The return codes are defined
856   in "po-gram-gen2.h".  Associated data is put in 'po_gram_lval'.  */
857int
858po_gram_lex ()
859{
860  static char *buf;
861  static size_t bufmax;
862  mbchar_t mbc;
863  size_t bufpos;
864
865  for (;;)
866    {
867      lex_getc (mbc);
868
869      if (mb_iseof (mbc))
870	/* Yacc want this for end of file.  */
871	return 0;
872
873      if (mb_len (mbc) == 1)
874	switch (mb_ptr (mbc) [0])
875	  {
876	  case '\n':
877	    po_lex_obsolete = false;
878	    po_lex_previous = false;
879	    /* Ignore whitespace, not relevant for the grammar.  */
880	    break;
881
882	  case ' ':
883	  case '\t':
884	  case '\r':
885	  case '\f':
886	  case '\v':
887	    /* Ignore whitespace, not relevant for the grammar.  */
888	    break;
889
890	  case '#':
891	    lex_getc (mbc);
892	    if (mb_iseq (mbc, '~'))
893	      /* A pseudo-comment beginning with #~ is found.  This is
894		 not a comment.  It is the format for obsolete entries.
895		 We simply discard the "#~" prefix.  The following
896		 characters are expected to be well formed.  */
897	      {
898		po_lex_obsolete = true;
899		/* A pseudo-comment beginning with #~| denotes a previous
900		   untranslated string in an obsolete entry.  This does not
901		   make much sense semantically, and is implemented here
902		   for completeness only.  */
903		lex_getc (mbc);
904		if (mb_iseq (mbc, '|'))
905		  po_lex_previous = true;
906		else
907		  lex_ungetc (mbc);
908		break;
909	      }
910	    if (mb_iseq (mbc, '|'))
911	      /* A pseudo-comment beginning with #| is found.  This is
912		 the previous untranslated string.  We discard the "#|"
913		 prefix, but change the keywords and string returns
914		 accordingly.  */
915	      {
916		po_lex_previous = true;
917		break;
918	      }
919
920	    /* Accumulate comments into a buffer.  If we have been asked
921	       to pass comments, generate a COMMENT token, otherwise
922	       discard it.  */
923	    signal_eilseq = false;
924	    if (pass_comments)
925	      {
926		bufpos = 0;
927		for (;;)
928		  {
929		    while (bufpos + mb_len (mbc) >= bufmax)
930		      {
931			bufmax += 100;
932			buf = xrealloc (buf, bufmax);
933		      }
934		    if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
935		      break;
936
937		    memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
938		    bufpos += mb_len (mbc);
939
940		    lex_getc (mbc);
941		  }
942		buf[bufpos] = '\0';
943
944		po_gram_lval.string.string = buf;
945		po_gram_lval.string.pos = gram_pos;
946		po_gram_lval.string.obsolete = po_lex_obsolete;
947		po_lex_obsolete = false;
948		signal_eilseq = true;
949		return COMMENT;
950	      }
951	    else
952	      {
953		/* We do this in separate loop because collecting large
954		   comments while they get not passed to the upper layers
955		   is not very efficient.  */
956		while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
957		  lex_getc (mbc);
958		po_lex_obsolete = false;
959		signal_eilseq = true;
960	      }
961	    break;
962
963	  case '"':
964	    /* Accumulate a string.  */
965	    bufpos = 0;
966	    for (;;)
967	      {
968		lex_getc (mbc);
969		while (bufpos + mb_len (mbc) >= bufmax)
970		  {
971		    bufmax += 100;
972		    buf = xrealloc (buf, bufmax);
973		  }
974		if (mb_iseof (mbc))
975		  {
976		    po_gram_error_at_line (&gram_pos,
977					   _("end-of-file within string"));
978		    break;
979		  }
980		if (mb_iseq (mbc, '\n'))
981		  {
982		    po_gram_error_at_line (&gram_pos,
983					   _("end-of-line within string"));
984		    break;
985		  }
986		if (mb_iseq (mbc, '"'))
987		  break;
988		if (mb_iseq (mbc, '\\'))
989		  {
990		    buf[bufpos++] = control_sequence ();
991		    continue;
992		  }
993
994		/* Add mbc to the accumulator.  */
995		memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
996		bufpos += mb_len (mbc);
997	      }
998	    buf[bufpos] = '\0';
999
1000	    /* Strings cannot contain the msgctxt separator, because it cannot
1001	       be faithfully represented in the msgid of a .mo file.  */
1002	    if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1003	      po_gram_error_at_line (&gram_pos,
1004				     _("context separator <EOT> within string"));
1005
1006	    /* FIXME: Treatment of embedded \000 chars is incorrect.  */
1007	    po_gram_lval.string.string = xstrdup (buf);
1008	    po_gram_lval.string.pos = gram_pos;
1009	    po_gram_lval.string.obsolete = po_lex_obsolete;
1010	    return (po_lex_previous ? PREV_STRING : STRING);
1011
1012	  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1013	  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1014	  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1015	  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1016	  case 'y': case 'z':
1017	  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1018	  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1019	  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1020	  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1021	  case 'Y': case 'Z':
1022	  case '_': case '$':
1023	    bufpos = 0;
1024	    for (;;)
1025	      {
1026		char c = mb_ptr (mbc) [0];
1027		if (bufpos + 1 >= bufmax)
1028		  {
1029		    bufmax += 100;
1030		    buf = xrealloc (buf, bufmax);
1031		  }
1032		buf[bufpos++] = c;
1033		lex_getc (mbc);
1034		if (mb_len (mbc) == 1)
1035		  switch (mb_ptr (mbc) [0])
1036		    {
1037		    default:
1038		      break;
1039		    case 'a': case 'b': case 'c': case 'd': case 'e':
1040		    case 'f': case 'g': case 'h': case 'i': case 'j':
1041		    case 'k': case 'l': case 'm': case 'n': case 'o':
1042		    case 'p': case 'q': case 'r': case 's': case 't':
1043		    case 'u': case 'v': case 'w': case 'x': case 'y':
1044		    case 'z':
1045		    case 'A': case 'B': case 'C': case 'D': case 'E':
1046		    case 'F': case 'G': case 'H': case 'I': case 'J':
1047		    case 'K': case 'L': case 'M': case 'N': case 'O':
1048		    case 'P': case 'Q': case 'R': case 'S': case 'T':
1049		    case 'U': case 'V': case 'W': case 'X': case 'Y':
1050		    case 'Z':
1051		    case '_': case '$':
1052		    case '0': case '1': case '2': case '3': case '4':
1053		    case '5': case '6': case '7': case '8': case '9':
1054		      continue;
1055		    }
1056		break;
1057	      }
1058	    lex_ungetc (mbc);
1059
1060	    buf[bufpos] = '\0';
1061
1062	    {
1063	      int k = keyword_p (buf);
1064	      if (k == NAME)
1065		{
1066		  po_gram_lval.string.string = xstrdup (buf);
1067		  po_gram_lval.string.pos = gram_pos;
1068		  po_gram_lval.string.obsolete = po_lex_obsolete;
1069		}
1070	      else
1071		{
1072		  po_gram_lval.pos.pos = gram_pos;
1073		  po_gram_lval.pos.obsolete = po_lex_obsolete;
1074		}
1075	      return k;
1076	    }
1077
1078	  case '0': case '1': case '2': case '3': case '4':
1079	  case '5': case '6': case '7': case '8': case '9':
1080	    bufpos = 0;
1081	    for (;;)
1082	      {
1083		char c = mb_ptr (mbc) [0];
1084		if (bufpos + 1 >= bufmax)
1085		  {
1086		    bufmax += 100;
1087		    buf = xrealloc (buf, bufmax + 1);
1088		  }
1089		buf[bufpos++] = c;
1090		lex_getc (mbc);
1091		if (mb_len (mbc) == 1)
1092		  switch (mb_ptr (mbc) [0])
1093		    {
1094		    default:
1095		      break;
1096
1097		    case '0': case '1': case '2': case '3': case '4':
1098		    case '5': case '6': case '7': case '8': case '9':
1099		      continue;
1100		    }
1101		break;
1102	      }
1103	    lex_ungetc (mbc);
1104
1105	    buf[bufpos] = '\0';
1106
1107	    po_gram_lval.number.number = atol (buf);
1108	    po_gram_lval.number.pos = gram_pos;
1109	    po_gram_lval.number.obsolete = po_lex_obsolete;
1110	    return NUMBER;
1111
1112	  case '[':
1113	    po_gram_lval.pos.pos = gram_pos;
1114	    po_gram_lval.pos.obsolete = po_lex_obsolete;
1115	    return '[';
1116
1117	  case ']':
1118	    po_gram_lval.pos.pos = gram_pos;
1119	    po_gram_lval.pos.obsolete = po_lex_obsolete;
1120	    return ']';
1121
1122	  default:
1123	    /* This will cause a syntax error.  */
1124	    return JUNK;
1125	  }
1126      else
1127	/* This will cause a syntax error.  */
1128	return JUNK;
1129    }
1130}
1131
1132
1133/* po_gram_lex() can return comments as COMMENT.  Switch this on or off.  */
1134void
1135po_lex_pass_comments (bool flag)
1136{
1137  pass_comments = flag;
1138}
1139
1140
1141/* po_gram_lex() can return obsolete entries as if they were normal entries.
1142   Switch this on or off.  */
1143void
1144po_lex_pass_obsolete_entries (bool flag)
1145{
1146  pass_obsolete_entries = flag;
1147}
1148