1/* xgettext Emacs Lisp backend.
2   Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
24#include <errno.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#include "message.h"
31#include "xgettext.h"
32#include "x-elisp.h"
33#include "error.h"
34#include "xalloc.h"
35#include "exit.h"
36#include "hash.h"
37#include "c-ctype.h"
38#include "gettext.h"
39
40#define _(s) gettext(s)
41
42
43/* Summary of Emacs Lisp syntax:
44   - ';' starts a comment until end of line.
45   - '#@nn' starts a comment of nn bytes.
46   - Integers are constituted of an optional prefix (#b, #B for binary,
47     #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
48     an optional sign (+ or -), the digits, and an optional trailing dot.
49   - Characters are written as '?' followed by the character, possibly
50     with an escape sequence, for examples '?a', '?\n', '?\177'.
51   - Strings are delimited by double quotes. Backslash introduces an escape
52     sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
53     '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
54   - Symbols: can contain meta-characters if preceded by backslash.
55   - Uninterned symbols: written as #:SYMBOL.
56   - () delimit lists.
57   - [] delimit vectors.
58   The reader is implemented in emacs-21.1/src/lread.c.  */
59
60
61/* ====================== Keyword set customization.  ====================== */
62
63/* If true extract all strings.  */
64static bool extract_all = false;
65
66static hash_table keywords;
67static bool default_keywords = true;
68
69
70void
71x_elisp_extract_all ()
72{
73  extract_all = true;
74}
75
76
77void
78x_elisp_keyword (const char *name)
79{
80  if (name == NULL)
81    default_keywords = false;
82  else
83    {
84      const char *end;
85      struct callshape shape;
86      const char *colon;
87
88      if (keywords.table == NULL)
89	hash_init (&keywords, 100);
90
91      split_keywordspec (name, &end, &shape);
92
93      /* The characters between name and end should form a valid Lisp
94	 symbol.  */
95      colon = strchr (name, ':');
96      if (colon == NULL || colon >= end)
97	insert_keyword_callshape (&keywords, name, end - name, &shape);
98    }
99}
100
101/* Finish initializing the keywords hash table.
102   Called after argument processing, before each file is processed.  */
103static void
104init_keywords ()
105{
106  if (default_keywords)
107    {
108      /* When adding new keywords here, also update the documentation in
109	 xgettext.texi!  */
110      x_elisp_keyword ("_");
111      default_keywords = false;
112    }
113}
114
115void
116init_flag_table_elisp ()
117{
118  xgettext_record_flag ("_:1:pass-elisp-format");
119  xgettext_record_flag ("format:1:elisp-format");
120}
121
122
123/* ======================== Reading of characters.  ======================== */
124
125/* Real filename, used in error messages about the input file.  */
126static const char *real_file_name;
127
128/* Logical filename and line number, used to label the extracted messages.  */
129static char *logical_file_name;
130static int line_number;
131
132/* The input file stream.  */
133static FILE *fp;
134
135
136/* Fetch the next character from the input file.  */
137static int
138do_getc ()
139{
140  int c = getc (fp);
141
142  if (c == EOF)
143    {
144      if (ferror (fp))
145	error (EXIT_FAILURE, errno, _("\
146error while reading \"%s\""), real_file_name);
147    }
148  else if (c == '\n')
149   line_number++;
150
151  return c;
152}
153
154/* Put back the last fetched character, not EOF.  */
155static void
156do_ungetc (int c)
157{
158  if (c == '\n')
159    line_number--;
160  ungetc (c, fp);
161}
162
163
164/* ========================== Reading of tokens.  ========================== */
165
166
167/* A token consists of a sequence of characters.  */
168struct token
169{
170  int allocated;		/* number of allocated 'token_char's */
171  int charcount;		/* number of used 'token_char's */
172  char *chars;			/* the token's constituents */
173};
174
175/* Initialize a 'struct token'.  */
176static inline void
177init_token (struct token *tp)
178{
179  tp->allocated = 10;
180  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
181  tp->charcount = 0;
182}
183
184/* Free the memory pointed to by a 'struct token'.  */
185static inline void
186free_token (struct token *tp)
187{
188  free (tp->chars);
189}
190
191/* Ensure there is enough room in the token for one more character.  */
192static inline void
193grow_token (struct token *tp)
194{
195  if (tp->charcount == tp->allocated)
196    {
197      tp->allocated *= 2;
198      tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
199    }
200}
201
202/* Test whether a token has integer syntax.  */
203static inline bool
204is_integer (const char *p)
205{
206  /* NB: Yes, '+.' and '-.' both designate the integer 0.  */
207  const char *p_start = p;
208
209  if (*p == '+' || *p == '-')
210    p++;
211  if (*p == '\0')
212    return false;
213  while (*p >= '0' && *p <= '9')
214    p++;
215  if (p > p_start && *p == '.')
216    p++;
217  return (*p == '\0');
218}
219
220/* Test whether a token has float syntax.  */
221static inline bool
222is_float (const char *p)
223{
224  enum { LEAD_INT = 1, DOT_CHAR = 2, TRAIL_INT = 4, E_CHAR = 8, EXP_INT = 16 };
225  int state;
226
227  state = 0;
228  if (*p == '+' || *p == '-')
229    p++;
230  if (*p >= '0' && *p <= '9')
231    {
232      state |= LEAD_INT;
233      do
234	p++;
235      while (*p >= '0' && *p <= '9');
236    }
237  if (*p == '.')
238    {
239      state |= DOT_CHAR;
240      p++;
241    }
242  if (*p >= '0' && *p <= '9')
243    {
244      state |= TRAIL_INT;
245      do
246	p++;
247      while (*p >= '0' && *p <= '9');
248    }
249  if (*p == 'e' || *p == 'E')
250    {
251      state |= E_CHAR;
252      p++;
253      if (*p == '+' || *p == '-')
254	p++;
255      if (*p >= '0' && *p <= '9')
256	{
257	  state |= EXP_INT;
258	  do
259	    p++;
260	  while (*p >= '0' && *p <= '9');
261	}
262      else if (p[-1] == '+'
263	       && ((p[0] == 'I' && p[1] == 'N' && p[2] == 'F')
264		   || (p[0] == 'N' && p[1] == 'a' && p[2] == 'N')))
265	{
266	  state |= EXP_INT;
267	  p += 3;
268	}
269    }
270  return (*p == '\0')
271	 && (state == (LEAD_INT | DOT_CHAR | TRAIL_INT)
272	     || state == (DOT_CHAR | TRAIL_INT)
273	     || state == (LEAD_INT | E_CHAR | EXP_INT)
274	     || state == (LEAD_INT | DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT)
275	     || state == (DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT));
276}
277
278/* Read the next token.  'first' is the first character, which has already
279   been read.  Returns true for a symbol, false for a number.  */
280static bool
281read_token (struct token *tp, int first)
282{
283  int c;
284  bool quoted = false;
285
286  init_token (tp);
287
288  c = first;
289
290  for (;; c = do_getc ())
291    {
292      if (c == EOF)
293	break;
294      if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
295	break;
296      if (c == '\"' || c == '\'' || c == ';' || c == '(' || c == ')'
297	  || c == '[' || c == ']' || c == '#')
298	break;
299      if (c == '\\')
300	{
301	  quoted = true;
302	  c = do_getc ();
303	  if (c == EOF)
304	    /* Invalid, but be tolerant.  */
305	    break;
306	}
307      grow_token (tp);
308      tp->chars[tp->charcount++] = c;
309    }
310  if (c != EOF)
311    do_ungetc (c);
312
313  if (quoted)
314    return true; /* symbol */
315
316  /* Add a NUL byte at the end, for is_integer and is_float.  */
317  grow_token (tp);
318  tp->chars[tp->charcount] = '\0';
319
320  if (is_integer (tp->chars) || is_float (tp->chars))
321    return false; /* number */
322  else
323    return true; /* symbol */
324}
325
326
327/* ========================= Accumulating comments ========================= */
328
329
330static char *buffer;
331static size_t bufmax;
332static size_t buflen;
333
334static inline void
335comment_start ()
336{
337  buflen = 0;
338}
339
340static inline void
341comment_add (int c)
342{
343  if (buflen >= bufmax)
344    {
345      bufmax = 2 * bufmax + 10;
346      buffer = xrealloc (buffer, bufmax);
347    }
348  buffer[buflen++] = c;
349}
350
351static inline void
352comment_line_end (size_t chars_to_remove)
353{
354  buflen -= chars_to_remove;
355  while (buflen >= 1
356	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
357    --buflen;
358  if (chars_to_remove == 0 && buflen >= bufmax)
359    {
360      bufmax = 2 * bufmax + 10;
361      buffer = xrealloc (buffer, bufmax);
362    }
363  buffer[buflen] = '\0';
364  savable_comment_add (buffer);
365}
366
367
368/* These are for tracking whether comments count as immediately before
369   keyword.  */
370static int last_comment_line;
371static int last_non_comment_line;
372
373
374/* ========================= Accumulating messages ========================= */
375
376
377static message_list_ty *mlp;
378
379
380/* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
381
382
383/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
384   Other objects need not to be represented precisely.  */
385enum object_type
386{
387  t_symbol,	/* symbol */
388  t_string,	/* string */
389  t_other,	/* other kind of real object */
390  t_dot,	/* '.' pseudo object */
391  t_listclose,	/* ')' pseudo object */
392  t_vectorclose,/* ']' pseudo object */
393  t_eof		/* EOF marker */
394};
395
396struct object
397{
398  enum object_type type;
399  struct token *token;		/* for t_symbol and t_string */
400  int line_number_at_start;	/* for t_string */
401};
402
403/* Free the memory pointed to by a 'struct object'.  */
404static inline void
405free_object (struct object *op)
406{
407  if (op->type == t_symbol || op->type == t_string)
408    {
409      free_token (op->token);
410      free (op->token);
411    }
412}
413
414/* Convert a t_symbol/t_string token to a char*.  */
415static char *
416string_of_object (const struct object *op)
417{
418  char *str;
419  int n;
420
421  if (!(op->type == t_symbol || op->type == t_string))
422    abort ();
423  n = op->token->charcount;
424  str = (char *) xmalloc (n + 1);
425  memcpy (str, op->token->chars, n);
426  str[n] = '\0';
427  return str;
428}
429
430/* Context lookup table.  */
431static flag_context_list_table_ty *flag_context_list_table;
432
433/* Returns the character represented by an escape sequence.  */
434#define IGNORABLE_ESCAPE (EOF - 1)
435static int
436do_getc_escaped (int c, bool in_string)
437{
438  switch (c)
439    {
440    case 'a':
441      return '\a';
442    case 'b':
443      return '\b';
444    case 'd':
445      return 0x7F;
446    case 'e':
447      return 0x1B;
448    case 'f':
449      return '\f';
450    case 'n':
451      return '\n';
452    case 'r':
453      return '\r';
454    case 't':
455      return '\t';
456    case 'v':
457      return '\v';
458
459    case '\n':
460      return IGNORABLE_ESCAPE;
461
462    case ' ':
463      return (in_string ? IGNORABLE_ESCAPE : ' ');
464
465    case 'M': /* meta */
466      c = do_getc ();
467      if (c == EOF)
468	return EOF;
469      if (c != '-')
470	/* Invalid input.  But be tolerant.  */
471	return c;
472      c = do_getc ();
473      if (c == EOF)
474	return EOF;
475      if (c == '\\')
476	{
477	  c = do_getc ();
478	  if (c == EOF)
479	    return EOF;
480	  c = do_getc_escaped (c, false);
481	}
482      return c | 0x80;
483
484    case 'S': /* shift */
485      c = do_getc ();
486      if (c == EOF)
487	return EOF;
488      if (c != '-')
489	/* Invalid input.  But be tolerant.  */
490	return c;
491      c = do_getc ();
492      if (c == EOF)
493	return EOF;
494      if (c == '\\')
495	{
496	  c = do_getc ();
497	  if (c == EOF)
498	    return EOF;
499	  c = do_getc_escaped (c, false);
500	}
501      return (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
502
503    case 'H': /* hyper */
504    case 'A': /* alt */
505    case 's': /* super */
506      c = do_getc ();
507      if (c == EOF)
508	return EOF;
509      if (c != '-')
510	/* Invalid input.  But be tolerant.  */
511	return c;
512      c = do_getc ();
513      if (c == EOF)
514	return EOF;
515      if (c == '\\')
516	{
517	  c = do_getc ();
518	  if (c == EOF)
519	    return EOF;
520	  c = do_getc_escaped (c, false);
521	}
522      return c;
523
524    case 'C': /* ctrl */
525      c = do_getc ();
526      if (c == EOF)
527	return EOF;
528      if (c != '-')
529	/* Invalid input.  But be tolerant.  */
530	return c;
531      /*FALLTHROUGH*/
532    case '^':
533      c = do_getc ();
534      if (c == EOF)
535	return EOF;
536      if (c == '\\')
537	{
538	  c = do_getc ();
539	  if (c == EOF)
540	    return EOF;
541	  c = do_getc_escaped (c, false);
542	}
543      if (c == '?')
544	return 0x7F;
545      if ((c & 0x5F) >= 0x41 && (c & 0x5F) <= 0x5A)
546	return c & 0x9F;
547      if ((c & 0x7F) >= 0x40 && (c & 0x7F) <= 0x5F)
548	return c & 0x9F;
549#if 0 /* We cannot handle NUL bytes in strings.  */
550      if (c == ' ')
551	return 0x00;
552#endif
553      return c;
554
555    case '0': case '1': case '2': case '3': case '4':
556    case '5': case '6': case '7':
557      /* An octal escape, as in ANSI C.  */
558      {
559	int n = c - '0';
560
561	c = do_getc ();
562	if (c != EOF)
563	  {
564	    if (c >= '0' && c <= '7')
565	      {
566		n = (n << 3) + (c - '0');
567		c = do_getc ();
568		if (c != EOF)
569		  {
570		    if (c >= '0' && c <= '7')
571		      n = (n << 3) + (c - '0');
572		    else
573		      do_ungetc (c);
574		  }
575	      }
576	    else
577	      do_ungetc (c);
578	  }
579	return (unsigned char) n;
580      }
581
582    case 'x':
583      /* A hexadecimal escape, as in ANSI C.  */
584      {
585	int n = 0;
586
587	for (;;)
588	  {
589	    c = do_getc ();
590	    if (c == EOF)
591	      break;
592	    else if (c >= '0' && c <= '9')
593	      n = (n << 4) + (c - '0');
594	    else if (c >= 'A' && c <= 'F')
595	      n = (n << 4) + (c - 'A' + 10);
596	    else if (c >= 'a' && c <= 'f')
597	      n = (n << 4) + (c - 'a' + 10);
598	    else
599	      {
600		do_ungetc (c);
601		break;
602	      }
603	  }
604	return (unsigned char) n;
605      }
606
607    default:
608      /* Ignore Emacs multibyte character stuff.  All the strings we are
609	 interested in are ASCII strings.  */
610      return c;
611    }
612}
613
614/* Read the next object.
615   'first_in_list' and 'new_backquote_flag' are used for reading old
616   backquote syntax and new backquote syntax.  */
617static void
618read_object (struct object *op, bool first_in_list, bool new_backquote_flag,
619	     flag_context_ty outer_context)
620{
621  for (;;)
622    {
623      int c;
624
625      c = do_getc ();
626
627      switch (c)
628	{
629	case EOF:
630	  op->type = t_eof;
631	  return;
632
633	case '\n':
634	  /* Comments assumed to be grouped with a message must immediately
635	     precede it, with no non-whitespace token on a line between
636	     both.  */
637	  if (last_non_comment_line > last_comment_line)
638	    savable_comment_reset ();
639	  continue;
640
641	case '(':
642	  {
643	    int arg = 0;		/* Current argument number.  */
644	    flag_context_list_iterator_ty context_iter;
645	    const struct callshapes *shapes = NULL;
646	    struct arglist_parser *argparser = NULL;
647
648	    for (;; arg++)
649	      {
650		struct object inner;
651		flag_context_ty inner_context;
652
653		if (arg == 0)
654		  inner_context = null_context;
655		else
656		  inner_context =
657		    inherited_context (outer_context,
658				       flag_context_list_iterator_advance (
659					 &context_iter));
660
661		read_object (&inner, arg == 0, new_backquote_flag,
662			     inner_context);
663
664		/* Recognize end of list.  */
665		if (inner.type == t_listclose)
666		  {
667		    op->type = t_other;
668		    /* Don't bother converting "()" to "NIL".  */
669		    last_non_comment_line = line_number;
670		    if (argparser != NULL)
671		      arglist_parser_done (argparser, arg);
672		    return;
673		  }
674
675		/* Dots are not allowed in every position. ']' is not allowed.
676		   But be tolerant.  */
677
678		/* EOF inside list is illegal.  But be tolerant.  */
679		if (inner.type == t_eof)
680		  break;
681
682		if (arg == 0)
683		  {
684		    /* This is the function position.  */
685		    if (inner.type == t_symbol)
686		      {
687			char *symbol_name = string_of_object (&inner);
688			void *keyword_value;
689
690			if (hash_find_entry (&keywords,
691					     symbol_name, strlen (symbol_name),
692					     &keyword_value)
693			    == 0)
694			  shapes = (const struct callshapes *) keyword_value;
695
696			argparser = arglist_parser_alloc (mlp, shapes);
697
698			context_iter =
699			  flag_context_list_iterator (
700			    flag_context_list_table_lookup (
701			      flag_context_list_table,
702			      symbol_name, strlen (symbol_name)));
703
704			free (symbol_name);
705		      }
706		    else
707		      context_iter = null_context_list_iterator;
708		  }
709		else
710		  {
711		    /* These are the argument positions.  */
712		    if (argparser != NULL && inner.type == t_string)
713		      arglist_parser_remember (argparser, arg,
714					       string_of_object (&inner),
715					       inner_context,
716					       logical_file_name,
717					       inner.line_number_at_start,
718					       savable_comment);
719		  }
720
721		free_object (&inner);
722	      }
723
724	    if (argparser != NULL)
725	      arglist_parser_done (argparser, arg);
726	  }
727	  op->type = t_other;
728	  last_non_comment_line = line_number;
729	  return;
730
731	case ')':
732	  /* Tell the caller about the end of list.
733	     Unmatched closing parenthesis is illegal.  But be tolerant.  */
734	  op->type = t_listclose;
735	  last_non_comment_line = line_number;
736	  return;
737
738	case '[':
739	  {
740	    for (;;)
741	      {
742		struct object inner;
743
744		read_object (&inner, false, new_backquote_flag, null_context);
745
746		/* Recognize end of vector.  */
747		if (inner.type == t_vectorclose)
748		  {
749		    op->type = t_other;
750		    last_non_comment_line = line_number;
751		    return;
752		  }
753
754		/* Dots and ')' are not allowed.  But be tolerant.  */
755
756		/* EOF inside vector is illegal.  But be tolerant.  */
757		if (inner.type == t_eof)
758		  break;
759
760		free_object (&inner);
761	      }
762	  }
763	  op->type = t_other;
764	  last_non_comment_line = line_number;
765	  return;
766
767	case ']':
768	  /* Tell the caller about the end of vector.
769	     Unmatched closing bracket is illegal.  But be tolerant.  */
770	  op->type = t_vectorclose;
771	  last_non_comment_line = line_number;
772	  return;
773
774	case '\'':
775	  {
776	    struct object inner;
777
778	    read_object (&inner, false, new_backquote_flag, null_context);
779
780	    /* Dots and EOF are not allowed here.  But be tolerant.  */
781
782	    free_object (&inner);
783
784	    op->type = t_other;
785	    last_non_comment_line = line_number;
786	    return;
787	  }
788
789	case '`':
790	  if (first_in_list)
791	    goto default_label;
792	  {
793	    struct object inner;
794
795	    read_object (&inner, false, true, null_context);
796
797	    /* Dots and EOF are not allowed here.  But be tolerant.  */
798
799	    free_object (&inner);
800
801	    op->type = t_other;
802	    last_non_comment_line = line_number;
803	    return;
804	  }
805
806	case ',':
807	  if (!new_backquote_flag)
808	    goto default_label;
809	  {
810	    int c = do_getc ();
811	    /* The ,@ handling inside lists is wrong anyway, because
812	       ,@form expands to an unknown number of elements.  */
813	    if (c != EOF && c != '@' && c != '.')
814	      do_ungetc (c);
815	  }
816	  {
817	    struct object inner;
818
819	    read_object (&inner, false, false, null_context);
820
821	    /* Dots and EOF are not allowed here.  But be tolerant.  */
822
823	    free_object (&inner);
824
825	    op->type = t_other;
826	    last_non_comment_line = line_number;
827	    return;
828	  }
829
830	case ';':
831	  {
832	    bool all_semicolons = true;
833
834	    last_comment_line = line_number;
835	    comment_start ();
836	    for (;;)
837	      {
838		int c = do_getc ();
839		if (c == EOF || c == '\n')
840		  break;
841		if (c != ';')
842		  all_semicolons = false;
843		if (!all_semicolons)
844		  {
845		    /* We skip all leading white space, but not EOLs.  */
846		    if (!(buflen == 0 && (c == ' ' || c == '\t')))
847		      comment_add (c);
848		  }
849	      }
850	    comment_line_end (0);
851	    continue;
852	  }
853
854	case '"':
855	  {
856	    op->token = (struct token *) xmalloc (sizeof (struct token));
857	    init_token (op->token);
858	    op->line_number_at_start = line_number;
859	    for (;;)
860	      {
861		int c = do_getc ();
862		if (c == EOF)
863		  /* Invalid input.  Be tolerant, no error message.  */
864		  break;
865		if (c == '"')
866		  break;
867		if (c == '\\')
868		  {
869		    c = do_getc ();
870		    if (c == EOF)
871		      /* Invalid input.  Be tolerant, no error message.  */
872		      break;
873		    c = do_getc_escaped (c, true);
874		    if (c == EOF)
875		      /* Invalid input.  Be tolerant, no error message.  */
876		      break;
877		    if (c == IGNORABLE_ESCAPE)
878		      /* Ignore escaped newline and escaped space.  */
879		      ;
880		    else
881		      {
882			grow_token (op->token);
883			op->token->chars[op->token->charcount++] = c;
884		      }
885		  }
886		else
887		  {
888		    grow_token (op->token);
889		    op->token->chars[op->token->charcount++] = c;
890		  }
891	      }
892	    op->type = t_string;
893
894	    if (extract_all)
895	      {
896		lex_pos_ty pos;
897
898		pos.file_name = logical_file_name;
899		pos.line_number = op->line_number_at_start;
900		remember_a_message (mlp, NULL, string_of_object (op),
901				    null_context, &pos, savable_comment);
902	      }
903	    last_non_comment_line = line_number;
904	    return;
905	  }
906
907	case '?':
908	  c = do_getc ();
909	  if (c == EOF)
910	    /* Invalid input.  Be tolerant, no error message.  */
911	    ;
912	  else if (c == '\\')
913	    {
914	      c = do_getc ();
915	      if (c == EOF)
916		/* Invalid input.  Be tolerant, no error message.  */
917		;
918	      else
919		{
920		  c = do_getc_escaped (c, false);
921		  if (c == EOF)
922		    /* Invalid input.  Be tolerant, no error message.  */
923		    ;
924		}
925	    }
926	  /* Impossible to deal with Emacs multibyte character stuff here.  */
927	  op->type = t_other;
928	  last_non_comment_line = line_number;
929	  return;
930
931	case '#':
932	  /* Dispatch macro handling.  */
933	  c = do_getc ();
934	  if (c == EOF)
935	    /* Invalid input.  Be tolerant, no error message.  */
936	    {
937	      op->type = t_other;
938	      return;
939	    }
940
941	  switch (c)
942	    {
943	    case '^':
944	      c = do_getc ();
945	      if (c == '^')
946		c = do_getc ();
947	      if (c == '[')
948		{
949		  /* Read a char table, same syntax as a vector.  */
950		  for (;;)
951		    {
952		      struct object inner;
953
954		      read_object (&inner, false, new_backquote_flag,
955				   null_context);
956
957		      /* Recognize end of vector.  */
958		      if (inner.type == t_vectorclose)
959			{
960			  op->type = t_other;
961			  last_non_comment_line = line_number;
962			  return;
963			}
964
965		      /* Dots and ')' are not allowed.  But be tolerant.  */
966
967		      /* EOF inside vector is illegal.  But be tolerant.  */
968		      if (inner.type == t_eof)
969			break;
970
971		      free_object (&inner);
972		    }
973		  op->type = t_other;
974		  last_non_comment_line = line_number;
975		  return;
976		}
977	      else
978		/* Invalid input.  Be tolerant, no error message.  */
979		{
980		  op->type = t_other;
981		  if (c != EOF)
982		    last_non_comment_line = line_number;
983		  return;
984		}
985
986	    case '&':
987	      /* Read a bit vector.  */
988	      {
989		struct object length;
990		read_object (&length, first_in_list, new_backquote_flag,
991			     null_context);
992		/* Dots and EOF are not allowed here.
993		   But be tolerant.  */
994		free_object (&length);
995	      }
996	      c = do_getc ();
997	      if (c == '"')
998		{
999		  struct object string;
1000		  read_object (&string, first_in_list, new_backquote_flag,
1001			       null_context);
1002		  free_object (&string);
1003		}
1004	      else
1005		/* Invalid input.  Be tolerant, no error message.  */
1006		do_ungetc (c);
1007	      op->type = t_other;
1008	      last_non_comment_line = line_number;
1009	      return;
1010
1011	    case '[':
1012	      /* Read a compiled function, same syntax as a vector.  */
1013	    case '(':
1014	      /* Read a string with properties, same syntax as a list.  */
1015	      {
1016		struct object inner;
1017		do_ungetc (c);
1018		read_object (&inner, false, new_backquote_flag, null_context);
1019		/* Dots and EOF are not allowed here.
1020		   But be tolerant.  */
1021		free_object (&inner);
1022		op->type = t_other;
1023		last_non_comment_line = line_number;
1024		return;
1025	      }
1026
1027	    case '@':
1028	      /* Read a comment consisting of a given number of bytes.  */
1029	      {
1030		unsigned int nskip = 0;
1031
1032		for (;;)
1033		  {
1034		    c = do_getc ();
1035		    if (!(c >= '0' && c <= '9'))
1036		      break;
1037		    nskip = 10 * nskip + (c - '0');
1038		  }
1039		if (c != EOF)
1040		  {
1041		    do_ungetc (c);
1042		    for (; nskip > 0; nskip--)
1043		      if (do_getc () == EOF)
1044			break;
1045		  }
1046		continue;
1047	      }
1048
1049	    case '$':
1050	      op->type = t_other;
1051	      last_non_comment_line = line_number;
1052	      return;
1053
1054	    case '\'':
1055	    case ':':
1056	    case 'S': case 's': /* XEmacs only */
1057	      {
1058		struct object inner;
1059		read_object (&inner, false, new_backquote_flag, null_context);
1060		/* Dots and EOF are not allowed here.
1061		   But be tolerant.  */
1062		free_object (&inner);
1063		op->type = t_other;
1064		last_non_comment_line = line_number;
1065		return;
1066	      }
1067
1068	    case '0': case '1': case '2': case '3': case '4':
1069	    case '5': case '6': case '7': case '8': case '9':
1070	      /* Read Common Lisp style #n# or #n=.  */
1071	      for (;;)
1072		{
1073		  c = do_getc ();
1074		  if (!(c >= '0' && c <= '9'))
1075		    break;
1076		}
1077	      if (c == EOF)
1078		/* Invalid input.  Be tolerant, no error message.  */
1079		{
1080		  op->type = t_other;
1081		  return;
1082		}
1083	      if (c == '=')
1084		{
1085		  read_object (op, false, new_backquote_flag, outer_context);
1086		  last_non_comment_line = line_number;
1087		  return;
1088		}
1089	      if (c == '#')
1090		{
1091		  op->type = t_other;
1092		  last_non_comment_line = line_number;
1093		  return;
1094		}
1095	      if (c == 'R' || c == 'r')
1096		{
1097		  /* Read an integer.  */
1098		  c = do_getc ();
1099		  if (c == '+' || c == '-')
1100		    c = do_getc ();
1101		  for (; c != EOF; c = do_getc ())
1102		    if (!c_isalnum (c))
1103		      {
1104			do_ungetc (c);
1105			break;
1106		      }
1107		  op->type = t_other;
1108		  last_non_comment_line = line_number;
1109		  return;
1110		}
1111	      /* Invalid input.  Be tolerant, no error message.  */
1112	      op->type = t_other;
1113	      last_non_comment_line = line_number;
1114	      return;
1115
1116	    case 'X': case 'x':
1117	    case 'O': case 'o':
1118	    case 'B': case 'b':
1119	      {
1120		/* Read an integer.  */
1121		c = do_getc ();
1122		if (c == '+' || c == '-')
1123		  c = do_getc ();
1124		for (; c != EOF; c = do_getc ())
1125		  if (!c_isalnum (c))
1126		    {
1127		      do_ungetc (c);
1128		      break;
1129		    }
1130		op->type = t_other;
1131		last_non_comment_line = line_number;
1132		return;
1133	      }
1134
1135	    case '*': /* XEmacs only */
1136	      {
1137		/* Read a bit-vector.  */
1138		do
1139		  c = do_getc ();
1140		while (c == '0' || c == '1');
1141		if (c != EOF)
1142		  do_ungetc (c);
1143		op->type = t_other;
1144		last_non_comment_line = line_number;
1145		return;
1146	      }
1147
1148	    case '+': /* XEmacs only */
1149	    case '-': /* XEmacs only */
1150	      /* Simply assume every feature expression is true.  */
1151	      {
1152		struct object inner;
1153		read_object (&inner, false, new_backquote_flag, null_context);
1154		/* Dots and EOF are not allowed here.
1155		   But be tolerant.  */
1156		free_object (&inner);
1157		continue;
1158	      }
1159
1160	    default:
1161	      /* Invalid input.  Be tolerant, no error message.  */
1162	      op->type = t_other;
1163	      last_non_comment_line = line_number;
1164	      return;
1165	    }
1166
1167	  /*NOTREACHED*/
1168	  abort ();
1169
1170	case '.':
1171	  c = do_getc ();
1172	  if (c != EOF)
1173	    {
1174	      do_ungetc (c);
1175	      if (c <= ' ' /* FIXME: Assumes ASCII compatible encoding */
1176		  || strchr ("\"'`,(", c) != NULL)
1177		{
1178		  op->type = t_dot;
1179		  last_non_comment_line = line_number;
1180		  return;
1181		}
1182	    }
1183	  c = '.';
1184	  /*FALLTHROUGH*/
1185	default:
1186	default_label:
1187	  if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
1188	    continue;
1189	  /* Read a token.  */
1190	  {
1191	    bool symbol;
1192
1193	    op->token = (struct token *) xmalloc (sizeof (struct token));
1194	    symbol = read_token (op->token, c);
1195	    if (symbol)
1196	      {
1197		op->type = t_symbol;
1198		last_non_comment_line = line_number;
1199		return;
1200	      }
1201	    else
1202	      {
1203		free_token (op->token);
1204		free (op->token);
1205		op->type = t_other;
1206		last_non_comment_line = line_number;
1207		return;
1208	      }
1209	  }
1210	}
1211    }
1212}
1213
1214
1215void
1216extract_elisp (FILE *f,
1217	       const char *real_filename, const char *logical_filename,
1218	       flag_context_list_table_ty *flag_table,
1219	       msgdomain_list_ty *mdlp)
1220{
1221  mlp = mdlp->item[0]->messages;
1222
1223  fp = f;
1224  real_file_name = real_filename;
1225  logical_file_name = xstrdup (logical_filename);
1226  line_number = 1;
1227
1228  last_comment_line = -1;
1229  last_non_comment_line = -1;
1230
1231  flag_context_list_table = flag_table;
1232
1233  init_keywords ();
1234
1235  /* Eat tokens until eof is seen.  When read_object returns
1236     due to an unbalanced closing parenthesis, just restart it.  */
1237  do
1238    {
1239      struct object toplevel_object;
1240
1241      read_object (&toplevel_object, false, false, null_context);
1242
1243      if (toplevel_object.type == t_eof)
1244	break;
1245
1246      free_object (&toplevel_object);
1247    }
1248  while (!feof (fp));
1249
1250  /* Close scanner.  */
1251  fp = NULL;
1252  real_file_name = NULL;
1253  logical_file_name = NULL;
1254  line_number = 0;
1255}
1256