1/* xgettext Scheme backend.
2   Copyright (C) 2004-2006 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
24#include <errno.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#include "message.h"
31#include "xgettext.h"
32#include "x-scheme.h"
33#include "error.h"
34#include "xalloc.h"
35#include "exit.h"
36#include "hash.h"
37#include "gettext.h"
38
39#define _(s) gettext(s)
40
41
42/* The Scheme syntax is described in R5RS.  It is implemented in
43   guile-1.6.4/libguile/read.c.
44   Since we are interested only in strings and in forms similar to
45        (gettext msgid ...)
46   or   (ngettext msgid msgid_plural ...)
47   we make the following simplifications:
48
49   - Assume the keywords and strings are in an ASCII compatible encoding.
50     This means we can read the input file one byte at a time, instead of
51     one character at a time.  No need to worry about multibyte characters:
52     If they occur as part of identifiers, they most probably act as
53     constituent characters, and the byte based approach will do the same.
54
55   - Assume the read-hash-procedures is in the default state.
56     Non-standard reader extensions are mostly used to read data, not programs.
57
58   The remaining syntax rules are:
59
60   - The syntax code assigned to each character, and how tokens are built
61     up from characters (single escape, multiple escape etc.).
62
63   - Comment syntax: ';' and '#! ... \n!#\n'.
64
65   - String syntax: "..." with single escapes.
66
67   - Read macros and dispatch macro character '#'.  Needed to be able to
68     tell which is the n-th argument of a function call.
69
70 */
71
72
73/* ====================== Keyword set customization.  ====================== */
74
75/* If true extract all strings.  */
76static bool extract_all = false;
77
78static hash_table keywords;
79static bool default_keywords = true;
80
81
82void
83x_scheme_extract_all ()
84{
85  extract_all = true;
86}
87
88
89void
90x_scheme_keyword (const char *name)
91{
92  if (name == NULL)
93    default_keywords = false;
94  else
95    {
96      const char *end;
97      struct callshape shape;
98      const char *colon;
99
100      if (keywords.table == NULL)
101	hash_init (&keywords, 100);
102
103      split_keywordspec (name, &end, &shape);
104
105      /* The characters between name and end should form a valid Lisp symbol.
106	 Extract the symbol name part.  */
107      colon = strchr (name, ':');
108      if (colon != NULL && colon < end)
109	{
110	  name = colon + 1;
111	  if (name < end && *name == ':')
112	    name++;
113	  colon = strchr (name, ':');
114	  if (colon != NULL && colon < end)
115	    return;
116	}
117
118      insert_keyword_callshape (&keywords, name, end - name, &shape);
119    }
120}
121
122/* Finish initializing the keywords hash table.
123   Called after argument processing, before each file is processed.  */
124static void
125init_keywords ()
126{
127  if (default_keywords)
128    {
129      /* When adding new keywords here, also update the documentation in
130	 xgettext.texi!  */
131      x_scheme_keyword ("gettext");		/* libguile/i18n.c */
132      x_scheme_keyword ("ngettext:1,2");	/* libguile/i18n.c */
133      x_scheme_keyword ("gettext-noop");
134      default_keywords = false;
135    }
136}
137
138void
139init_flag_table_scheme ()
140{
141  xgettext_record_flag ("gettext:1:pass-scheme-format");
142  xgettext_record_flag ("ngettext:1:pass-scheme-format");
143  xgettext_record_flag ("ngettext:2:pass-scheme-format");
144  xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
145  xgettext_record_flag ("format:2:scheme-format");
146}
147
148
149/* ======================== Reading of characters.  ======================== */
150
151/* Real filename, used in error messages about the input file.  */
152static const char *real_file_name;
153
154/* Logical filename and line number, used to label the extracted messages.  */
155static char *logical_file_name;
156static int line_number;
157
158/* The input file stream.  */
159static FILE *fp;
160
161
162/* Fetch the next character from the input file.  */
163static int
164do_getc ()
165{
166  int c = getc (fp);
167
168  if (c == EOF)
169    {
170      if (ferror (fp))
171	error (EXIT_FAILURE, errno, _("\
172error while reading \"%s\""), real_file_name);
173    }
174  else if (c == '\n')
175   line_number++;
176
177  return c;
178}
179
180/* Put back the last fetched character, not EOF.  */
181static void
182do_ungetc (int c)
183{
184  if (c == '\n')
185    line_number--;
186  ungetc (c, fp);
187}
188
189
190/* ========================== Reading of tokens.  ========================== */
191
192
193/* A token consists of a sequence of characters.  */
194struct token
195{
196  int allocated;		/* number of allocated 'token_char's */
197  int charcount;		/* number of used 'token_char's */
198  char *chars;			/* the token's constituents */
199};
200
201/* Initialize a 'struct token'.  */
202static inline void
203init_token (struct token *tp)
204{
205  tp->allocated = 10;
206  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
207  tp->charcount = 0;
208}
209
210/* Free the memory pointed to by a 'struct token'.  */
211static inline void
212free_token (struct token *tp)
213{
214  free (tp->chars);
215}
216
217/* Ensure there is enough room in the token for one more character.  */
218static inline void
219grow_token (struct token *tp)
220{
221  if (tp->charcount == tp->allocated)
222    {
223      tp->allocated *= 2;
224      tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
225    }
226}
227
228/* Read the next token.  'first' is the first character, which has already
229   been read.  */
230static void
231read_token (struct token *tp, int first)
232{
233  init_token (tp);
234
235  grow_token (tp);
236  tp->chars[tp->charcount++] = first;
237
238  for (;;)
239    {
240      int c = do_getc ();
241
242      if (c == EOF)
243	break;
244      if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
245	  || c == '"' || c == '(' || c == ')' || c == ';')
246	{
247	  do_ungetc (c);
248	  break;
249	}
250      grow_token (tp);
251      tp->chars[tp->charcount++] = c;
252    }
253}
254
255/* Tests if a token represents an integer.
256   Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int().  */
257static inline bool
258is_integer_syntax (const char *str, int len, int radix)
259{
260  const char *p = str;
261  const char *p_end = str + len;
262
263  /* The accepted syntax is
264       ['+'|'-'] DIGIT+
265     where DIGIT is a hexadecimal digit whose value is below radix.  */
266
267  if (p == p_end)
268    return false;
269  if (*p == '+' || *p == '-')
270    {
271      p++;
272      if (p == p_end)
273	return false;
274    }
275  do
276    {
277      int c = *p++;
278
279      if (c >= '0' && c <= '9')
280	c = c - '0';
281      else if (c >= 'A' && c <= 'F')
282	c = c - 'A' + 10;
283      else if (c >= 'a' && c <= 'f')
284	c = c - 'a' + 10;
285      else
286	return false;
287      if (c >= radix)
288	return false;
289    }
290  while (p < p_end);
291  return true;
292}
293
294/* Tests if a token represents a rational, floating-point or complex number.
295   If unconstrained is false, only real numbers are accepted; otherwise,
296   complex numbers are accepted as well.
297   Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo().  */
298static inline bool
299is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
300{
301  const char *p = str;
302  const char *p_end = str + len;
303  bool seen_sign;
304  bool seen_digits;
305
306  /* The accepted syntaxes are:
307     for a floating-point number:
308       ['+'|'-'] DIGIT+ [EXPONENT]
309       ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
310       where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
311       (Dot and exponent are allowed only if radix is 10.)
312     for a rational number:
313       ['+'|'-'] DIGIT+ '/' DIGIT+
314     for a complex number:
315       REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
316       REAL-NUMBER {'+'|'-'} 'i'
317       {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
318       {'+'|'-'} 'i'
319       REAL-NUMBER '@' REAL-NUMBER
320   */
321  if (p == p_end)
322    return false;
323  /* Parse leading sign.  */
324  seen_sign = false;
325  if (*p == '+' || *p == '-')
326    {
327      p++;
328      if (p == p_end)
329	return false;
330      seen_sign = true;
331      /* Recognize complex number syntax: {'+'|'-'} 'i'  */
332      if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
333	return true;
334    }
335  /* Parse digits before dot or exponent or slash.  */
336  seen_digits = false;
337  do
338    {
339      int c = *p;
340
341      if (c >= '0' && c <= '9')
342	c = c - '0';
343      else if (c >= 'A' && c <= 'F')
344	{
345	  if (c >= 'D' && radix == 10) /* exponent? */
346	    break;
347	  c = c - 'A' + 10;
348	}
349      else if (c >= 'a' && c <= 'f')
350	{
351	  if (c >= 'd' && radix == 10) /* exponent? */
352	    break;
353	  c = c - 'a' + 10;
354	}
355      else
356	break;
357      if (c >= radix)
358	return false;
359      seen_digits = true;
360      p++;
361    }
362  while (p < p_end);
363  /* If p == p_end, we know that seen_digits = true, and the number is an
364     integer without exponent.  */
365  if (p < p_end)
366    {
367      /* If we have no digits so far, we need a decimal point later.  */
368      if (!seen_digits && !(*p == '.' && radix == 10))
369	return false;
370      /* Trailing '#' signs are equivalent to zeroes.  */
371      while (p < p_end && *p == '#')
372	p++;
373      if (p < p_end)
374	{
375	  if (*p == '/')
376	    {
377	      /* Parse digits after the slash.  */
378	      bool all_zeroes = true;
379	      p++;
380	      for (; p < p_end; p++)
381		{
382		  int c = *p;
383
384		  if (c >= '0' && c <= '9')
385		    c = c - '0';
386		  else if (c >= 'A' && c <= 'F')
387		    c = c - 'A' + 10;
388		  else if (c >= 'a' && c <= 'f')
389		    c = c - 'a' + 10;
390		  else
391		    break;
392		  if (c >= radix)
393		    return false;
394		  if (c != 0)
395		    all_zeroes = false;
396		}
397	      /* A zero denominator is not allowed.  */
398	      if (all_zeroes)
399		return false;
400	      /* Trailing '#' signs are equivalent to zeroes.  */
401	      while (p < p_end && *p == '#')
402		p++;
403	    }
404	  else
405	    {
406	      if (*p == '.')
407		{
408		  /* Decimal point notation.  */
409		  if (radix != 10)
410		    return false;
411		  /* Parse digits after the decimal point.  */
412		  p++;
413		  for (; p < p_end; p++)
414		    {
415		      int c = *p;
416
417		      if (c >= '0' && c <= '9')
418			seen_digits = true;
419		      else
420			break;
421		    }
422		  /* Digits are required before or after the decimal point.  */
423		  if (!seen_digits)
424		    return false;
425		  /* Trailing '#' signs are equivalent to zeroes.  */
426		  while (p < p_end && *p == '#')
427		    p++;
428		}
429	      if (p < p_end)
430		{
431		  /* Parse exponent.  */
432		  switch (*p)
433		    {
434		    case 'D': case 'd':
435		    case 'E': case 'e':
436		    case 'F': case 'f':
437		    case 'L': case 'l':
438		    case 'S': case 's':
439		      if (radix != 10)
440			return false;
441		      p++;
442		      if (p == p_end)
443			return false;
444		      if (*p == '+' || *p == '-')
445			{
446			  p++;
447			  if (p == p_end)
448			    return false;
449			}
450		      if (!(*p >= '0' && *p <= '9'))
451			return false;
452		      for (;;)
453			{
454			  p++;
455			  if (p == p_end)
456			    break;
457			  if (!(*p >= '0' && *p <= '9'))
458			    break;
459			}
460		      break;
461		    default:
462		      break;
463		    }
464		}
465	    }
466	}
467    }
468  if (p == p_end)
469    return true;
470  /* Recognize complex number syntax.  */
471  if (unconstrained)
472    {
473      /* Recognize the syntax  {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'  */
474      if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
475	return true;
476      /* Recognize the syntaxes
477	   REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
478	   REAL-NUMBER {'+'|'-'} 'i'
479       */
480      if (*p == '+' || *p == '-')
481	return (p_end[-1] == 'I' || p_end[-1] == 'i')
482		&& (p + 1 == p_end - 1
483		    || is_other_number_syntax (p, p_end - 1 - p, radix, false));
484      /* Recognize the syntax  REAL-NUMBER '@' REAL-NUMBER  */
485      if (*p == '@')
486	{
487	  p++;
488	  return is_other_number_syntax (p, p_end - p, radix, false);
489	}
490    }
491  return false;
492}
493
494/* Tests if a token represents a number.
495   Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number().  */
496static bool
497is_number (const struct token *tp)
498{
499  const char *str = tp->chars;
500  int len = tp->charcount;
501  int radix = 10;
502  enum { unknown, exact, inexact } exactness = unknown;
503  bool seen_radix_prefix = false;
504  bool seen_exactness_prefix = false;
505
506  if (len == 1)
507    if (*str == '+' || *str == '-')
508      return false;
509  while (len >= 2 && *str == '#')
510    {
511      switch (str[1])
512	{
513	case 'B': case 'b':
514	  if (seen_radix_prefix)
515	    return false;
516	  radix = 2;
517	  seen_radix_prefix = true;
518	  break;
519	case 'O': case 'o':
520	  if (seen_radix_prefix)
521	    return false;
522	  radix = 8;
523	  seen_radix_prefix = true;
524	  break;
525	case 'D': case 'd':
526	  if (seen_radix_prefix)
527	    return false;
528	  radix = 10;
529	  seen_radix_prefix = true;
530	  break;
531	case 'X': case 'x':
532	  if (seen_radix_prefix)
533	    return false;
534	  radix = 16;
535	  seen_radix_prefix = true;
536	  break;
537	case 'E': case 'e':
538	  if (seen_exactness_prefix)
539	    return false;
540	  exactness = exact;
541	  seen_exactness_prefix = true;
542	  break;
543	case 'I': case 'i':
544	  if (seen_exactness_prefix)
545	    return false;
546	  exactness = inexact;
547	  seen_exactness_prefix = true;
548	  break;
549	default:
550	  return false;
551	}
552      str += 2;
553      len -= 2;
554    }
555  if (exactness != inexact)
556    {
557      /* Try to parse an integer.  */
558      if (is_integer_syntax (str, len, 10))
559	return true;
560      /* FIXME: Other Scheme implementations support exact rational numbers
561	 or exact complex numbers.  */
562    }
563  if (exactness != exact)
564    {
565      /* Try to parse a rational, floating-point or complex number.  */
566      if (is_other_number_syntax (str, len, 10, true))
567	return true;
568    }
569  return false;
570}
571
572
573/* ========================= Accumulating comments ========================= */
574
575
576static char *buffer;
577static size_t bufmax;
578static size_t buflen;
579
580static inline void
581comment_start ()
582{
583  buflen = 0;
584}
585
586static inline void
587comment_add (int c)
588{
589  if (buflen >= bufmax)
590    {
591      bufmax = 2 * bufmax + 10;
592      buffer = xrealloc (buffer, bufmax);
593    }
594  buffer[buflen++] = c;
595}
596
597static inline void
598comment_line_end (size_t chars_to_remove)
599{
600  buflen -= chars_to_remove;
601  while (buflen >= 1
602	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
603    --buflen;
604  if (chars_to_remove == 0 && buflen >= bufmax)
605    {
606      bufmax = 2 * bufmax + 10;
607      buffer = xrealloc (buffer, bufmax);
608    }
609  buffer[buflen] = '\0';
610  savable_comment_add (buffer);
611}
612
613
614/* These are for tracking whether comments count as immediately before
615   keyword.  */
616static int last_comment_line;
617static int last_non_comment_line;
618
619
620/* ========================= Accumulating messages ========================= */
621
622
623static message_list_ty *mlp;
624
625
626/* ========================== Reading of objects.  ========================= */
627
628
629/* We are only interested in symbols (e.g. gettext or ngettext) and strings.
630   Other objects need not to be represented precisely.  */
631enum object_type
632{
633  t_symbol,	/* symbol */
634  t_string,	/* string */
635  t_other,	/* other kind of real object */
636  t_dot,	/* '.' pseudo object */
637  t_close,	/* ')' pseudo object */
638  t_eof		/* EOF marker */
639};
640
641struct object
642{
643  enum object_type type;
644  struct token *token;		/* for t_symbol and t_string */
645  int line_number_at_start;	/* for t_string */
646};
647
648/* Free the memory pointed to by a 'struct object'.  */
649static inline void
650free_object (struct object *op)
651{
652  if (op->type == t_symbol || op->type == t_string)
653    {
654      free_token (op->token);
655      free (op->token);
656    }
657}
658
659/* Convert a t_symbol/t_string token to a char*.  */
660static char *
661string_of_object (const struct object *op)
662{
663  char *str;
664  int n;
665
666  if (!(op->type == t_symbol || op->type == t_string))
667    abort ();
668  n = op->token->charcount;
669  str = (char *) xmalloc (n + 1);
670  memcpy (str, op->token->chars, n);
671  str[n] = '\0';
672  return str;
673}
674
675/* Context lookup table.  */
676static flag_context_list_table_ty *flag_context_list_table;
677
678/* Read the next object.  */
679static void
680read_object (struct object *op, flag_context_ty outer_context)
681{
682  for (;;)
683    {
684      int c = do_getc ();
685
686      switch (c)
687	{
688	case EOF:
689	  op->type = t_eof;
690	  return;
691
692	case ' ': case '\r': case '\f': case '\t':
693	  continue;
694
695	case '\n':
696	  /* Comments assumed to be grouped with a message must immediately
697	     precede it, with no non-whitespace token on a line between
698	     both.  */
699	  if (last_non_comment_line > last_comment_line)
700	    savable_comment_reset ();
701	  continue;
702
703	case ';':
704	  {
705	    bool all_semicolons = true;
706
707	    last_comment_line = line_number;
708	    comment_start ();
709	    for (;;)
710	      {
711		c = do_getc ();
712		if (c == EOF || c == '\n')
713		  break;
714		if (c != ';')
715		  all_semicolons = false;
716		if (!all_semicolons)
717		  {
718		    /* We skip all leading white space, but not EOLs.  */
719		    if (!(buflen == 0 && (c == ' ' || c == '\t')))
720		      comment_add (c);
721		  }
722	      }
723	    comment_line_end (0);
724	    continue;
725	  }
726
727	case '(':
728	  {
729	     int arg = 0;		/* Current argument number.  */
730	     flag_context_list_iterator_ty context_iter;
731	    const struct callshapes *shapes = NULL;
732	    struct arglist_parser *argparser = NULL;
733
734	     for (;; arg++)
735	       {
736		struct object inner;
737		flag_context_ty inner_context;
738
739		if (arg == 0)
740		  inner_context = null_context;
741		else
742		  inner_context =
743		    inherited_context (outer_context,
744				       flag_context_list_iterator_advance (
745					 &context_iter));
746
747		read_object (&inner, inner_context);
748
749		/* Recognize end of list.  */
750		if (inner.type == t_close)
751		  {
752		    op->type = t_other;
753		    last_non_comment_line = line_number;
754		    if (argparser != NULL)
755		      arglist_parser_done (argparser, arg);
756		    return;
757		  }
758
759		/* Dots are not allowed in every position.
760		   But be tolerant.  */
761
762		/* EOF inside list is illegal.
763		   But be tolerant.  */
764		if (inner.type == t_eof)
765		  break;
766
767		if (arg == 0)
768		  {
769		    /* This is the function position.  */
770		    if (inner.type == t_symbol)
771		      {
772			char *symbol_name = string_of_object (&inner);
773			void *keyword_value;
774
775			if (hash_find_entry (&keywords,
776					     symbol_name, strlen (symbol_name),
777					     &keyword_value)
778			    == 0)
779			  shapes = (const struct callshapes *) keyword_value;
780
781			argparser = arglist_parser_alloc (mlp, shapes);
782
783			context_iter =
784			  flag_context_list_iterator (
785			    flag_context_list_table_lookup (
786			      flag_context_list_table,
787			      symbol_name, strlen (symbol_name)));
788
789			free (symbol_name);
790		      }
791		    else
792		      context_iter = null_context_list_iterator;
793		  }
794		else
795		  {
796		    /* These are the argument positions.  */
797		    if (argparser != NULL && inner.type == t_string)
798		      arglist_parser_remember (argparser, arg,
799					       string_of_object (&inner),
800					       inner_context,
801					       logical_file_name,
802					       inner.line_number_at_start,
803					       savable_comment);
804		  }
805
806		free_object (&inner);
807	      }
808	    if (argparser != NULL)
809	      arglist_parser_done (argparser, arg);
810	  }
811	  op->type = t_other;
812	  last_non_comment_line = line_number;
813	  return;
814
815	case ')':
816	  /* Tell the caller about the end of list.
817	     Unmatched closing parenthesis is illegal.
818	     But be tolerant.  */
819	  op->type = t_close;
820	  last_non_comment_line = line_number;
821	  return;
822
823	case ',':
824	  {
825	    int c = do_getc ();
826	    /* The ,@ handling inside lists is wrong anyway, because
827	       ,@form expands to an unknown number of elements.  */
828	    if (c != EOF && c != '@')
829	      do_ungetc (c);
830	  }
831	  /*FALLTHROUGH*/
832	case '\'':
833	case '`':
834	  {
835	    struct object inner;
836
837	    read_object (&inner, null_context);
838
839	    /* Dots and EOF are not allowed here.  But be tolerant.  */
840
841	    free_object (&inner);
842
843	    op->type = t_other;
844	    last_non_comment_line = line_number;
845	    return;
846	  }
847
848	case '#':
849	  /* Dispatch macro handling.  */
850	  {
851	    c = do_getc ();
852	    if (c == EOF)
853	      /* Invalid input.  Be tolerant, no error message.  */
854	      {
855		op->type = t_other;
856		return;
857	      }
858
859	    switch (c)
860	      {
861	      case '(': /* Vector */
862		do_ungetc (c);
863		{
864		  struct object inner;
865		  read_object (&inner, null_context);
866		  /* Dots and EOF are not allowed here.
867		     But be tolerant.  */
868		  free_object (&inner);
869		  op->type = t_other;
870		  last_non_comment_line = line_number;
871		  return;
872		}
873
874	      case 'T': case 't': /* Boolean true */
875	      case 'F': case 'f': /* Boolean false */
876		op->type = t_other;
877		last_non_comment_line = line_number;
878		return;
879
880	      case 'B': case 'b':
881	      case 'O': case 'o':
882	      case 'D': case 'd':
883	      case 'X': case 'x':
884	      case 'E': case 'e':
885	      case 'I': case 'i':
886		{
887		  struct token token;
888		  do_ungetc (c);
889		  read_token (&token, '#');
890		  if (is_number (&token))
891		    {
892		      /* A number.  */
893		      free_token (&token);
894		      op->type = t_other;
895		      last_non_comment_line = line_number;
896		      return;
897		    }
898		  else
899		    {
900		      if (token.charcount == 2
901			  && (token.chars[1] == 'e' || token.chars[1] == 'i'))
902			{
903			  c = do_getc ();
904			  if (c != EOF)
905			    do_ungetc (c);
906			  if (c == '(')
907			    /* Homogenous vector syntax, see arrays.scm.  */
908			    case 'a':	/* Vectors of char */
909			    case 'c':	/* Vectors of complex */
910			  /*case 'e':*/	/* Vectors of long */
911			    case 'h':	/* Vectors of short */
912			  /*case 'i':*/	/* Vectors of double-float */
913			    case 'l':	/* Vectors of long long */
914			    case 's':	/* Vectors of single-float */
915			    case 'u':	/* Vectors of unsigned long */
916			    case 'y':	/* Vectors of byte */
917			      {
918				struct object inner;
919				read_object (&inner, null_context);
920				/* Dots and EOF are not allowed here.
921				   But be tolerant.  */
922				free_token (&token);
923				free_object (&inner);
924				op->type = t_other;
925				last_non_comment_line = line_number;
926				return;
927			      }
928			}
929		      /* Unknown # object.  But be tolerant.  */
930		      free_token (&token);
931		      op->type = t_other;
932		      last_non_comment_line = line_number;
933		      return;
934		    }
935		}
936
937	      case '!':
938		/* Block comment '#! ... \n!#\n'.  We don't extract it
939		   because it's only used to introduce scripts on Unix.  */
940		{
941		  int last1 = 0;
942		  int last2 = 0;
943		  int last3 = 0;
944
945		  for (;;)
946		    {
947		      c = do_getc ();
948		      if (c == EOF)
949			/* EOF is not allowed here.  But be tolerant.  */
950			break;
951		      if (last3 == '\n' && last2 == '!' && last1 == '#'
952			  && c == '\n')
953			break;
954		      last3 = last2;
955		      last2 = last1;
956		      last1 = c;
957		    }
958		  continue;
959		}
960
961	      case '*':
962		/* Bit vector.  */
963		{
964		  struct token token;
965		  read_token (&token, c);
966		  /* The token should consists only of '0' and '1', except
967		     for the initial '*'.  But be tolerant.  */
968		  free_token (&token);
969		  op->type = t_other;
970		  last_non_comment_line = line_number;
971		  return;
972		}
973
974	      case '{':
975		/* Symbol with multiple escapes: #{...}#  */
976		{
977		  op->token = (struct token *) xmalloc (sizeof (struct token));
978
979		  init_token (op->token);
980
981		  for (;;)
982		    {
983		      c = do_getc ();
984
985		      if (c == EOF)
986			break;
987		      if (c == '\\')
988			{
989			  c = do_getc ();
990			  if (c == EOF)
991			    break;
992			}
993		      else if (c == '}')
994			{
995			  c = do_getc ();
996			  if (c == '#')
997			    break;
998			  if (c != EOF)
999			    do_ungetc (c);
1000			  c = '}';
1001			}
1002		      grow_token (op->token);
1003		      op->token->chars[op->token->charcount++] = c;
1004		    }
1005
1006		  op->type = t_symbol;
1007		  last_non_comment_line = line_number;
1008		  return;
1009		}
1010
1011	      case '\\':
1012		/* Character.  */
1013		{
1014		  struct token token;
1015		  c = do_getc ();
1016		  if (c != EOF)
1017		    {
1018		      read_token (&token, c);
1019		      free_token (&token);
1020		    }
1021		  op->type = t_other;
1022		  last_non_comment_line = line_number;
1023		  return;
1024		}
1025
1026	      case ':': /* Keyword.  */
1027	      case '&': /* Deprecated keyword, installed in optargs.scm.  */
1028		{
1029		  struct token token;
1030		  read_token (&token, '-');
1031		  free_token (&token);
1032		  op->type = t_other;
1033		  last_non_comment_line = line_number;
1034		  return;
1035		}
1036
1037	      /* The following are installed through read-hash-extend.  */
1038
1039	      /* arrays.scm */
1040	      case '0': case '1': case '2': case '3': case '4':
1041	      case '5': case '6': case '7': case '8': case '9':
1042		/* Multidimensional array syntax: #nx(...) where
1043		     n ::= DIGIT+
1044		     x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1045		 */
1046		do
1047		  c = do_getc ();
1048		while (c >= '0' && c <= '9');
1049		/* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1050		   But be tolerant.  */
1051		/*FALLTHROUGH*/
1052	      case '\'': /* boot-9.scm */
1053	      case '.': /* boot-9.scm */
1054	      case ',': /* srfi-10.scm */
1055		{
1056		  struct object inner;
1057		  read_object (&inner, null_context);
1058		  /* Dots and EOF are not allowed here.
1059		     But be tolerant.  */
1060		  free_object (&inner);
1061		  op->type = t_other;
1062		  last_non_comment_line = line_number;
1063		  return;
1064		}
1065
1066	      default:
1067		/* Unknown.  */
1068		op->type = t_other;
1069		last_non_comment_line = line_number;
1070		return;
1071	      }
1072	    /*NOTREACHED*/
1073	    abort ();
1074	  }
1075
1076	case '"':
1077	  {
1078	    op->token = (struct token *) xmalloc (sizeof (struct token));
1079	    init_token (op->token);
1080	    op->line_number_at_start = line_number;
1081	    for (;;)
1082	      {
1083		int c = do_getc ();
1084		if (c == EOF)
1085		  /* Invalid input.  Be tolerant, no error message.  */
1086		  break;
1087		if (c == '"')
1088		  break;
1089		if (c == '\\')
1090		  {
1091		    c = do_getc ();
1092		    if (c == EOF)
1093		      /* Invalid input.  Be tolerant, no error message.  */
1094		      break;
1095		    switch (c)
1096		      {
1097		      case '\n':
1098			continue;
1099		      case '0':
1100			c = '\0';
1101			break;
1102		      case 'a':
1103			c = '\a';
1104			break;
1105		      case 'f':
1106			c = '\f';
1107			break;
1108		      case 'n':
1109			c = '\n';
1110			break;
1111		      case 'r':
1112			c = '\r';
1113			break;
1114		      case 't':
1115			c = '\t';
1116			break;
1117		      case 'v':
1118			c = '\v';
1119			break;
1120		      default:
1121			break;
1122		      }
1123		  }
1124		grow_token (op->token);
1125		op->token->chars[op->token->charcount++] = c;
1126	      }
1127	    op->type = t_string;
1128
1129	    if (extract_all)
1130	      {
1131		lex_pos_ty pos;
1132
1133		pos.file_name = logical_file_name;
1134		pos.line_number = op->line_number_at_start;
1135		remember_a_message (mlp, NULL, string_of_object (op),
1136				    null_context, &pos, savable_comment);
1137	      }
1138	    last_non_comment_line = line_number;
1139	    return;
1140	  }
1141
1142	case '0': case '1': case '2': case '3': case '4':
1143	case '5': case '6': case '7': case '8': case '9':
1144	case '+': case '-': case '.':
1145	  /* Read a number or symbol token.  */
1146	  op->token = (struct token *) xmalloc (sizeof (struct token));
1147	  read_token (op->token, c);
1148	  if (op->token->charcount == 1 && op->token->chars[0] == '.')
1149	    {
1150	      free_token (op->token);
1151	      free (op->token);
1152	      op->type = t_dot;
1153	    }
1154	  else if (is_number (op->token))
1155	    {
1156	      /* A number.  */
1157	      free_token (op->token);
1158	      free (op->token);
1159	      op->type = t_other;
1160	    }
1161	  else
1162	    {
1163	      /* A symbol.  */
1164	      op->type = t_symbol;
1165	    }
1166	  last_non_comment_line = line_number;
1167	  return;
1168
1169	case ':':
1170	default:
1171	  /* Read a symbol token.  */
1172	  op->token = (struct token *) xmalloc (sizeof (struct token));
1173	  read_token (op->token, c);
1174	  op->type = t_symbol;
1175	  last_non_comment_line = line_number;
1176	  return;
1177	}
1178    }
1179}
1180
1181
1182void
1183extract_scheme (FILE *f,
1184		const char *real_filename, const char *logical_filename,
1185		flag_context_list_table_ty *flag_table,
1186		msgdomain_list_ty *mdlp)
1187{
1188  mlp = mdlp->item[0]->messages;
1189
1190  fp = f;
1191  real_file_name = real_filename;
1192  logical_file_name = xstrdup (logical_filename);
1193  line_number = 1;
1194
1195  last_comment_line = -1;
1196  last_non_comment_line = -1;
1197
1198  flag_context_list_table = flag_table;
1199
1200  init_keywords ();
1201
1202  /* Eat tokens until eof is seen.  When read_object returns
1203     due to an unbalanced closing parenthesis, just restart it.  */
1204  do
1205    {
1206      struct object toplevel_object;
1207
1208      read_object (&toplevel_object, null_context);
1209
1210      if (toplevel_object.type == t_eof)
1211	break;
1212
1213      free_object (&toplevel_object);
1214    }
1215  while (!feof (fp));
1216
1217  /* Close scanner.  */
1218  fp = NULL;
1219  real_file_name = NULL;
1220  logical_file_name = NULL;
1221  line_number = 0;
1222}
1223