gettext-tools/src/x-lisp.c

/* xgettext Lisp backend.
   Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "x-lisp.h"
#include "error.h"
#include "xalloc.h"
#include "exit.h"
#include "hash.h"
#include "gettext.h"

#define _(s) gettext(s)


/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
   Since we are interested only in strings and in forms similar to
        (gettext msgid ...)
   or   (ngettext msgid msgid_plural ...)
   we make the following simplifications:

   - Assume the keywords and strings are in an ASCII compatible encoding.
     This means we can read the input file one byte at a time, instead of
     one character at a time.  No need to worry about multibyte characters:
     If they occur as part of identifiers, they most probably act as
     constituent characters, and the byte based approach will do the same.

   - Assume the read table is the standard Common Lisp read table.
     Non-standard read tables are mostly used to read data, not programs.

   - Assume the read table case is :UPCASE, and *READ-BASE* is 10.

   - Don't interpret #n= and #n#, they usually don't appear in programs.

   - Don't interpret #+, #-, they are unlikely to appear in a gettext form.

   The remaining syntax rules are:

   - The syntax code assigned to each character, and how tokens are built
     up from characters (single escape, multiple escape etc.).

   - Comment syntax: ';' and '#| ... |#'.

   - String syntax: "..." with single escapes.

   - Read macros and dispatch macro character '#'.  Needed to be able to
     tell which is the n-th argument of a function call.

 */


/* ========================= Lexer customization.  ========================= */

/* 'readtable_case' is the case conversion that is applied to non-escaped
    parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */

enum rtcase
{
  case_upcase,
  case_downcase,
  case_preserve,
  case_invert
};

static enum rtcase readtable_case = case_upcase;

/* 'read_base' is the assumed radix of integers and rational numbers.
   In Common Lisp: *read-base*.  */
static int read_base = 10;

/* 'read_preserve_whitespace' specifies whether a whitespace character
   that terminates a token must be pushed back on the input stream.
   We set it to true, because the special newline side effect in read_object()
   requires that read_object() sees every newline not inside a token.  */
static bool read_preserve_whitespace = true;


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.  */
static bool extract_all = false;

static hash_table keywords;
static bool default_keywords = true;


void
x_lisp_extract_all ()
{
  extract_all = true;
}


void
x_lisp_keyword (const char *name)
{
  if (name == NULL)
    default_keywords = false;
  else
    {
      const char *end;
      struct callshape shape;
      const char *colon;
      size_t len;
      char *symname;
      size_t i;

      if (keywords.table == NULL)
	hash_init (&keywords, 100);

      split_keywordspec (name, &end, &shape);

      /* The characters between name and end should form a valid Lisp symbol.
	 Extract the symbol name part.  */
      colon = strchr (name, ':');
      if (colon != NULL && colon < end)
	{
	  name = colon + 1;
	  if (name < end && *name == ':')
	    name++;
	  colon = strchr (name, ':');
	  if (colon != NULL && colon < end)
	    return;
	}

      /* Uppercase it.  */
      len = end - name;
      symname = (char *) xmalloc (len);
      for (i = 0; i < len; i++)
	symname[i] =
	  (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);

      insert_keyword_callshape (&keywords, symname, len, &shape);
    }
}

/* Finish initializing the keywords hash table.
   Called after argument processing, before each file is processed.  */
static void
init_keywords ()
{
  if (default_keywords)
    {
      /* When adding new keywords here, also update the documentation in
	 xgettext.texi!  */
      x_lisp_keyword ("gettext");	/* I18N:GETTEXT */
      x_lisp_keyword ("ngettext:1,2");	/* I18N:NGETTEXT */
      x_lisp_keyword ("gettext-noop");
      default_keywords = false;
    }
}

void
init_flag_table_lisp ()
{
  xgettext_record_flag ("gettext:1:pass-lisp-format");
  xgettext_record_flag ("ngettext:1:pass-lisp-format");
  xgettext_record_flag ("ngettext:2:pass-lisp-format");
  xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
  xgettext_record_flag ("format:2:lisp-format");
}


/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;


/* Fetch the next character from the input file.  */
static int
do_getc ()
{
  int c = getc (fp);

  if (c == EOF)
    {
      if (ferror (fp))
	error (EXIT_FAILURE, errno, _("\
error while reading \"%s\""), real_file_name);
    }
  else if (c == '\n')
   line_number++;

  return c;
}

/* Put back the last fetched character, not EOF.  */
static void
do_ungetc (int c)
{
  if (c == '\n')
    line_number--;
  ungetc (c, fp);
}


/* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */


/* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */

enum syntax_code
{
  syntax_illegal,	/* non-printable, except whitespace	*/
  syntax_single_esc,	/* '\' (single escape)			*/
  syntax_multi_esc,	/* '|' (multiple escape)		*/
  syntax_constituent,	/* everything else (constituent)	*/
  syntax_whitespace,	/* TAB,LF,FF,CR,' ' (whitespace)	*/
  syntax_eof,		/* EOF					*/
  syntax_t_macro,	/* '()'"' (terminating macro)		*/
  syntax_nt_macro	/* '#' (non-terminating macro)		*/
};

/* Returns the syntax code of a character.  */
static enum syntax_code
syntax_code_of (unsigned char c)
{
  switch (c)
    {
    case '\\':
      return syntax_single_esc;
    case '|':
      return syntax_multi_esc;
    case '\t': case '\n': case '\f': case '\r': case ' ':
      return syntax_whitespace;
    case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
      return syntax_t_macro;
    case '#':
      return syntax_nt_macro;
    default:
      if (c < ' ' && c != '\b')
	return syntax_illegal;
      else
	return syntax_constituent;
    }
}

struct char_syntax
{
  int ch;			/* character */
  enum syntax_code scode;	/* syntax code */
};

/* Returns the next character and its syntax code.  */
static void
read_char_syntax (struct char_syntax *p)
{
  int c = do_getc ();

  p->ch = c;
  p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
}

/* Every character in a token has an attribute assigned.  The attributes
   help during interpretation of the token.  See
   CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
   and CLHS 2.1.4.2 "Constituent Traits".  */

enum attribute
{
  a_illg,	/* invalid constituent */
  a_pack_m,	/* ':' package marker */
  a_alpha,	/* normal alphabetic */
  a_escaped,	/* alphabetic but not subject to case conversion */
  a_ratio,	/* '/' */
  a_dot,	/* '.' */
  a_sign,	/* '+-' */
  a_extens,	/* '_^' extension characters */
  a_digit,	/* '0123456789' */
  a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
  a_expodigit,	/* 'esfdlESFDL' below base */
  a_letter,	/* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
  a_expo	/* 'esfdlESFDL' */
};

#define is_letter_attribute(a) ((a) >= a_letter)
#define is_number_attribute(a) ((a) >= a_ratio)

/* Returns the attribute of a character, assuming base 10.  */
static enum attribute
attribute_of (unsigned char c)
{
  switch (c)
    {
    case ':':
      return a_pack_m;
    case '/':
      return a_ratio;
    case '.':
      return a_dot;
    case '+': case '-':
      return a_sign;
    case '_': case '^':
      return a_extens;
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return a_digit;
    case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
    case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
    case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
      return a_letter;
    case 'e': case 's': case 'd': case 'f': case 'l':
    case 'E': case 'S': case 'D': case 'F': case 'L':
      return a_expo;
    default:
      /* Treat everything as valid.  Never return a_illg.  */
      return a_alpha;
    }
}

struct token_char
{
  unsigned char ch;		/* character */
  unsigned char attribute;	/* attribute */
};

/* A token consists of a sequence of characters with associated attribute.  */
struct token
{
  int allocated;		/* number of allocated 'token_char's */
  int charcount;		/* number of used 'token_char's */
  struct token_char *chars;	/* the token's constituents */
  bool with_escape;		/* whether single-escape or multiple escape occurs */
};

/* Initialize a 'struct token'.  */
static inline void
init_token (struct token *tp)
{
  tp->allocated = 10;
  tp->chars =
    (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
  tp->charcount = 0;
}

/* Free the memory pointed to by a 'struct token'.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
}

/* Ensure there is enough room in the token for one more character.  */
static inline void
grow_token (struct token *tp)
{
  if (tp->charcount == tp->allocated)
    {
      tp->allocated *= 2;
      tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
    }
}

/* Read the next token.  If 'first' is given, it points to the first
   character, which has already been read.
   The algorithm follows CLHS 2.2 "Reader Algorithm".  */
static void
read_token (struct token *tp, const struct char_syntax *first)
{
  bool multiple_escape_flag;
  struct char_syntax curr;

  init_token (tp);
  tp->with_escape = false;

  multiple_escape_flag = false;
  if (first)
    curr = *first;
  else
    read_char_syntax (&curr);

  for (;; read_char_syntax (&curr))
    {
      switch (curr.scode)
	{
	case syntax_illegal:
	  /* Invalid input.  Be tolerant, no error message.  */
	  do_ungetc (curr.ch);
	  return;

	case syntax_single_esc:
	  tp->with_escape = true;
	  read_char_syntax (&curr);
	  if (curr.scode == syntax_eof)
	    /* Invalid input.  Be tolerant, no error message.  */
	    return;
	  grow_token (tp);
	  tp->chars[tp->charcount].ch = curr.ch;
	  tp->chars[tp->charcount].attribute = a_escaped;
	  tp->charcount++;
	  break;

	case syntax_multi_esc:
	  multiple_escape_flag = !multiple_escape_flag;
	  tp->with_escape = true;
	  break;

	case syntax_constituent:
	case syntax_nt_macro:
	  grow_token (tp);
	  if (multiple_escape_flag)
	    {
	      tp->chars[tp->charcount].ch = curr.ch;
	      tp->chars[tp->charcount].attribute = a_escaped;
	      tp->charcount++;
	    }
	  else
	    {
	      tp->chars[tp->charcount].ch = curr.ch;
	      tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
	      tp->charcount++;
	    }
	  break;

	case syntax_whitespace:
	case syntax_t_macro:
	  if (multiple_escape_flag)
	    {
	      grow_token (tp);
	      tp->chars[tp->charcount].ch = curr.ch;
	      tp->chars[tp->charcount].attribute = a_escaped;
	      tp->charcount++;
	    }
	  else
	    {
	      if (curr.scode != syntax_whitespace || read_preserve_whitespace)
		do_ungetc (curr.ch);
	      return;
	    }
	  break;

	case syntax_eof:
	  if (multiple_escape_flag)
	    /* Invalid input.  Be tolerant, no error message.  */
	    ;
	  return;
	}
    }
}

/* A potential number is a token which
   1. consists only of digits, '+','-','/','^','_','.' and number markers.
      The base for digits is context dependent, but always 10 if a dot '.'
      occurs. A number marker is a non-digit letter which is not adjacent
      to a non-digit letter.
   2. has at least one digit.
   3. starts with a digit, '+','-','.','^' or '_'.
   4. does not end with '+' or '-'.
   See CLHS 2.3.1.1 "Potential Numbers as Tokens".
 */

static inline bool
has_a_dot (const struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (tp->chars[i].attribute == a_dot)
      return true;
  return false;
}

static inline bool
all_a_number (const struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (!is_number_attribute (tp->chars[i].attribute))
      return false;
  return true;
}

static inline void
a_letter_to_digit (const struct token *tp, int base)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (is_letter_attribute (tp->chars[i].attribute))
      {
	int c = tp->chars[i].ch;

	if (c >= 'a')
	  c -= 'a' - 'A';
	if (c - 'A' + 10 < base)
	  tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
					  a_expo -> a_expodigit */
      }
}

static inline bool
has_a_digit (const struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (tp->chars[i].attribute == a_digit
	|| tp->chars[i].attribute == a_letterdigit
	|| tp->chars[i].attribute == a_expodigit)
      return true;
  return false;
}

static inline bool
has_adjacent_letters (const struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 1; i < n; i++)
    if (is_letter_attribute (tp->chars[i-1].attribute)
	&& is_letter_attribute (tp->chars[i].attribute))
      return true;
  return false;
}

static bool
is_potential_number (const struct token *tp, int *basep)
{
  /* CLHS 2.3.1.1.1:
     "A potential number cannot contain any escape characters."  */
  if (tp->with_escape)
    return false;

  if (has_a_dot (tp))
    *basep = 10;

  if (!all_a_number (tp))
    return false;

  a_letter_to_digit (tp, *basep);

  if (!has_a_digit (tp))
    return false;

  if (has_adjacent_letters (tp))
    return false;

  if (!(tp->chars[0].attribute >= a_dot
	&& tp->chars[0].attribute <= a_expodigit))
    return false;

  if (tp->chars[tp->charcount - 1].attribute == a_sign)
    return false;

  return true;
}

/* A number is one of integer, ratio, float.  Each has a particular syntax.
   See CLHS 2.3.1 "Numbers as Tokens".
   But note a mistake: The exponent rule should read:
       exponent ::= exponent-marker [sign] {decimal-digit}+
   (see 22.1.3.1.3 "Printing Floats").  */

enum number_type
{
  n_none,
  n_integer,
  n_ratio,
  n_float
};

static enum number_type
is_number (const struct token *tp, int *basep)
{
  struct token_char *ptr_limit;
  struct token_char *ptr1;

  if (!is_potential_number (tp, basep))
    return n_none;

  /* is_potential_number guarantees
     - all attributes are >= a_ratio,
     - there is at least one a_digit or a_letterdigit or a_expodigit, and
     - if there is an a_dot, then *basep = 10.  */

  ptr1 = &tp->chars[0];
  ptr_limit = &tp->chars[tp->charcount];

  if (ptr1->attribute == a_sign)
    ptr1++;

  /* Test for syntax
   * { a_sign | }
   * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
   */
  {
    bool seen_a_ratio = false;
    bool seen_a_digit = false;	/* seen a digit in last digit block? */
    struct token_char *ptr;

    for (ptr = ptr1;; ptr++)
      {
	if (ptr >= ptr_limit)
	  {
	    if (!seen_a_digit)
	      break;
	    if (seen_a_ratio)
	      return n_ratio;
	    else
	      return n_integer;
	  }
	if (ptr->attribute == a_digit
	    || ptr->attribute == a_letterdigit
	    || ptr->attribute == a_expodigit)
	  {
	    int c = ptr->ch;

	    c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
	    if (c >= *basep)
	      break;
	    seen_a_digit = true;
	  }
	else if (ptr->attribute == a_ratio)
	  {
	    if (seen_a_ratio || !seen_a_digit)
	      break;
	    seen_a_ratio = true;
	    seen_a_digit = false;
	  }
	else
	  break;
      }
  }

  /* Test for syntax
   * { a_sign | }
   * { a_digit }* { a_dot { a_digit }* | }
   * { a_expo { a_sign | } { a_digit }+ | }
   *
   * If there is an exponent part, there must be digits before the dot or
   * after the dot. The result is a float.
   * If there is no exponen:
   *   If there is no dot, it would an integer in base 10, but is has already
   *   been verified to not be an integer in the current base.
   *   If there is a dot:
   *     If there are digits after the dot, it's a float.
   *     Otherwise, if there are digits before the dot, it's an integer.
   */
  *basep = 10;
  {
    bool seen_a_dot = false;
    bool seen_a_dot_with_leading_digits = false;
    bool seen_a_digit = false;	/* seen a digit in last digit block? */
    struct token_char *ptr;

    for (ptr = ptr1;; ptr++)
      {
	if (ptr >= ptr_limit)
	  {
	    /* no exponent */
	    if (!seen_a_dot)
	      return n_none;
	    if (seen_a_digit)
	      return n_float;
	    if (seen_a_dot_with_leading_digits)
	      return n_integer;
	    else
	      return n_none;
	  }
	if (ptr->attribute == a_digit)
	  {
	    seen_a_digit = true;
	  }
	else if (ptr->attribute == a_dot)
	  {
	    if (seen_a_dot)
	      return n_none;
	    seen_a_dot = true;
	    if (seen_a_digit)
	      seen_a_dot_with_leading_digits = true;
	    seen_a_digit = false;
	  }
	else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
	  break;
	else
	  return n_none;
      }
    ptr++;
    if (!seen_a_dot_with_leading_digits || !seen_a_digit)
      return n_none;
    if (ptr >= ptr_limit)
      return n_none;
    if (ptr->attribute == a_sign)
      ptr++;
    seen_a_digit = false;
    for (;; ptr++)
      {
	if (ptr >= ptr_limit)
	  break;
	if (ptr->attribute != a_digit)
	  return n_none;
	seen_a_digit = true;
      }
    if (!seen_a_digit)
      return n_none;
    return n_float;
  }
}

/* A token representing a symbol must be case converted.
   For portability, we convert only ASCII characters here.  */

static void
upcase_token (struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (tp->chars[i].attribute != a_escaped)
      {
	unsigned char c = tp->chars[i].ch;
	if (c >= 'a' && c <= 'z')
	  tp->chars[i].ch = c - 'a' + 'A';
      }
}

static void
downcase_token (struct token *tp)
{
  int n = tp->charcount;
  int i;

  for (i = 0; i < n; i++)
    if (tp->chars[i].attribute != a_escaped)
      {
	unsigned char c = tp->chars[i].ch;
	if (c >= 'A' && c <= 'Z')
	  tp->chars[i].ch = c - 'A' + 'a';
      }
}

static void
case_convert_token (struct token *tp)
{
  int n = tp->charcount;
  int i;

  switch (readtable_case)
    {
    case case_upcase:
      upcase_token (tp);
      break;

    case case_downcase:
      downcase_token (tp);
      break;

    case case_preserve:
      break;

    case case_invert:
      {
	bool seen_uppercase = false;
	bool seen_lowercase = false;
	for (i = 0; i < n; i++)
	  if (tp->chars[i].attribute != a_escaped)
	    {
	      unsigned char c = tp->chars[i].ch;
	      if (c >= 'a' && c <= 'z')
		seen_lowercase = true;
	      if (c >= 'A' && c <= 'Z')
		seen_uppercase = true;
	    }
	if (seen_uppercase)
	  {
	    if (!seen_lowercase)
	      downcase_token (tp);
	  }
	else
	  {
	    if (seen_lowercase)
	      upcase_token (tp);
	  }
      }
      break;
    }
}


/* ========================= Accumulating comments ========================= */


static char *buffer;
static size_t bufmax;
static size_t buflen;

static inline void
comment_start ()
{
  buflen = 0;
}

static inline void
comment_add (int c)
{
  if (buflen >= bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
  buffer[buflen++] = c;
}

static inline void
comment_line_end (size_t chars_to_remove)
{
  buflen -= chars_to_remove;
  while (buflen >= 1
	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
    --buflen;
  if (chars_to_remove == 0 && buflen >= bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
  buffer[buflen] = '\0';
  savable_comment_add (buffer);
}


/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;


/* ========================= Accumulating messages ========================= */


static message_list_ty *mlp;


/* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */


/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
   Other objects need not to be represented precisely.  */
enum object_type
{
  t_symbol,	/* symbol */
  t_string,	/* string */
  t_other,	/* other kind of real object */
  t_dot,	/* '.' pseudo object */
  t_close,	/* ')' pseudo object */
  t_eof		/* EOF marker */
};

struct object
{
  enum object_type type;
  struct token *token;		/* for t_symbol and t_string */
  int line_number_at_start;	/* for t_string */
};

/* Free the memory pointed to by a 'struct object'.  */
static inline void
free_object (struct object *op)
{
  if (op->type == t_symbol || op->type == t_string)
    {
      free_token (op->token);
      free (op->token);
    }
}

/* Convert a t_symbol/t_string token to a char*.  */
static char *
string_of_object (const struct object *op)
{
  char *str;
  const struct token_char *p;
  char *q;
  int n;

  if (!(op->type == t_symbol || op->type == t_string))
    abort ();
  n = op->token->charcount;
  str = (char *) xmalloc (n + 1);
  q = str;
  for (p = op->token->chars; n > 0; p++, n--)
    *q++ = p->ch;
  *q = '\0';
  return str;
}

/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;

/* Read the next object.  */
static void
read_object (struct object *op, flag_context_ty outer_context)
{
  for (;;)
    {
      struct char_syntax curr;

      read_char_syntax (&curr);

      switch (curr.scode)
	{
	case syntax_eof:
	  op->type = t_eof;
	  return;

	case syntax_whitespace:
	  if (curr.ch == '\n')
	    /* Comments assumed to be grouped with a message must immediately
	       precede it, with no non-whitespace token on a line between
	       both.  */
	    if (last_non_comment_line > last_comment_line)
	      savable_comment_reset ();
	  continue;

	case syntax_illegal:
	  op->type = t_other;
	  return;

	case syntax_single_esc:
	case syntax_multi_esc:
	case syntax_constituent:
	  /* Start reading a token.  */
	  op->token = (struct token *) xmalloc (sizeof (struct token));
	  read_token (op->token, &curr);
	  last_non_comment_line = line_number;

	  /* Interpret the token.  */

	  /* Dots.  */
	  if (!op->token->with_escape
	      && op->token->charcount == 1
	      && op->token->chars[0].attribute == a_dot)
	    {
	      free_token (op->token);
	      free (op->token);
	      op->type = t_dot;
	      return;
	    }
	  /* Tokens consisting entirely of dots are illegal, but be tolerant
	     here.  */

	  /* Number.  */
	  {
	    int base = read_base;

	    if (is_number (op->token, &base) != n_none)
	      {
		free_token (op->token);
		free (op->token);
		op->type = t_other;
		return;
	      }
	  }

	  /* We interpret all other tokens as symbols (including 'reserved
	     tokens', i.e. potential numbers which are not numbers).  */
	  case_convert_token (op->token);
	  op->type = t_symbol;
	  return;

	case syntax_t_macro:
	case syntax_nt_macro:
	  /* Read a macro.  */
	  switch (curr.ch)
	    {
	    case '(':
	      {
		int arg = 0;		/* Current argument number.  */
		flag_context_list_iterator_ty context_iter;
		const struct callshapes *shapes = NULL;
		struct arglist_parser *argparser = NULL;

		for (;; arg++)
		  {
		    struct object inner;
		    flag_context_ty inner_context;

		    if (arg == 0)
		      inner_context = null_context;
		    else
		      inner_context =
			inherited_context (outer_context,
					   flag_context_list_iterator_advance (
					     &context_iter));

		    read_object (&inner, inner_context);

		    /* Recognize end of list.  */
		    if (inner.type == t_close)
		      {
			op->type = t_other;
			/* Don't bother converting "()" to "NIL".  */
			last_non_comment_line = line_number;
			if (argparser != NULL)
			  arglist_parser_done (argparser, arg);
			return;
		      }

		    /* Dots are not allowed in every position.
		       But be tolerant.  */

		    /* EOF inside list is illegal.
		       But be tolerant.  */
		    if (inner.type == t_eof)
		      break;

		    if (arg == 0)
		      {
			/* This is the function position.  */
			if (inner.type == t_symbol)
			  {
			    char *symbol_name = string_of_object (&inner);
			    int i;
			    int prefix_len;
			    void *keyword_value;

			    /* Omit any package name.  */
			    i = inner.token->charcount;
			    while (i > 0
				   && inner.token->chars[i-1].attribute != a_pack_m)
			      i--;
			    prefix_len = i;

			    if (hash_find_entry (&keywords,
						 symbol_name + prefix_len,
						 strlen (symbol_name + prefix_len),
						 &keyword_value)
				== 0)
			      shapes = (const struct callshapes *) keyword_value;

			    argparser = arglist_parser_alloc (mlp, shapes);

			    context_iter =
			      flag_context_list_iterator (
				flag_context_list_table_lookup (
				  flag_context_list_table,
				  symbol_name, strlen (symbol_name)));

			    free (symbol_name);
			  }
			else
			  context_iter = null_context_list_iterator;
		      }
		    else
		      {
			/* These are the argument positions.  */
			if (argparser != NULL && inner.type == t_string)
			  arglist_parser_remember (argparser, arg,
						   string_of_object (&inner),
						   inner_context,
						   logical_file_name,
						   inner.line_number_at_start,
						   savable_comment);
		      }

		    free_object (&inner);
		  }

		if (argparser != NULL)
		  arglist_parser_done (argparser, arg);
	      }
	      op->type = t_other;
	      last_non_comment_line = line_number;
	      return;

	    case ')':
	      /* Tell the caller about the end of list.
		 Unmatched closing parenthesis is illegal.
		 But be tolerant.  */
	      op->type = t_close;
	      last_non_comment_line = line_number;
	      return;

	    case ',':
	      {
		int c = do_getc ();
		/* The ,@ handling inside lists is wrong anyway, because
		   ,@form expands to an unknown number of elements.  */
		if (c != EOF && c != '@' && c != '.')
		  do_ungetc (c);
	      }
	      /*FALLTHROUGH*/
	    case '\'':
	    case '`':
	      {
		struct object inner;

		read_object (&inner, null_context);

		/* Dots and EOF are not allowed here.  But be tolerant.  */

		free_object (&inner);

		op->type = t_other;
		last_non_comment_line = line_number;
		return;
	      }

	    case ';':
	      {
		bool all_semicolons = true;

		last_comment_line = line_number;
		comment_start ();
		for (;;)
		  {
		    int c = do_getc ();
		    if (c == EOF || c == '\n')
		      break;
		    if (c != ';')
		      all_semicolons = false;
		    if (!all_semicolons)
		      {
			/* We skip all leading white space, but not EOLs.  */
			if (!(buflen == 0 && (c == ' ' || c == '\t')))
			  comment_add (c);
		      }
		  }
		comment_line_end (0);
		continue;
	      }

	    case '"':
	      {
		op->token = (struct token *) xmalloc (sizeof (struct token));
		init_token (op->token);
		op->line_number_at_start = line_number;
		for (;;)
		  {
		    int c = do_getc ();
		    if (c == EOF)
		      /* Invalid input.  Be tolerant, no error message.  */
		      break;
		    if (c == '"')
		      break;
		    if (c == '\\') /* syntax_single_esc */
		      {
			c = do_getc ();
			if (c == EOF)
			  /* Invalid input.  Be tolerant, no error message.  */
			  break;
		      }
		    grow_token (op->token);
		    op->token->chars[op->token->charcount++].ch = c;
		  }
		op->type = t_string;

		if (extract_all)
		  {
		    lex_pos_ty pos;

		    pos.file_name = logical_file_name;
		    pos.line_number = op->line_number_at_start;
		    remember_a_message (mlp, NULL, string_of_object (op),
					null_context, &pos, savable_comment);
		  }
		last_non_comment_line = line_number;
		return;
	      }

	    case '#':
	      /* Dispatch macro handling.  */
	      {
		int c;

		for (;;)
		  {
		    c = do_getc ();
		    if (c == EOF)
		      /* Invalid input.  Be tolerant, no error message.  */
		      {
			op->type = t_other;
			return;
		      }
		    if (!(c >= '0' && c <= '9'))
		      break;
		  }

		switch (c)
		  {
		  case '(':
		  case '"':
		    do_ungetc (c);
		    /*FALLTHROUGH*/
		  case '\'':
		  case ':':
		  case '.':
		  case ',':
		  case 'A': case 'a':
		  case 'C': case 'c':
		  case 'P': case 'p':
		  case 'S': case 's':
		    {
		      struct object inner;
		      read_object (&inner, null_context);
		      /* Dots and EOF are not allowed here.
			 But be tolerant.  */
		      free_object (&inner);
		      op->type = t_other;
		      last_non_comment_line = line_number;
		      return;
		    }

		  case '|':
		    {
		      int depth = 0;
		      int c;

		      comment_start ();
		      c = do_getc ();
		      for (;;)
			{
			  if (c == EOF)
			    break;
			  if (c == '|')
			    {
			      c = do_getc ();
			      if (c == EOF)
				break;
			      if (c == '#')
				{
				  if (depth == 0)
				    {
				      comment_line_end (0);
				      break;
				    }
				  depth--;
				  comment_add ('|');
				  comment_add ('#');
				  c = do_getc ();
				}
			      else
				comment_add ('|');
			    }
			  else if (c == '#')
			    {
			      c = do_getc ();
			      if (c == EOF)
				break;
			      comment_add ('#');
			      if (c == '|')
				{
				  depth++;
				  comment_add ('|');
				  c = do_getc ();
				}
			    }
			  else
			    {
			      /* We skip all leading white space.  */
			      if (!(buflen == 0 && (c == ' ' || c == '\t')))
				comment_add (c);
			      if (c == '\n')
				{
				  comment_line_end (1);
				  comment_start ();
				}
			      c = do_getc ();
			    }
			}
		      if (c == EOF)
			{
			  /* EOF not allowed here.  But be tolerant.  */
			  op->type = t_eof;
			  return;
			}
		      last_comment_line = line_number;
		      continue;
		    }

		  case '\\':
		    {
		      struct token token;
		      struct char_syntax first;
		      first.ch = '\\';
		      first.scode = syntax_single_esc;
		      read_token (&token, &first);
		      free_token (&token);
		      op->type = t_other;
		      last_non_comment_line = line_number;
		      return;
		    }

		  case 'B': case 'b':
		  case 'O': case 'o':
		  case 'X': case 'x':
		  case 'R': case 'r':
		  case '*':
		    {
		      struct token token;
		      read_token (&token, NULL);
		      free_token (&token);
		      op->type = t_other;
		      last_non_comment_line = line_number;
		      return;
		    }

		  case '=':
		    /* Ignore read labels.  */
		    continue;

		  case '#':
		    /* Don't bother looking up the corresponding object.  */
		    op->type = t_other;
		    last_non_comment_line = line_number;
		    return;

		  case '+':
		  case '-':
		    /* Simply assume every feature expression is true.  */
		    {
		      struct object inner;
		      read_object (&inner, null_context);
		      /* Dots and EOF are not allowed here.
			 But be tolerant.  */
		      free_object (&inner);
		      continue;
		    }

		  default:
		    op->type = t_other;
		    last_non_comment_line = line_number;
		    return;
		  }
		/*NOTREACHED*/
		abort ();
	      }

	    default:
	      /*NOTREACHED*/
	      abort ();
	    }

	default:
	  /*NOTREACHED*/
	  abort ();
	}
    }
}


void
extract_lisp (FILE *f,
	      const char *real_filename, const char *logical_filename,
	      flag_context_list_table_ty *flag_table,
	      msgdomain_list_ty *mdlp)
{
  mlp = mdlp->item[0]->messages;

  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  last_comment_line = -1;
  last_non_comment_line = -1;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  When read_object returns
     due to an unbalanced closing parenthesis, just restart it.  */
  do
    {
      struct object toplevel_object;

      read_object (&toplevel_object, null_context);

      if (toplevel_object.type == t_eof)
	break;

      free_object (&toplevel_object);
    }
  while (!feof (fp));

  /* Close scanner.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}