gettext-tools/src/x-tcl.c

/* xgettext Tcl backend.
   Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "x-tcl.h"
#include "error.h"
#include "xalloc.h"
#include "exit.h"
#include "hash.h"
#include "c-ctype.h"
#include "po-charset.h"
#include "ucs4-utf8.h"
#include "gettext.h"

#define _(s) gettext(s)

#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


/* The Tcl syntax is defined in the Tcl.n manual page.
   Summary of Tcl syntax:
   Like sh syntax, except that `...` is replaced with [...]. In detail:
   - In a preprocessing pass, backslash-newline-anywhitespace is replaced
     with single space.
   - Input is broken into words, which are then subject to command
     substitution [...] , variable substitution $var, backslash substitution
     \escape.
   - Strings are enclosed in "..."; command substitution, variable
     substitution and backslash substitutions are performed here as well.
   - {...} is a string without substitutions.
   - The list of resulting words is split into commands by semicolon and
     newline.
   - '#' at the beginning of a command introduces a comment until end of line.
   The parser is implemented in tcl8.3.3/generic/tclParse.c.  */


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.  */
static bool extract_all = false;

static hash_table keywords;
static bool default_keywords = true;


void
x_tcl_extract_all ()
{
  extract_all = true;
}


void
x_tcl_keyword (const char *name)
{
  if (name == NULL)
    default_keywords = false;
  else
    {
      const char *end;
      struct callshape shape;

      if (keywords.table == NULL)
	hash_init (&keywords, 100);

      split_keywordspec (name, &end, &shape);

      /* The characters between name and end should form a valid Tcl
	 function name.  A leading "::" is redundant.  */
      if (end - name >= 2 && name[0] == ':' && name[1] == ':')
	name += 2;

      insert_keyword_callshape (&keywords, name, end - name, &shape);
    }
}

/* Finish initializing the keywords hash table.
   Called after argument processing, before each file is processed.  */
static void
init_keywords ()
{
  if (default_keywords)
    {
      /* When adding new keywords here, also update the documentation in
	 xgettext.texi!  */
      x_tcl_keyword ("::msgcat::mc");
      default_keywords = false;
    }
}

void
init_flag_table_tcl ()
{
  xgettext_record_flag ("::msgcat::mc:1:pass-tcl-format");
  xgettext_record_flag ("format:1:tcl-format");
}


/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;


/* Fetch the next character from the input file.  */
static int
do_getc ()
{
  int c = getc (fp);

  if (c == EOF)
    {
      if (ferror (fp))
	error (EXIT_FAILURE, errno, _("\
error while reading \"%s\""), real_file_name);
    }
  else if (c == '\n')
   line_number++;

  return c;
}

/* Put back the last fetched character, not EOF.  */
static void
do_ungetc (int c)
{
  if (c == '\n')
    line_number--;
  ungetc (c, fp);
}


/* Combine backslash followed by newline and additional whitespace to
   a single space.  */

/* An int that becomes a space when casted to 'unsigned char'.  */
#define BS_NL (UCHAR_MAX + 1 + ' ')

static int phase1_pushback[1];
static int phase1_pushback_length;

static int
phase1_getc ()
{
  int c;

  if (phase1_pushback_length)
    {
      c = phase1_pushback[--phase1_pushback_length];
      if (c == '\n' || c == BS_NL)
	++line_number;
      return c;
    }
  c = do_getc ();
  if (c != '\\')
    return c;
  c = do_getc ();
  if (c != '\n')
    {
      if (c != EOF)
	do_ungetc (c);
      return '\\';
    }
  for (;;)
    {
      c = do_getc ();
      if (!(c == ' ' || c == '\t'))
	break;
    }
  if (c != EOF)
    do_ungetc (c);
  return BS_NL;
}

/* Supports only one pushback character.  */
static void
phase1_ungetc (int c)
{
  switch (c)
    {
    case EOF:
      break;

    case '\n':
    case BS_NL:
      --line_number;
      /* FALLTHROUGH */

    default:
      if (phase1_pushback_length == SIZEOF (phase1_pushback))
	abort ();
      phase1_pushback[phase1_pushback_length++] = c;
      break;
    }
}


/* Keep track of brace nesting depth.
   When a word starts with an opening brace, a character group begins that
   ends with the corresponding closing brace.  In theory these character
   groups are string literals, but they are used by so many Tcl primitives
   (proc, if, ...) as representing command lists, that we treat them as
   command lists.  */

/* An int that becomes a closing brace when casted to 'unsigned char'.  */
#define CL_BRACE (UCHAR_MAX + 1 + '}')

static int phase2_pushback[2];
static int phase2_pushback_length;

/* Brace nesting depth inside the current character group.  */
static int brace_depth;

static int
phase2_push ()
{
  int previous_depth = brace_depth;
  brace_depth = 1;
  return previous_depth;
}

static void
phase2_pop (int previous_depth)
{
  brace_depth = previous_depth;
}

static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    {
      c = phase2_pushback[--phase2_pushback_length];
      if (c == '\n' || c == BS_NL)
	++line_number;
      else if (c == '{')
	++brace_depth;
      else if (c == '}')
	--brace_depth;
      return c;
    }
  c = phase1_getc ();
  if (c == '{')
    ++brace_depth;
  else if (c == '}')
    {
      if (--brace_depth == 0)
	c = CL_BRACE;
    }
  return c;
}

/* Supports 2 characters of pushback.  */
static void
phase2_ungetc (int c)
{
  if (c != EOF)
    {
      switch (c)
	{
	case '\n':
	case BS_NL:
	  --line_number;
	  break;

	case '{':
	  --brace_depth;
	  break;

	case '}':
	  ++brace_depth;
	  break;
	}
      if (phase2_pushback_length == SIZEOF (phase2_pushback))
	abort ();
      phase2_pushback[phase2_pushback_length++] = c;
    }
}


/* ========================== Reading of tokens.  ========================== */


/* A token consists of a sequence of characters.  */
struct token
{
  int allocated;		/* number of allocated 'token_char's */
  int charcount;		/* number of used 'token_char's */
  char *chars;			/* the token's constituents */
};

/* Initialize a 'struct token'.  */
static inline void
init_token (struct token *tp)
{
  tp->allocated = 10;
  tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
  tp->charcount = 0;
}

/* Free the memory pointed to by a 'struct token'.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
}

/* Ensure there is enough room in the token for one more character.  */
static inline void
grow_token (struct token *tp)
{
  if (tp->charcount == tp->allocated)
    {
      tp->allocated *= 2;
      tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
    }
}


/* ========================= Accumulating comments ========================= */


static char *buffer;
static size_t bufmax;
static size_t buflen;

static inline void
comment_start ()
{
  buflen = 0;
}

static inline void
comment_add (int c)
{
  if (buflen >= bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
  buffer[buflen++] = c;
}

static inline void
comment_line_end ()
{
  while (buflen >= 1
	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
    --buflen;
  if (buflen >= bufmax)
    {
      bufmax = 2 * bufmax + 10;
      buffer = xrealloc (buffer, bufmax);
    }
  buffer[buflen] = '\0';
  savable_comment_add (buffer);
}


/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;


/* ========================= Accumulating messages ========================= */


static message_list_ty *mlp;


/* ========================== Reading of commands ========================== */


/* We are only interested in constant strings (e.g. "msgcat::mc" or other
   string literals).  Other words need not to be represented precisely.  */
enum word_type
{
  t_string,	/* constant string */
  t_other,	/* other string */
  t_separator,	/* command separator: semicolon or newline */
  t_bracket,	/* ']' pseudo word */
  t_brace,	/* '}' pseudo word */
  t_eof		/* EOF marker */
};

struct word
{
  enum word_type type;
  struct token *token;		/* for t_string */
  int line_number_at_start;	/* for t_string */
};

/* Free the memory pointed to by a 'struct word'.  */
static inline void
free_word (struct word *wp)
{
  if (wp->type == t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
}

/* Convert a t_string token to a char*.  */
static char *
string_of_word (const struct word *wp)
{
  char *str;
  int n;

  if (!(wp->type == t_string))
    abort ();
  n = wp->token->charcount;
  str = (char *) xmalloc (n + 1);
  memcpy (str, wp->token->chars, n);
  str[n] = '\0';
  return str;
}


/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Read an escape sequence.  The value is an ISO-8859-1 character (in the
   range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff).  */
static int
do_getc_escaped ()
{
  int c;

  c = phase1_getc ();
  switch (c)
    {
    case EOF:
      return '\\';
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';
    case 'x':
      {
	int n = 0;
	unsigned int i;

	for (i = 0;; i++)
	  {
	    c = phase1_getc ();
	    if (c == EOF || !c_isxdigit ((unsigned char) c))
	      break;

	    if (c >= '0' && c <= '9')
	      n = (n << 4) + (c - '0');
	    else if (c >= 'A' && c <= 'F')
	      n = (n << 4) + (c - 'A' + 10);
	    else if (c >= 'a' && c <= 'f')
	      n = (n << 4) + (c - 'a' + 10);
	  }
	phase1_ungetc (c);
	return (i > 0 ? (unsigned char) n : 'x');
      }
    case 'u':
      {
	int n = 0;
	unsigned int i;

	for (i = 0; i < 4; i++)
	  {
	    c = phase1_getc ();
	    if (c == EOF || !c_isxdigit ((unsigned char) c))
	      break;

	    if (c >= '0' && c <= '9')
	      n = (n << 4) + (c - '0');
	    else if (c >= 'A' && c <= 'F')
	      n = (n << 4) + (c - 'A' + 10);
	    else if (c >= 'a' && c <= 'f')
	      n = (n << 4) + (c - 'a' + 10);
	  }
	phase1_ungetc (c);
	return (i > 0 ? n : 'u');
      }
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7':
      {
	int n = c - '0';

	c = phase1_getc ();
	if (c != EOF)
	  {
	    if (c >= '0' && c <= '7')
	      {
		n = (n << 3) + (c - '0');
		c = phase1_getc ();
		if (c != EOF)
		  {
		    if (c >= '0' && c <= '7')
		      n = (n << 3) + (c - '0');
		    else
		      phase1_ungetc (c);
		  }
	      }
	    else
	      phase1_ungetc (c);
	  }
	return (unsigned char) n;
      }
    default:
      /* Note: If c is non-ASCII, Tcl's behaviour is undefined here.  */
      return (unsigned char) c;
    }
}


enum terminator
{
  te_space_separator,		/* looking for space semicolon newline */
  te_space_separator_bracket,	/* looking for space semicolon newline ']' */
  te_paren,			/* looking for ')' */
  te_quote			/* looking for '"' */
};

/* Forward declaration of local functions.  */
static enum word_type read_command_list (int looking_for,
					 flag_context_ty outer_context);

/* Accumulate tokens into the given word.
   'looking_for' denotes a parse terminator combination.
   Return the first character past the token.  */
static int
accumulate_word (struct word *wp, enum terminator looking_for,
		 flag_context_ty context)
{
  int c;

  for (;;)
    {
      c = phase2_getc ();

      if (c == EOF || c == CL_BRACE)
	return c;
      if ((looking_for == te_space_separator
	   || looking_for == te_space_separator_bracket)
	  && (c == ' ' || c == BS_NL
	      || c == '\t' || c == '\v' || c == '\f' || c == '\r'
	      || c == ';' || c == '\n'))
	return c;
      if (looking_for == te_space_separator_bracket && c == ']')
	return c;
      if (looking_for == te_paren && c == ')')
	return c;
      if (looking_for == te_quote && c == '"')
	return c;

      if (c == '$')
	{
	  /* Distinguish $varname, ${varname} and lone $.  */
	  c = phase2_getc ();
	  if (c == '{')
	    {
	      /* ${varname} */
	      do
		c = phase2_getc ();
	      while (c != EOF && c != '}');
	      wp->type = t_other;
	    }
	  else
	    {
	      bool nonempty = false;

	      for (; c != EOF && c != CL_BRACE; c = phase2_getc ())
		{
		  if (c_isalnum ((unsigned char) c) || (c == '_'))
		    {
		      nonempty = true;
		      continue;
		    }
		  if (c == ':')
		    {
		      c = phase2_getc ();
		      if (c == ':')
			{
			  do
			    c = phase2_getc ();
			  while (c == ':');

			  phase2_ungetc (c);
			  nonempty = true;
			  continue;
			}
		      phase2_ungetc (c);
		      c = ':';
		    }
		  break;
		}
	      if (c == '(')
		{
		  /* $varname(index) */
		  struct word index_word;

		  index_word.type = t_other;
		  c = accumulate_word (&index_word, te_paren, null_context);
		  if (c != EOF && c != ')')
		    phase2_ungetc (c);
		  wp->type = t_other;
		}
	      else
		{
		  phase2_ungetc (c);
		  if (nonempty)
		    {
		      /* $varname */
		      wp->type = t_other;
		    }
		  else
		    {
		      /* lone $ */
		      if (wp->type == t_string)
			{
			  grow_token (wp->token);
			  wp->token->chars[wp->token->charcount++] = '$';
			}
		    }
		}
	    }
	}
      else if (c == '[')
	{
	  read_command_list (']', context);
	  wp->type = t_other;
	}
      else if (c == '\\')
	{
	  unsigned int uc;
	  unsigned char utf8buf[6];
	  int count;
	  int i;

	  uc = do_getc_escaped ();
	  assert (uc < 0x10000);
	  count = u8_uctomb (utf8buf, uc, 6);
	  assert (count > 0);
	  if (wp->type == t_string)
	    for (i = 0; i < count; i++)
	      {
		grow_token (wp->token);
		wp->token->chars[wp->token->charcount++] = utf8buf[i];
	      }
	}
      else
	{
	  if (wp->type == t_string)
	    {
	      grow_token (wp->token);
	      wp->token->chars[wp->token->charcount++] = (unsigned char) c;
	    }
	}
    }
}


/* Read the next word.
   'looking_for' denotes a parse terminator, either ']' or '\0'.  */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
  int c;

  do
    c = phase2_getc ();
  while (c == ' ' || c == BS_NL
	 || c == '\t' || c == '\v' || c == '\f' || c == '\r');

  if (c == EOF)
    {
      wp->type = t_eof;
      return;
    }

  if (c == CL_BRACE)
    {
      wp->type = t_brace;
      last_non_comment_line = line_number;
      return;
    }

  if (c == '\n')
    {
      /* Comments assumed to be grouped with a message must immediately
	 precede it, with no non-whitespace token on a line between both.  */
      if (last_non_comment_line > last_comment_line)
	savable_comment_reset ();
      wp->type = t_separator;
      return;
    }

  if (c == ';')
    {
      wp->type = t_separator;
      last_non_comment_line = line_number;
      return;
    }

  if (looking_for == ']' && c == ']')
    {
      wp->type = t_bracket;
      last_non_comment_line = line_number;
      return;
    }

  if (c == '{')
    {
      int previous_depth;
      enum word_type terminator;

      /* Start a new nested character group, which lasts until the next
	 balanced '}' (ignoring \} things).  */
      previous_depth = phase2_push () - 1;

      /* Interpret it as a command list.  */
      terminator = read_command_list ('\0', null_context);

      if (terminator == t_brace)
	phase2_pop (previous_depth);

      wp->type = t_other;
      last_non_comment_line = line_number;
      return;
    }

  wp->type = t_string;
  wp->token = (struct token *) xmalloc (sizeof (struct token));
  init_token (wp->token);
  wp->line_number_at_start = line_number;

  if (c == '"')
    {
      c = accumulate_word (wp, te_quote, context);
      if (c != EOF && c != '"')
	phase2_ungetc (c);
    }
  else
    {
      phase2_ungetc (c);
      c = accumulate_word (wp,
			   looking_for == ']'
			   ? te_space_separator_bracket
			   : te_space_separator,
			   context);
      if (c != EOF)
	phase2_ungetc (c);
    }

  if (wp->type != t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
  last_non_comment_line = line_number;
}


/* Read the next command.
   'looking_for' denotes a parse terminator, either ']' or '\0'.
   Returns the type of the word that terminated the command: t_separator or
   t_bracket (only if looking_for is ']') or t_brace or t_eof.  */
static enum word_type
read_command (int looking_for, flag_context_ty outer_context)
{
  int c;

  /* Skip whitespace and comments.  */
  for (;;)
    {
      c = phase2_getc ();

      if (c == ' ' || c == BS_NL
	  || c == '\t' || c == '\v' || c == '\f' || c == '\r')
	continue;
      if (c == '#')
	{
	  /* Skip a comment up to end of line.  */
	  last_comment_line = line_number;
	  comment_start ();
	  for (;;)
	    {
	      c = phase2_getc ();
	      if (c == EOF || c == CL_BRACE || c == '\n')
		break;
	      /* We skip all leading white space, but not EOLs.  */
	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
		comment_add (c);
	    }
	  comment_line_end ();
	  continue;
	}
      break;
    }
  phase2_ungetc (c);

  /* Read the words that make up the command.  */
  {
    int arg = 0;		/* Current argument number.  */
    flag_context_list_iterator_ty context_iter;
    const struct callshapes *shapes = NULL;
    struct arglist_parser *argparser = NULL;

    for (;; arg++)
      {
	struct word inner;
	flag_context_ty inner_context;

	if (arg == 0)
	  inner_context = null_context;
	else
	  inner_context =
	    inherited_context (outer_context,
			       flag_context_list_iterator_advance (
				 &context_iter));

	read_word (&inner, looking_for, inner_context);

	/* Recognize end of command.  */
	if (inner.type == t_separator || inner.type == t_bracket
	    || inner.type == t_brace || inner.type == t_eof)
	  {
	    if (argparser != NULL)
	      arglist_parser_done (argparser, arg);
	    return inner.type;
	  }

	if (extract_all)
	  {
	    if (inner.type == t_string)
	      {
		lex_pos_ty pos;

		pos.file_name = logical_file_name;
		pos.line_number = inner.line_number_at_start;
		remember_a_message (mlp, NULL, string_of_word (&inner),
				    inner_context, &pos, savable_comment);
	      }
	  }

	if (arg == 0)
	  {
	    /* This is the function position.  */
	    if (inner.type == t_string)
	      {
		char *function_name = string_of_word (&inner);
		char *stripped_name;
		void *keyword_value;

		/* A leading "::" is redundant.  */
		stripped_name = function_name;
		if (function_name[0] == ':' && function_name[1] == ':')
		  stripped_name += 2;

		if (hash_find_entry (&keywords,
				     stripped_name, strlen (stripped_name),
				     &keyword_value)
		    == 0)
		  shapes = (const struct callshapes *) keyword_value;

		argparser = arglist_parser_alloc (mlp, shapes);

		context_iter =
		  flag_context_list_iterator (
		    flag_context_list_table_lookup (
		      flag_context_list_table,
		      stripped_name, strlen (stripped_name)));

		free (function_name);
	      }
	    else
	      context_iter = null_context_list_iterator;
	  }
	else
	  {
	    /* These are the argument positions.  */
	    if (argparser != NULL && inner.type == t_string)
	      arglist_parser_remember (argparser, arg,
				       string_of_word (&inner),
				       inner_context,
				       logical_file_name,
				       inner.line_number_at_start,
				       savable_comment);
	  }

	free_word (&inner);
      }
  }
}


/* Read a list of commands.
   'looking_for' denotes a parse terminator, either ']' or '\0'.
   Returns the type of the word that terminated the command list:
   t_bracket (only if looking_for is ']') or t_brace or t_eof.  */
static enum word_type
read_command_list (int looking_for, flag_context_ty outer_context)
{
  for (;;)
    {
      enum word_type terminator;

      terminator = read_command (looking_for, outer_context);
      if (terminator != t_separator)
	return terminator;
    }
}


void
extract_tcl (FILE *f,
	     const char *real_filename, const char *logical_filename,
	     flag_context_list_table_ty *flag_table,
	     msgdomain_list_ty *mdlp)
{
  mlp = mdlp->item[0]->messages;

  /* We convert our strings to UTF-8 encoding.  */
  xgettext_current_source_encoding = po_charset_utf8;

  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* Initially, no brace is open.  */
  brace_depth = 1000000;

  last_comment_line = -1;
  last_non_comment_line = -1;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  */
  read_command_list ('\0', null_context);

  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}