gettext-tools/src/x-awk.c

/* xgettext awk backend.
   Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "x-awk.h"
#include "error.h"
#include "error-progname.h"
#include "xalloc.h"
#include "exit.h"
#include "gettext.h"

#define _(s) gettext(s)


/* The awk syntax is defined in the gawk manual page and documentation.
   See also gawk/awkgram.y.  */


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.  Set by x_awk_extract_all(); checked for
   every plain string token in extract_parenthesized().  */
static bool extract_all = false;

/* Table of the keywords to look for.  It is filled by x_awk_keyword(),
   which also initializes it the first time a keyword is added.  */
static hash_table keywords;
/* True while the built-in keywords still have to be installed by
   init_keywords().  Cleared by x_awk_keyword (NULL), or by init_keywords()
   once the defaults have been installed.  */
static bool default_keywords = true;


/* Request that every string literal be extracted, not only the ones that
   appear as arguments of a registered keyword.  This only sets the
   file-level flag extract_all, which extract_parenthesized() consults for
   each plain string token.  */
void
x_awk_extract_all (void)
{
  extract_all = true;
}


/* Register the keyword specification NAME for the awk extractor.
   NAME == NULL adds nothing; it only turns off the built-in default
   keywords.  Otherwise NAME is handed to split_keywordspec() and its
   identifier part is inserted into the keywords table together with the
   call shape that was parsed.  */
void
x_awk_keyword (const char *name)
{
  const char *ident_end;
  struct callshape shape;
  const char *first_colon;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* The table is created lazily, when the first keyword arrives.  */
  if (keywords.table == NULL)
    hash_init (&keywords, 100);

  split_keywordspec (name, &ident_end, &shape);

  /* The characters between NAME and IDENT_END should form a valid C
     identifier.  A colon in that range means an invalid parse in
     split_keywordspec(); such a specification is dropped.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < ident_end)
    return;

  insert_keyword_callshape (&keywords, name, ident_end - name, &shape);
}

/* Finish initializing the keywords hash table by adding the built-in awk
   keywords, unless the defaults were disabled or are already installed.
   Called after argument processing, before each file is processed.  */
static void
init_keywords ()
{
  if (!default_keywords)
    return;

  /* When adding new keywords here, also update the documentation in
     xgettext.texi!  */
  x_awk_keyword ("dcgettext");
  x_awk_keyword ("dcngettext:1,2");

  /* The defaults must be installed only once.  */
  default_keywords = false;
}

/* Register the awk-related flag specifications by passing each of them,
   in order, to xgettext_record_flag().  */
void
init_flag_table_awk ()
{
  static const char *const flag_specs[] =
    {
      "dcgettext:1:pass-awk-format",
      "dcngettext:1:pass-awk-format",
      "dcngettext:2:pass-awk-format",
      "printf:1:awk-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}


/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   extract_awk() stores a copy of the logical file name here and starts
   line_number at 1; phase1_getc() and phase1_ungetc() keep line_number in
   step with the newlines that are read and pushed back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;

/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is the line on which the most recent comment
   started (set in phase2_getc()); last_non_comment_line is the line of the
   most recent token other than whitespace or EOF seen by x_awk_lex().
   Both are reset to -1 by extract_awk().  */
static int last_comment_line;
static int last_non_comment_line;


/* 1. line_number handling.  */

/* Read one character from the input stream fp and count the newlines in
   line_number.  Returns EOF at end of input.  A read error is reported
   against real_file_name through error() with EXIT_FAILURE status.  */
static int
phase1_getc ()
{
  int ch = getc (fp);

  if (ch != EOF)
    {
      if (ch == '\n')
        ++line_number;
      return ch;
    }

  if (ferror (fp))
    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
           real_file_name);

  return EOF;
}

/* Push C back onto the input stream, taking back the line count when C is
   a newline.  EOF is silently ignored.
   Supports only one pushback character.  */
static void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    line_number--;

  ungetc (c, fp);
}


/* 2. Replace each comment that is not inside a string literal or regular
   expression with a newline character.  The text of the comment, without
   its leading blanks and tabs, is handed to savable_comment_add(), because
   it may be attached to a keyword string later.  */

static int
phase2_getc ()
{
  /* Comment accumulator, kept and grown across calls.  */
  static char *comment_buf;
  static size_t comment_alloc;
  size_t comment_len;
  int start_line;
  int ch;

  ch = phase1_getc ();
  if (ch != '#')
    return ch;

  /* A comment runs up to, but does not include, the end of the line.  */
  start_line = line_number;
  comment_len = 0;
  for (ch = phase1_getc (); ch != '\n' && ch != EOF; ch = phase1_getc ())
    {
      /* We skip all leading white space, but not EOLs.  */
      if (comment_len == 0 && (ch == ' ' || ch == '\t'))
        continue;

      if (comment_len >= comment_alloc)
        {
          comment_alloc = 2 * comment_alloc + 10;
          comment_buf = xrealloc (comment_buf, comment_alloc);
        }
      comment_buf[comment_len++] = ch;
    }

  /* Make room for the terminating NUL.  */
  if (comment_len >= comment_alloc)
    {
      comment_alloc = 2 * comment_alloc + 10;
      comment_buf = xrealloc (comment_buf, comment_alloc);
    }
  comment_buf[comment_len] = '\0';

  savable_comment_add (comment_buf);
  last_comment_line = start_line;

  /* The newline or EOF that ended the comment stands in for it.  */
  return ch;
}

/* Push back a character obtained from phase2_getc() by forwarding it to
   phase1_ungetc().  EOF is ignored.
   Supports only one pushback character.  */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  phase1_ungetc (c);
}


/* ========================== Reading of tokens.  ========================== */


/* Kinds of token that x_awk_lex() produces.  */
enum token_type_ty
{
  token_type_eof,		/* end of input */
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_string,		/* "abc" */
  token_type_i18nstring,	/* _"abc" */
  token_type_symbol,		/* symbol, number */
  token_type_semicolon,		/* ; */
  token_type_other		/* regexp, misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A token as filled in by x_awk_lex().  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  /* For token_type_{symbol,string,i18nstring}: a copy of the token text
     made with xstrdup() in x_awk_lex(); free_token() releases it with
     free().  x_awk_lex() does not set it for the other token types.  */
  char *string;
  /* Value of the global line_number just before x_awk_lex() read the first
     character of the token.  */
  int line_number;
};


/* 7. Replace escape sequences within character strings with their
   single character equivalents.  */

/* Value returned by phase7_getc() for the unescaped '"' that ends a string
   literal.  It is distinct from the plain '"' that phase7_getc() returns
   for an escaped \", so the two cannot be confused by the caller.  */
#define P7_QUOTES (1000 + '"')

/* Read the next character of a string literal, with escape sequences
   already decoded.

   Returns:
     - P7_QUOTES for the unescaped '"' that terminates the literal;
     - the decoded character for \a \b \f \n \r \t \v, for octal escapes
       (1 to 3 digits) and for hexadecimal escapes (\x followed by any
       number of hex digits); octal and hexadecimal values are truncated
       to unsigned char;
     - the character itself for any other backslash escape (so \" yields a
       plain '"') and for ordinary characters.

   Backslash-newline is skipped.  If a newline or EOF is reached before the
   closing quote, that character is pushed back, an "unterminated string"
   warning is printed, and P7_QUOTES is returned so that the caller ends
   the literal.  */
static int
phase7_getc (void)
{
  int c;

  for (;;)
    {
      /* Use phase 1, because phase 2 elides comments.  */
      c = phase1_getc ();

      if (c == EOF || c == '\n')
        break;
      if (c == '"')
        return P7_QUOTES;
      if (c != '\\')
        return c;

      c = phase1_getc ();
      if (c == EOF)
        break;
      if (c == '\n')
        /* Backslash-newline: line continuation inside the string.  */
        continue;

      switch (c)
        {
        case 'a':
          return '\a';
        case 'b':
          return '\b';
        case 'f':
          return '\f';
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';
        case 'v':
          return '\v';
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7':
          {
            /* At most three octal digits, so N stays below 512.  */
            int n = c - '0';

            c = phase1_getc ();
            if (c != EOF)
              {
                if (c >= '0' && c <= '7')
                  {
                    n = (n << 3) + (c - '0');
                    c = phase1_getc ();
                    if (c != EOF)
                      {
                        if (c >= '0' && c <= '7')
                          n = (n << 3) + (c - '0');
                        else
                          phase1_ungetc (c);
                      }
                  }
                else
                  phase1_ungetc (c);
              }
            return (unsigned char) n;
          }
        case 'x':
          {
            /* The number of hex digits is unbounded.  Accumulate in an
               unsigned type so that a long run of digits wraps around
               instead of overflowing a signed int (undefined behavior);
               only the low 8 bits are returned anyway.  */
            unsigned int n = 0;

            for (;;)
              {
                c = phase1_getc ();
                if (c == EOF)
                  break;
                else if (c >= '0' && c <= '9')
                  n = (n << 4) + (unsigned int) (c - '0');
                else if (c >= 'A' && c <= 'F')
                  n = (n << 4) + (unsigned int) (c - 'A' + 10);
                else if (c >= 'a' && c <= 'f')
                  n = (n << 4) + (unsigned int) (c - 'a' + 10);
                else
                  {
                    phase1_ungetc (c);
                    break;
                  }
              }
            return (unsigned char) n;
          }
        default:
          /* Unknown escape: the character stands for itself.  */
          return c;
        }
    }

  /* Newline or EOF inside the literal.  Push it back (a no-op for EOF) so
     that the caller sees it again, and pretend the string was closed.  */
  phase1_ungetc (c);
  error_with_progname = false;
  error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name,
         line_number);
  error_with_progname = true;
  return P7_QUOTES;
}


/* Release the string owned by *TP.  Only symbol, string and i18nstring
   tokens carry one; nothing is done for the other token types.  */
static inline void
free_token (token_ty *tp)
{
  token_type_ty kind = tp->type;

  if (kind == token_type_string
      || kind == token_type_i18nstring
      || kind == token_type_symbol)
    free (tp->string);
}


/* Combine characters into tokens.  Discard whitespace.  */

/* There is an ambiguity about '/': It can start a division operator ('/' or
   '/=') or it can start a regular expression.  The distinction is important
   because inside regular expressions, '#' and '"' lose their special
   meanings.
   If you look at the awk grammar, you see that the operator is only allowed
   right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals
   can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals.
   So we prefer the division operator interpretation only right after
   symbol, string, number, ')', ']', with whitespace but no newline allowed
   in between.
   The flag is updated by x_awk_lex() for every token it returns, cleared
   again at each newline, and starts out false in extract_awk().  */
static bool prefer_division_over_regexp;

/* Read the next token from the input and store it in *TP.

   Whitespace, newlines (which also stand for comments, see phase2_getc())
   and backslashes outside of strings are skipped.  TP->line_number is the
   value of line_number just before the first character of the token was
   read.  For symbol, string and i18nstring tokens, TP->string is a fresh
   xstrdup() copy of the token text; for the other types it is not set.

   Side effects besides *TP: last_non_comment_line and
   prefer_division_over_regexp are updated, and savable_comment_reset() is
   called at a newline when the last non-comment token came after the last
   comment.  */
static void
x_awk_lex (token_ty *tp)
{
  /* Token text accumulator, kept and grown across calls.  Sizes are
     size_t, like in phase2_getc(), so that growing the buffer cannot
     overflow a signed int.  */
  static char *buffer;
  static size_t bufmax;
  size_t bufpos;
  int c;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase2_getc ();

      switch (c)
        {
        case EOF:
          tp->type = token_type_eof;
          return;

        case '\n':
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* Newline is not allowed inside expressions.  It usually
             introduces a fresh statement.
             FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
             do *not* introduce a fresh statement.  */
          prefer_division_over_regexp = false;
          /* FALLTHROUGH */
        case '\t':
        case ' ':
          /* Ignore whitespace and comments.  */
          continue;

        case '\\':
          /* Backslash ought to be immediately followed by a newline.  */
          continue;
        }

      last_non_comment_line = tp->line_number;

      switch (c)
        {
        case '.':
          {
            /* A '.' starts a number only if a digit follows; otherwise it
               is just some operator.  */
            int c2 = phase2_getc ();
            phase2_ungetc (c2);
            if (!(c2 >= '0' && c2 <= '9'))
              {
                tp->type = token_type_other;
                prefer_division_over_regexp = false;
                return;
              }
          }
          /* FALLTHROUGH */
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          /* Symbol, or part of a number.  */
          bufpos = 0;
          for (;;)
            {
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
              c = phase2_getc ();
              switch (c)
                {
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
                case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
                case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
                case 'Y': case 'Z':
                case '_':
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
                case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
                case 's': case 't': case 'u': case 'v': case 'w': case 'x':
                case 'y': case 'z':
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                  continue;
                default:
                  /* A lone '_' directly followed by '"' introduces an
                     internationalized string: _"abc".  */
                  if (bufpos == 1 && buffer[0] == '_' && c == '"')
                    {
                      tp->type = token_type_i18nstring;
                      goto case_string;
                    }
                  phase2_ungetc (c);
                  break;
                }
              break;
            }
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          tp->type = token_type_symbol;
          /* Most identifiers can be variable names; after them we must
             interpret '/' as division operator.  But for awk's builtin
             keywords we have three cases:
             (a) Must interpret '/' as division operator. "length".
             (b) Must interpret '/' as start of a regular expression.
                 "do", "exit", "print", "printf", "return".
             (c) '/' after this keyword is invalid anyway. All others.
             I used the following script for the distinction.
                for k in $awk_keywords; do
                  echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
                done
           */
          if (strcmp (buffer, "do") == 0
              || strcmp (buffer, "exit") == 0
              || strcmp (buffer, "print") == 0
              || strcmp (buffer, "printf") == 0
              || strcmp (buffer, "return") == 0)
            prefer_division_over_regexp = false;
          else
            prefer_division_over_regexp = true;
          return;

        case '"':
          tp->type = token_type_string;
        case_string:
          /* Collect the decoded characters up to the closing quote.  */
          bufpos = 0;
          for (;;)
            {
              c = phase7_getc ();
              if (c == EOF || c == P7_QUOTES)
                break;
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
            }
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          prefer_division_over_regexp = true;
          return;

        case '(':
          tp->type = token_type_lparen;
          prefer_division_over_regexp = false;
          return;

        case ')':
          tp->type = token_type_rparen;
          prefer_division_over_regexp = true;
          return;

        case ',':
          tp->type = token_type_comma;
          prefer_division_over_regexp = false;
          return;

        case ';':
          tp->type = token_type_semicolon;
          prefer_division_over_regexp = false;
          return;

        case ']':
          tp->type = token_type_other;
          prefer_division_over_regexp = true;
          return;

        case '/':
          if (!prefer_division_over_regexp)
            {
              /* Regular expression.
                 Counting brackets is non-trivial. [[] is balanced, and so is
                 [\]]. Also, /[/]/ is balanced and ends at the third slash.
                 Do not count [ or ] if either one is preceded by a \.
                 A '[' should be counted if
                  a) it is the first one so far (brackets == 0), or
                  b) it is the '[' in '[:'.
                 A ']' should be counted if not preceded by a \.
                 According to POSIX, []] is how you put a ] into a set.
                 Try to handle that too.
               */
              int brackets = 0;
              bool pos0 = true;         /* true at start of regexp */
              bool pos1_open = false;   /* true after [ at start of regexp */
              bool pos2_open_not = false; /* true after [^ at start of regexp */

              for (;;)
                {
                  /* Use phase 1: '#' does not start a comment here.  */
                  c = phase1_getc ();

                  if (c == EOF || c == '\n')
                    {
                      phase1_ungetc (c);
                      error_with_progname = false;
                      error (0, 0, _("%s:%d: warning: unterminated regular expression"),
                             logical_file_name, line_number);
                      error_with_progname = true;
                      break;
                    }
                  else if (c == '[')
                    {
                      if (brackets == 0)
                        brackets++;
                      else
                        {
                          c = phase1_getc ();
                          if (c == ':')
                            brackets++;
                          phase1_ungetc (c);
                        }
                      if (pos0)
                        {
                          pos0 = false;
                          pos1_open = true;
                          continue;
                        }
                    }
                  else if (c == ']')
                    {
                      if (!(pos1_open || pos2_open_not))
                        brackets--;
                    }
                  else if (c == '^')
                    {
                      if (pos1_open)
                        {
                          pos1_open = false;
                          pos2_open_not = true;
                          continue;
                        }
                    }
                  else if (c == '\\')
                    {
                      /* Skip the escaped character.  */
                      c = phase1_getc ();
                      /* Backslash-newline is valid and ignored.  */
                    }
                  else if (c == '/')
                    {
                      /* A slash ends the regexp unless it is inside a
                         bracket expression.  */
                      if (brackets <= 0)
                        break;
                    }

                  pos0 = false;
                  pos1_open = false;
                  pos2_open_not = false;
                }

              tp->type = token_type_other;
              prefer_division_over_regexp = false;
              return;
            }
          /* FALLTHROUGH */

        default:
          /* We could carefully recognize each of the 2 and 3 character
             operators, but it is not necessary, as we only need to recognize
             gettext invocations.  Don't bother.  */
          tp->type = token_type_other;
          prefer_division_over_regexp = false;
          return;
        }
    }
}


/* ========================= Extracting strings.  ========================== */


/* Context lookup table.  extract_awk() stores its flag_table argument
   here; extract_parenthesized() looks every symbol up in it to obtain the
   context iterator for a following argument list.  */
static flag_context_list_table_ty *flag_context_list_table;


/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid awk program, and leave the complaints about
   the grammar to the awk interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */


/* Extract messages until the next balanced closing parenthesis.
   Extracted messages are added to MLP.
   OUTER_CONTEXT is the flag context in effect outside this argument list;
   CONTEXT_ITER is advanced once at the start and once per comma, and each
   result is combined with OUTER_CONTEXT to give the context of the
   current argument.
   ARGPARSER receives the plain string arguments (unless extract_all is
   set) and is passed to arglist_parser_done() on every return path.
   Return true upon eof, false upon closing parenthesis.  */
static bool
extract_parenthesized (message_list_ty *mlp,
		       flag_context_ty outer_context,
		       flag_context_list_iterator_ty context_iter,
		       struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Whether to implicitly assume the next tokens are arguments even without
     a '('.  */
  bool next_is_argument = false;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
		       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      token_ty token;

      x_awk_lex (&token);

      if (next_is_argument && token.type != token_type_lparen)
	{
	  /* An argument list starts, even though there is no '('.  */
	  context_iter = next_context_iter;
	  outer_context = inner_context;
	  inner_context =
	    inherited_context (outer_context,
			       flag_context_list_iterator_advance (
				 &context_iter));
	}

      switch (token.type)
	{
	case token_type_symbol:
	  {
	    void *keyword_value;

	    /* A registered keyword puts us in state 1 and remembers its
	       call shapes for a '(' that may follow; any other symbol
	       resets the state.  */
	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
				 &keyword_value)
		== 0)
	      {
		next_shapes = (const struct callshapes *) keyword_value;
		state = 1;
	      }
	    else
	      state = 0;
	  }
	  /* "print" and "printf" are the symbols after which an argument
	     list is assumed even without a '('.  */
	  next_is_argument =
	    (strcmp (token.string, "print") == 0
	     || strcmp (token.string, "printf") == 0);
	  next_context_iter =
	    flag_context_list_iterator (
	      flag_context_list_table_lookup (
		flag_context_list_table,
		token.string, strlen (token.string)));
	  /* The symbol text is not needed any more.  */
	  free (token.string);
	  continue;

	case token_type_lparen:
	  /* Recurse for the parenthesized list.  The nested parser gets the
	     call shapes only when the '(' directly follows a keyword.  A
	     true result means eof was hit inside the parentheses.  */
	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
				     arglist_parser_alloc (mlp,
							   state ? next_shapes : NULL)))
	    {
	      arglist_parser_done (argparser, arg);
	      return true;
	    }
	  next_is_argument = false;
	  next_context_iter = null_context_list_iterator;
	  state = 0;
	  continue;

	case token_type_rparen:
	  /* End of the argument list handled by this invocation.  */
	  arglist_parser_done (argparser, arg);
	  return false;

	case token_type_comma:
	  /* Move on to the next argument and to its context.  */
	  arg++;
	  inner_context =
	    inherited_context (outer_context,
			       flag_context_list_iterator_advance (
				 &context_iter));
	  next_is_argument = false;
	  next_context_iter = passthrough_context_list_iterator;
	  state = 0;
	  continue;

	case token_type_string:
	  {
	    lex_pos_ty pos;
	    pos.file_name = logical_file_name;
	    pos.line_number = token.line_number;

	    /* With extract_all every string becomes a message right away;
	       otherwise the string is recorded as argument number ARG of
	       the enclosing call.  token.string is not freed here;
	       NOTE(review): presumably the callee takes ownership of it --
	       confirm against xgettext.h.  */
	    if (extract_all)
	      remember_a_message (mlp, NULL, token.string, inner_context, &pos,
				  savable_comment);
	    else
	      arglist_parser_remember (argparser, arg, token.string,
				       inner_context,
				       pos.file_name, pos.line_number,
				       savable_comment);
	  }
	  next_is_argument = false;
	  next_context_iter = null_context_list_iterator;
	  state = 0;
	  continue;

	case token_type_i18nstring:
	  {
	    lex_pos_ty pos;
	    pos.file_name = logical_file_name;
	    pos.line_number = token.line_number;

	    /* _"abc" is always a message, independently of keywords and of
	       extract_all.  */
	    remember_a_message (mlp, NULL, token.string, inner_context, &pos,
				savable_comment);
	  }
	  next_is_argument = false;
	  next_context_iter = null_context_list_iterator;
	  state = 0;
	  continue;

	case token_type_semicolon:
	  /* An argument list ends, and a new statement begins.  */
	  /* FIXME: Should handle newline that acts as statement separator
	     in the same way.  */
	  /* FIXME: Instead of resetting outer_context here, it may be better
	     to recurse in the next_is_argument handling above, waiting for
	     the next semicolon or other statement terminator.  */
	  outer_context = null_context;
	  context_iter = null_context_list_iterator;
	  next_is_argument = false;
	  next_context_iter = passthrough_context_list_iterator;
	  inner_context =
	    inherited_context (outer_context,
			       flag_context_list_iterator_advance (
				 &context_iter));
	  state = 0;
	  continue;

	case token_type_eof:
	  arglist_parser_done (argparser, arg);
	  return true;

	case token_type_other:
	  next_is_argument = false;
	  next_context_iter = null_context_list_iterator;
	  state = 0;
	  continue;

	default:
	  /* x_awk_lex() produces no other token type.  */
	  abort ();
	}
    }
}


/* Extract the messages of one awk source file.
   F is the open input stream.  REAL_FILENAME is used in messages about
   read errors, LOGICAL_FILENAME labels the extracted messages.  FLAG_TABLE
   is installed as the context lookup table.  The messages are added to the
   message list of the first item of MDLP.  */
void
extract_awk (FILE *f,
             const char *real_filename, const char *logical_filename,
             flag_context_list_table_ty *flag_table,
             msgdomain_list_ty *mdlp)
{
  message_list_ty *mlp = mdlp->item[0]->messages;
  bool at_eof;

  /* State of the character reader.  */
  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* State of the comment tracking and of the tokenizer.  */
  last_comment_line = -1;
  last_non_comment_line = -1;
  prefer_division_over_regexp = false;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  When extract_parenthesized returns
     due to an unbalanced closing parenthesis, just restart it with a
     fresh argument list parser.  */
  do
    at_eof = extract_parenthesized (mlp, null_context,
                                    null_context_list_iterator,
                                    arglist_parser_alloc (mlp, NULL));
  while (!at_eof);

  /* Forget the per-file reader state.  The copy of the logical file name
     is not freed here.  NOTE(review): presumably it is still referenced
     by the recorded message positions -- confirm before freeing.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}