gettext-tools/src/read-stringtable.c

/* Reading NeXTstep/GNUstep .strings files.
   Copyright (C) 2003, 2005-2007 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2003.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "read-stringtable.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "error.h"
#include "error-progname.h"
#include "read-catalog-abstract.h"
#include "xalloc.h"
#include "xvasprintf.h"
#include "po-xerror.h"
#include "unistr.h"
#include "gettext.h"

#define _(str) gettext (str)

/* The format of NeXTstep/GNUstep .strings files is documented in
     gnustep-base-1.8.0/Tools/make_strings/Using.txt
   and in the comments of method propertyListFromStringsFileFormat in
     gnustep-base-1.8.0/Source/NSString.m
   In summary, it's an Objective-C-like file with pseudo-assignments of the form
          "key" = "value";
   where the key is the msgid and the value is the msgstr.

   The implementation of the parser of .strings files is in
     gnustep-base-1.8.0/Source/NSString.m
     function GSPropertyListFromStringsFormat
     (indirectly called from NSBundle's method localizedStringForKey).

   A test case is in
     gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
 */

/* Handling of comments: We copy all comments from the .strings file to
   the PO file. This is not really needed; it's a service for translators
   who don't like PO files and prefer to maintain the .strings file.  */


/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* File name and line number.  */
extern lex_pos_ty gram_pos;

/* The input file stream.  */
static FILE *fp;


/* Phase 1: Read a byte.
   Max. 4 pushback characters.  */

static unsigned char phase1_pushback[4];
static int phase1_pushback_length;

static int
phase1_getc ()
{
  int c;

  if (phase1_pushback_length)
    return phase1_pushback[--phase1_pushback_length];

  c = getc (fp);

  if (c == EOF)
    {
      if (ferror (fp))
	{
	  const char *errno_description = strerror (errno);
	  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
		     xasprintf ("%s: %s",
				xasprintf (_("error while reading \"%s\""),
					   real_file_name),
				errno_description));
	}
      return EOF;
    }

  return c;
}

/* Push the byte C back, so that the next phase1_getc call returns it.
   EOF is not a byte and is silently ignored.  No check is made that the
   pushback array still has room.  */
static void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  phase1_pushback[phase1_pushback_length] = c;
  phase1_pushback_length++;
}


/* Phase 2: Read an UCS-4 character.
   Max. 2 pushback characters.  */

/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

static int phase2_pushback[4];
static int phase2_pushback_length;

/* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
   with a BOM!), or otherwise the locale-dependent default encoding is used.
   Since we don't want to depend on the locale here, we use ISO-8859-1
   instead.  */
enum enc
{
  /* Not examined yet; phase2_getc inspects the first bytes of the input on
     its first call and then switches to one of the values below.  */
  enc_undetermined,
  /* The input starts with the bytes 0xFE 0xFF.  */
  enc_ucs2be,
  /* The input starts with the bytes 0xFF 0xFE.  */
  enc_ucs2le,
  /* The input starts with the bytes 0xEF 0xBB 0xBF.  */
  enc_utf8,
  /* None of the above byte order marks was found (or the input is shorter
     than the mark); every byte is taken as one character.  */
  enc_iso8859_1
};
/* Encoding of the file being read.  stringtable_parse resets it to
   enc_undetermined before parsing.  */
static enum enc encoding;

/* Return the next character of the input as an integer code, or UEOF.
   Characters handed to phase2_ungetc are returned first.  On the first real
   read the encoding is detected from a leading byte order mark, which is
   consumed; without a mark the bytes read for the detection are pushed back
   and the input is treated as ISO-8859-1.  If the input ends in the middle
   of a multi-byte character, UEOF is returned.  */
static int
phase2_getc ()
{
  /* A first byte >= utf8_lead_min[k] makes the UTF-8 reader fetch byte
     number k + 1 of the sequence (provided the previous one, if any, was a
     continuation byte).  */
  static const unsigned char utf8_lead_min[5] =
    { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

  if (phase2_pushback_length != 0)
    {
      phase2_pushback_length--;
      return phase2_pushback[phase2_pushback_length];
    }

  if (encoding == enc_undetermined)
    {
      /* Determine the input file's encoding from its first bytes.  */
      int b0;
      int b1;

      b0 = phase1_getc ();
      if (b0 == EOF)
        /* Empty input: the encoding stays undetermined.  */
        return UEOF;
      b1 = phase1_getc ();

      if (b0 == 0xfe && b1 == 0xff)
        encoding = enc_ucs2be;
      else if (b0 == 0xff && b1 == 0xfe)
        encoding = enc_ucs2le;
      else
        {
          /* A third byte is only read when a second one exists.  */
          int b2 = (b1 != EOF ? phase1_getc () : EOF);

          if (b2 != EOF && b0 == 0xef && b1 == 0xbb && b2 == 0xbf)
            encoding = enc_utf8;
          else
            {
              /* No byte order mark.  Give the bytes back in reverse order;
                 phase1_ungetc ignores the EOF values.  */
              phase1_ungetc (b2);
              phase1_ungetc (b1);
              phase1_ungetc (b0);
              encoding = enc_iso8859_1;
            }
        }
    }

  switch (encoding)
    {
    case enc_ucs2be:
    case enc_ucs2le:
      /* Read an UCS-2 encoded character: two bytes, order per encoding.  */
      {
        int first;
        int second;

        first = phase1_getc ();
        if (first == EOF)
          return UEOF;
        second = phase1_getc ();
        if (second == EOF)
          return UEOF;

        if (encoding == enc_ucs2be)
          return (first << 8) + second;
        else
          return first + (second << 8);
      }

    case enc_utf8:
      /* Read an UTF-8 encoded character.  */
      {
        unsigned char buf[6];
        unsigned int count;
        unsigned int uc;
        unsigned int k;
        int c;

        c = phase1_getc ();
        if (c == EOF)
          return UEOF;
        buf[0] = c;
        count = 1;

        /* Fetch the following bytes one at a time, as long as the first
           byte asks for more and the byte just read is of the form
           10xxxxxx.  */
        for (k = 0; k < 5; k++)
          {
            if (buf[0] < utf8_lead_min[k])
              break;
            if (k > 0 && (buf[k] ^ 0x80) >= 0x40)
              break;

            c = phase1_getc ();
            if (c == EOF)
              return UEOF;
            buf[k + 1] = c;
            count = k + 2;
          }

        u8_mbtouc (&uc, buf, count);
        return uc;
      }

    case enc_iso8859_1:
      /* Read an ISO-8859-1 encoded character: the byte is the code.  */
      {
        int c = phase1_getc ();

        return (c == EOF ? UEOF : c);
      }

    default:
      abort ();
    }
}

/* Push the character C back, so that the next phase2_getc call returns it.
   UEOF is silently ignored.  */
static void
phase2_ungetc (int c)
{
  if (c == UEOF)
    return;

  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length += 1;
}


/* Phase 3: Read an UCS-4 character, with line number handling.  */

/* Return the next character from phase 2 (or UEOF), advancing
   gram_pos.line_number whenever a newline is read.  */
static int
phase3_getc ()
{
  const int uc = phase2_getc ();
  const bool at_line_end = (uc == '\n');

  if (at_line_end)
    ++gram_pos.line_number;

  return uc;
}

/* Push the character C back into phase 2.  For a newline the line number
   is decremented, cancelling the increment phase3_getc made when the
   newline was read.  */
static void
phase3_ungetc (int c)
{
  const bool was_line_end = (c == '\n');

  if (was_line_end)
    gram_pos.line_number -= 1;

  phase2_ungetc (c);
}


/* Convert from UCS-4 to UTF-8.
   BUFFER holds BUFLEN character codes (BUFFER may be NULL when BUFLEN is 0).
   Return a NUL-terminated string allocated with XNMALLOC; no reference to
   it is kept here, so the caller is responsible for it.
   A code that u8_uctomb refuses to encode (for instance a lone surrogate
   read from a UCS-2 file or written as a \uXXXX escape) is replaced by
   U+FFFD REPLACEMENT CHARACTER rather than aborting the program.  */
static char *
conv_from_ucs4 (const int *buffer, size_t buflen)
{
  unsigned char *utf8_string;
  unsigned char *q;
  size_t pos;

  /* Each UCS-4 word needs 6 bytes at worst.  */
  utf8_string = XNMALLOC (6 * buflen + 1, unsigned char);

  q = utf8_string;
  for (pos = 0; pos < buflen; pos++)
    {
      unsigned int uc = buffer[pos];
      int n = u8_uctomb (q, uc, 6);

      if (n <= 0)
        /* The input file is untrusted, so an unencodable code must not
           trip an assertion; and with NDEBUG, adding a negative N to Q
           would move the write position backwards.  U+FFFD takes 3 bytes,
           which fits in the 6 bytes reserved per character.  */
        n = u8_uctomb (q, 0xfffd, 6);
      assert (n > 0);
      q += n;
    }
  *q = '\0';
  assert ((size_t) (q - utf8_string) <= 6 * buflen);

  return (char *) utf8_string;
}


/* Decode a double-quoted string literal given as LENGTH character codes
   starting at STRING.
   The text must begin with a double-quote and contain a closing, unescaped
   double-quote; whatever follows the closing quote is not examined.
   Backslash escapes are decoded: one to three octal digits, 'u' or 'U'
   followed by up to four hexadecimal digits, and the letters a b t r n v f;
   any other escaped character stands for itself.
   Return the decoded string in UTF-8 (as produced by conv_from_ucs4), or
   NULL if the input is not of this form.  */
static char *
parse_escaped_string (const int *string, size_t length)
{
  static int *buffer;
  static size_t bufmax;
  static size_t buflen;
  const int *p = string;
  const int *const end = string + length;

  /* The opening quote is mandatory.  */
  if (p == end || *p != '"')
    return NULL;
  p++;

  buflen = 0;
  while (p != end)
    {
      int c = *p++;

      if (c == '"')
        /* Closing quote reached: the literal is complete.  */
        return conv_from_ucs4 (buffer, buflen);

      if (c == '\\')
        {
          /* A backslash must be followed by something.  */
          if (p == end)
            return NULL;
          c = *p++;

          if (c >= '0' && c <= '7')
            {
              /* Octal escape; C is its first digit, up to two more may
                 follow.  */
              unsigned int value = c - '0';
              int ndigits;

              for (ndigits = 1;
                   ndigits < 3 && p != end && *p >= '0' && *p <= '7';
                   ndigits++)
                value = value * 8 + (*p++ - '0');
              c = value;
            }
          else if (c == 'u' || c == 'U')
            {
              /* Hexadecimal escape with zero to four digits.  */
              unsigned int value = 0;
              int ndigits;

              for (ndigits = 0; ndigits < 4 && p != end; ndigits++)
                {
                  int digit = *p;

                  if (digit >= '0' && digit <= '9')
                    value = value * 16 + (digit - '0');
                  else if (digit >= 'A' && digit <= 'F')
                    value = value * 16 + (digit - 'A' + 10);
                  else if (digit >= 'a' && digit <= 'f')
                    value = value * 16 + (digit - 'a' + 10);
                  else
                    break;
                  p++;
                }
              c = value;
            }
          else
            {
              switch (c)
                {
                case 'a':
                  c = '\a';
                  break;
                case 'b':
                  c = '\b';
                  break;
                case 't':
                  c = '\t';
                  break;
                case 'r':
                  c = '\r';
                  break;
                case 'n':
                  c = '\n';
                  break;
                case 'v':
                  c = '\v';
                  break;
                case 'f':
                  c = '\f';
                  break;
                default:
                  /* Any other character is taken literally.  */
                  break;
                }
            }
        }

      /* Append C, growing the static buffer when it is full.  */
      if (buflen >= bufmax)
        {
          bufmax = 2 * bufmax + 10;
          buffer = xrealloc (buffer, bufmax * sizeof (int));
        }
      buffer[buflen++] = c;
    }

  /* Ran out of input before the closing quote.  */
  return NULL;
}


/* Accumulating flag comments.  */

static char *special_comment;

/* Throw away the flags accumulated so far, without reporting them.  */
static inline void
special_comment_reset ()
{
  /* free (NULL) does nothing, so no test for NULL is needed.  */
  free (special_comment);
  special_comment = NULL;
}

/* Add FLAG to the accumulated flags.  The first flag is stored as a copy;
   each further flag is appended after a ", " separator.  */
static void
special_comment_add (const char *flag)
{
  size_t old_len;
  size_t flag_len;

  if (special_comment == NULL)
    {
      special_comment = xstrdup (flag);
      return;
    }

  old_len = strlen (special_comment);
  flag_len = strlen (flag);

  /* Room for the old text, the two separator characters, FLAG and the
     terminating NUL.  */
  special_comment = xrealloc (special_comment, old_len + 2 + flag_len + 1);
  memcpy (special_comment + old_len, ", ", 2);
  memcpy (special_comment + old_len + 2, flag, flag_len + 1);
}

/* If any flags were accumulated, report them through
   po_callback_comment_special and then release them.  Does nothing when no
   flag is pending.  */
static inline void
special_comment_finish ()
{
  char *flags = special_comment;

  if (flags == NULL)
    return;

  po_callback_comment_special (flags);
  free (flags);
  special_comment = NULL;
}


/* Accumulating comments.  */

/* The characters of the comment line being accumulated, as character codes
   (one int each).  Grown on demand by comment_add.  */
static int *buffer;
/* Number of ints allocated for BUFFER.  */
static size_t bufmax;
/* Number of ints of BUFFER currently in use.  */
static size_t buflen;
/* Set by comment_line_end when a "Flag: unmatched" comment is seen; passed
   to po_callback_message as the last argument and cleared by
   stringtable_parse before each key/value pair.  */
static bool next_is_obsolete;
/* Set by comment_line_end when a "Flag: untranslated" comment is seen;
   cleared by stringtable_parse before each key/value pair.  */
static bool next_is_fuzzy;
/* The string decoded by comment_line_end from a comment of the form
   = "escaped string", or NULL if no such comment was seen for the current
   key/value pair.  */
static char *fuzzy_msgstr;
/* While true, a single-line C style comment is tested for being the fuzzy
   msgstr.  Toggled by stringtable_parse.  */
static bool expect_fuzzy_msgstr_as_c_comment;
/* While true, a C++ style comment is tested for being the fuzzy msgstr.
   Toggled by stringtable_parse.  */
static bool expect_fuzzy_msgstr_as_cxx_comment;

/* Begin a new comment line: mark the accumulation buffer as empty.  The
   memory already allocated for it is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}

/* Append the character C to the comment line being accumulated, enlarging
   the buffer first when it is full.  */
static inline void
comment_add (int c)
{
  const bool full = (buflen >= bufmax);

  if (full)
    {
      /* Roughly double the capacity; the + 10 covers the initial case of
         an empty buffer.  */
      const size_t grown = 2 * bufmax + 10;

      buffer = xrealloc (buffer, grown * sizeof (int));
      bufmax = grown;
    }

  buffer[buflen] = c;
  buflen++;
}

/* Finish the comment line accumulated in BUFFER / BUFLEN and dispatch it.
   CHARS_TO_REMOVE is the number of characters at the end of the buffer that
   belong to the comment syntax rather than to its text (the newline, or the
   trailing stars plus the slash); they are dropped first, followed by any
   trailing spaces and tabs.
   If TEST_FOR_FUZZY_MSGSTR is true and the line has the form
       = "escaped string"
   with an optional trailing semicolon, the decoded string is stored in
   FUZZY_MSGSTR and nothing else is done.
   Otherwise the line is converted to UTF-8 and classified:
     "Flag: untranslated"   adds the flag "fuzzy", sets NEXT_IS_FUZZY
     "Flag: unmatched"      sets NEXT_IS_OBSOLETE
     "Flag: <other>"        adds <other> to the accumulated flags
     "Comment: <text>"      po_callback_comment_dot (<text>)
     "File: <name>:<num>"   po_callback_comment_filepos (<name>, <num>)
     anything else          po_callback_comment (<line>)  */
static inline void
comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
{
  char *line;

  buflen -= chars_to_remove;
  /* Drop trailing white space, but not EOLs.  */
  while (buflen >= 1
         && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
    --buflen;

  /* At special positions we interpret a comment of the form
       = "escaped string"
     with an optional trailing semicolon as being the fuzzy msgstr, not a
     regular comment.  The length passed on excludes the leading "= " and
     the trailing semicolon, if there is one.  */
  if (test_for_fuzzy_msgstr
      && buflen > 2 && buffer[0] == '=' && buffer[1] == ' '
      && (fuzzy_msgstr =
          parse_escaped_string (buffer + 2,
                                buflen - (buffer[buflen - 1] == ';') - 2)))
    return;

  line = conv_from_ucs4 (buffer, buflen);

  if (strcmp (line, "Flag: untranslated") == 0)
    {
      special_comment_add ("fuzzy");
      next_is_fuzzy = true;
    }
  else if (strcmp (line, "Flag: unmatched") == 0)
    next_is_obsolete = true;
  else if (strncmp (line, "Flag: ", 6) == 0)
    special_comment_add (line + 6);
  else if (strncmp (line, "Comment: ", 9) == 0)
    /* A comment extracted from the source.  */
    po_callback_comment_dot (line + 9);
  else
    {
      char *last_colon;
      unsigned long number;
      char *endp;

      /* The text after the last colon must be non-empty and must be
         consumed entirely by strtoul.  */
      if (strncmp (line, "File: ", 6) == 0
          && (last_colon = strrchr (line + 6, ':')) != NULL
          && *(last_colon + 1) != '\0'
          && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0'))
        {
          /* A "File: <filename>:<number>" type comment.  Cut the line at
             the colon so that LINE + 6 is just the file name.  */
          *last_colon = '\0';
          po_callback_comment_filepos (line + 6, number);
        }
      else
        po_callback_comment (line);
    }

  /* LINE is a temporary copy made by conv_from_ucs4; release it so that it
     is not leaked once per comment line.
     NOTE(review): this assumes the po_callback_comment* functions do not
     keep the pointer they are given, the same assumption
     special_comment_finish makes for po_callback_comment_special.  */
  free (line);
}


/* Phase 4: Replace each comment that is not inside a string with a space
   character.
   More precisely: a C style comment is returned as a single ' ', a C++ style
   comment (including its terminating newline, if any) as a single '\n'.
   A '/' that does not start a comment is returned unchanged.  The text of
   every comment line is handed to comment_line_end.  Callers that read the
   inside of a quoted string use phase3_getc directly, so this function is
   not applied there.  */

static int
phase4_getc ()
{
  int c;

  c = phase3_getc ();
  if (c != '/')
    return c;
  /* A slash: look at the next character to see whether a comment starts.  */
  c = phase3_getc ();
  switch (c)
    {
    default:
      /* Not a comment.  Give the look-ahead character back.  */
      phase3_ungetc (c);
      return '/';

    case '*':
      /* C style comment.  */
      {
	/* True if the previous character was a '*', so that a following '/'
	   ends the comment.  */
	bool last_was_star;
	/* Number of consecutive '*' stored at the end of the buffer.  */
	size_t trailing_stars;
	/* True once the comment spans more than one line.  */
	bool seen_newline;

	comment_start ();
	last_was_star = false;
	trailing_stars = 0;
	seen_newline = false;
	/* Drop additional stars at the beginning of the comment.  They are
	   not stored, but last_was_star is set, so that a '/' directly
	   after them still ends the comment.  */
	for (;;)
	  {
	    c = phase3_getc ();
	    if (c != '*')
	      break;
	    last_was_star = true;
	  }
	phase3_ungetc (c);
	for (;;)
	  {
	    c = phase3_getc ();
	    /* End of input inside the comment: stop.  The line accumulated
	       so far is not passed to comment_line_end.  */
	    if (c == UEOF)
	      break;
	    /* We skip all leading white space, but not EOLs.  */
	    if (!(buflen == 0 && (c == ' ' || c == '\t')))
	      comment_add (c);
	    switch (c)
	      {
	      case '\n':
		/* End of one line of the comment.  The 1 removes the newline
		   that was just stored.  */
		seen_newline = true;
		comment_line_end (1, false);
		comment_start ();
		last_was_star = false;
		trailing_stars = 0;
		continue;

	      case '*':
		last_was_star = true;
		trailing_stars++;
		continue;

	      case '/':
		if (last_was_star)
		  {
		    /* Drop additional stars at the end of the comment.  The
		       + 1 removes the slash that was just stored.  The fuzzy
		       msgstr test is only done for single-line comments.  */
		    comment_line_end (trailing_stars + 1,
				      expect_fuzzy_msgstr_as_c_comment
				      && !seen_newline);
		    /* Leaves the switch; the break after the switch then
		       leaves the loop.  */
		    break;
		  }
		/* FALLTHROUGH */

	      default:
		last_was_star = false;
		trailing_stars = 0;
		continue;
	      }
	    break;
	  }
	return ' ';
      }

    case '/':
      /* C++ style comment.  It extends to the end of the line or of the
	 input; the newline is consumed but not stored.  */
      comment_start ();
      for (;;)
	{
	  c = phase3_getc ();
	  if (c == '\n' || c == UEOF)
	    break;
	  /* We skip all leading white space, but not EOLs.  */
	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
	    comment_add (c);
	}
      comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
      return '\n';
    }
}

/* Push back the character C.  Phase 4 keeps no pushback of its own; the
   character goes to phase3_ungetc, which also takes back the line count for
   a newline.  */
static inline void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}


/* Tell whether the character C counts as whitespace between tokens: space,
   tab, carriage return, newline, form feed or backspace.  */
static bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
    case '\b':
      return true;

    default:
      return false;
    }
}

/* Tell whether the character C needs quoting, i.e. may not occur in an
   unquoted token.  ASCII letters, ASCII digits and the punctuation
   characters  ! # $ % & * + - . / : ? @ | ~ _ ^  are allowed unquoted;
   every other value needs quoting.  */
static bool
is_quotable (int c)
{
  static const char unquoted_punctuation[] = "!#$%&*+-./:?@|~_^";
  size_t i;

  if (c >= '0' && c <= '9')
    return false;
  if (c >= 'A' && c <= 'Z')
    return false;
  if (c >= 'a' && c <= 'z')
    return false;

  /* Compare as int and stop before the terminating NUL, so that neither 0
     nor values beyond the char range can match by accident.  */
  for (i = 0; i < sizeof unquoted_punctuation - 1; i++)
    if (c == unquoted_punctuation[i])
      return false;

  return true;
}


/* Read a key or value string.
   Return the string in UTF-8 encoding, or NULL if no string is seen, i.e.
   if the input ends after optional whitespace and comments.
   Return the start position of the string in *pos.

   Two forms are accepted:
   - A string enclosed in double-quotes.  Backslash escapes are decoded: one
     to three octal digits, 'u' or 'U' followed by up to four hexadecimal
     digits, and the letters a b t r n v f; any other escaped character
     stands for itself.  If the input ends before the closing quote, an
     error is reported and the text read so far is returned.
   - An unquoted token, made of characters for which is_quotable is false.
     The character that ends the token is pushed back, so that the caller
     sees a '=' or ';' that follows the token without intervening space
     (as in  key=value;).  If the very first character cannot start a
     token, a syntax error is reported, that character is skipped and an
     empty string is returned.  */
static char *
read_string (lex_pos_ty *pos)
{
  static int *buffer;
  static size_t bufmax;
  static size_t buflen;
  int c;

  /* Skip whitespace before the string.  Comments are handled by phase 4
     along the way.  */
  do
    c = phase4_getc ();
  while (is_whitespace (c));

  if (c == UEOF)
    /* No more string.  */
    return NULL;

  *pos = gram_pos;
  buflen = 0;
  if (c == '"')
    {
      /* Read a string enclosed in double-quotes.  Phase 3 is used here
         because comment markers have no meaning inside a string.  */
      for (;;)
        {
          c = phase3_getc ();
          if (c == UEOF || c == '"')
            break;
          if (c == '\\')
            {
              c = phase3_getc ();
              if (c == UEOF)
                break;
              if (c >= '0' && c <= '7')
                {
                  /* Octal escape with one to three digits.  */
                  unsigned int n = 0;
                  int j = 0;
                  for (;;)
                    {
                      n = n * 8 + (c - '0');
                      if (++j == 3)
                        break;
                      c = phase3_getc ();
                      if (!(c >= '0' && c <= '7'))
                        {
                          phase3_ungetc (c);
                          break;
                        }
                    }
                  c = n;
                }
              else if (c == 'u' || c == 'U')
                {
                  /* Hexadecimal escape with zero to four digits.  */
                  unsigned int n = 0;
                  int j;
                  for (j = 0; j < 4; j++)
                    {
                      c = phase3_getc ();
                      if (c >= '0' && c <= '9')
                        n = n * 16 + (c - '0');
                      else if (c >= 'A' && c <= 'F')
                        n = n * 16 + (c - 'A' + 10);
                      else if (c >= 'a' && c <= 'f')
                        n = n * 16 + (c - 'a' + 10);
                      else
                        {
                          phase3_ungetc (c);
                          break;
                        }
                    }
                  c = n;
                }
              else
                switch (c)
                  {
                  case 'a': c = '\a'; break;
                  case 'b': c = '\b'; break;
                  case 't': c = '\t'; break;
                  case 'r': c = '\r'; break;
                  case 'n': c = '\n'; break;
                  case 'v': c = '\v'; break;
                  case 'f': c = '\f'; break;
                  default:
                    /* Any other character is taken literally.  */
                    break;
                  }
            }
          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax * sizeof (int));
            }
          buffer[buflen++] = c;
        }
      if (c == UEOF)
        po_xerror (PO_SEVERITY_ERROR, NULL,
                   real_file_name, gram_pos.line_number, (size_t)(-1), false,
                   _("warning: unterminated string"));
    }
  else
    {
      /* Read a token outside quotes.  */
      if (is_quotable (c))
        po_xerror (PO_SEVERITY_ERROR, NULL,
                   real_file_name, gram_pos.line_number, (size_t)(-1), false,
                   _("warning: syntax error"));
      for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
        {
          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax * sizeof (int));
            }
          buffer[buflen++] = c;
        }
      /* C is the character that ended the token.  It is not part of the
         token, and it may be the '=' or ';' that the caller expects next,
         so give it back instead of swallowing it.  (After a syntax error,
         when the token is empty, the offending character stays skipped.)
         Pushing back UEOF is a no-op.  */
      if (buflen > 0)
        phase4_ungetc (c);
    }

  return conv_from_ucs4 (buffer, buflen);
}


/* Read a .strings file from a stream, and dispatch to the various
   abstract_catalog_reader_class_ty methods.
   FILE is the stream to read.  REAL_FILENAME is used in error messages and
   is copied into gram_pos.file_name.  POP and LOGICAL_FILENAME are not
   referenced in the body of this function.
   Each  key = value;  or  key;  entry is reported through
   po_callback_message; comments are reported by the lower phases while the
   entry is being read.  Parsing stops at the end of the input or at the
   first syntax error after a key.  */
static void
stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file,
		   const char *real_filename, const char *logical_filename)
{
  /* Set up the file-level reader state.  */
  fp = file;
  real_file_name = real_filename;
  gram_pos.file_name = xstrdup (real_file_name);
  gram_pos.line_number = 1;
  encoding = enc_undetermined;
  expect_fuzzy_msgstr_as_c_comment = false;
  expect_fuzzy_msgstr_as_cxx_comment = false;

  for (;;)
    {
      char *msgid;
      lex_pos_ty msgid_pos;
      char *msgstr;
      lex_pos_ty msgstr_pos;
      int c;

      /* Prepare for next msgid/msgstr pair.  */
      special_comment_reset ();
      next_is_obsolete = false;
      next_is_fuzzy = false;
      fuzzy_msgstr = NULL;

      /* Read the key and all the comments preceding it.  */
      msgid = read_string (&msgid_pos);
      if (msgid == NULL)
	break;

      /* Report the flags collected from the comments before the key.  */
      special_comment_finish ();

      /* Skip whitespace.  */
      do
	c = phase4_getc ();
      while (is_whitespace (c));

      /* Expect a '=' or ';'.  */
      if (c == UEOF)
	{
	  po_xerror (PO_SEVERITY_ERROR, NULL,
		     real_file_name, gram_pos.line_number, (size_t)(-1), false,
		     _("warning: unterminated key/value pair"));
	  break;
	}
      if (c == ';')
	{
	  /* "key"; is an abbreviation for "key"=""; and does not
	     necessarily designate an untranslated entry.  */
	  msgstr = xstrdup ("");
	  msgstr_pos = msgid_pos;
	  po_callback_message (NULL, msgid, &msgid_pos, NULL,
			       msgstr, strlen (msgstr) + 1, &msgstr_pos,
			       NULL, NULL, NULL,
			       false, next_is_obsolete);
	}
      else if (c == '=')
	{
	  /* Read the value.  */
	  msgstr = read_string (&msgstr_pos);
	  if (msgstr == NULL)
	    {
	      po_xerror (PO_SEVERITY_ERROR, NULL,
			 real_file_name, gram_pos.line_number, (size_t)(-1),
			 false, _("warning: unterminated key/value pair"));
	      break;
	    }

	  /* Skip whitespace.  But for fuzzy key/value pairs, look for the
	     tentative msgstr in the form of a C style comment.  The flag is
	     switched off as soon as one such comment has been found, so that
	     at most one comment is taken.  */
	  expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
	  do
	    {
	      c = phase4_getc ();
	      if (fuzzy_msgstr != NULL)
		expect_fuzzy_msgstr_as_c_comment = false;
	    }
	  while (is_whitespace (c));
	  expect_fuzzy_msgstr_as_c_comment = false;

	  /* Expect a ';'.  */
	  if (c == ';')
	    {
	      /* But for fuzzy key/value pairs, look for the tentative msgstr
		 in the form of a C++ style comment. */
	      if (fuzzy_msgstr == NULL && next_is_fuzzy)
		{
		  /* Skip spaces (only spaces) after the semicolon.  */
		  do
		    c = phase3_getc ();
		  while (c == ' ');
		  phase3_ungetc (c);

		  /* Peek one phase 4 character with the C++ comment test
		     enabled.  If a comment follows, phase4_getc processes it
		     here; the character it returns is pushed back.  */
		  expect_fuzzy_msgstr_as_cxx_comment = true;
		  c = phase4_getc ();
		  phase4_ungetc (c);
		  expect_fuzzy_msgstr_as_cxx_comment = false;
		}
	      /* The tentative msgstr is only used when the value read from
		 the file is identical to the key.  */
	      if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
		msgstr = fuzzy_msgstr;

	      /* A key/value pair.  */
	      po_callback_message (NULL, msgid, &msgid_pos, NULL,
				   msgstr, strlen (msgstr) + 1, &msgstr_pos,
				   NULL, NULL, NULL,
				   false, next_is_obsolete);
	    }
	  else
	    {
	      po_xerror (PO_SEVERITY_ERROR, NULL,
			 real_file_name, gram_pos.line_number, (size_t)(-1),
			 false, _("\
warning: syntax error, expected ';' after string"));
	      break;
	    }
	}
      else
	{
	  po_xerror (PO_SEVERITY_ERROR, NULL,
		     real_file_name, gram_pos.line_number, (size_t)(-1), false,
		     _("\
warning: syntax error, expected '=' or ';' after string"));
	  break;
	}
    }

  /* Detach the reader state from the finished input.  */
  fp = NULL;
  real_file_name = NULL;
  gram_pos.line_number = 0;
}

/* Descriptor of the .strings input format: the parser entry point defined
   in this file, and the indication that the strings it delivers are in
   UTF-8 (they all come from conv_from_ucs4).  */
const struct catalog_input_format input_format_stringtable =
{
  stringtable_parse,			/* parse */
  true					/* produces_utf8 */
};