gettext-tools/src/msgl-fsearch.c

/* Fast fuzzy searching among messages.
   Copyright (C) 2006 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2006.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "msgl-fsearch.h"

#include <math.h>
#include <stdlib.h>

#include "xalloc.h"
#include "po-charset.h"


/* Fuzzy searching of L strings in a large set of N messages (assuming
   they have all the same small size) takes O(L * N) when using repeated
   fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.

   So we preprocess the set of N messages, yielding a data structure
   that allows to select the similar messages for any given string, and
   apply fstrcmp() only to the search result.  This allows to reduce the
   time to something between O(L * 1) and O(L * N) - depending on the
   structure of the strings.  In the example with L = 800 and N = 69000,
   memory use increases by a factor of 2 and the time decreases from
   9068 s to 230 s.

   The data structure is a hash table mapping each n-gram (here with n=4)
   occurring in the message strings to the list of messages that contains
   it.  For example, if the message list is
      [0] "close"
      [1] "losers"
   the index is a hash table mapping
      "clos" -> { 0 }
      "lose" -> { 0, 1 }
      "oser" -> { 1 }
      "sers" -> { 1 }
   Searching the similar messages to, say, "closest", is done by looking up
   all its 4-grams in the hash table and summing up the results:
      "clos" -> { 0 }
      "lose" -> { 0, 1 }
      "oses" -> {}
      "sest" -> {}
   => total: { 2x0, 1x1 }
   The list is sorted according to decreasing frequency: { 0, 1, ... }
   and only the first few messages from this frequency list are passed to
   fstrcmp().

   The n-gram similarity and the fstrcmp() similarity are quite different
   metrics.  For example, "close" and "c l o s e" have no n-grams in common
   (even for n as small as n = 2); however, fstrcmp() will find that they
   are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
   "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
   only 0.02.  Also, repeated n-grams don't have the same effect on the
   two similarity measures.  But all this doesn't matter much in practice.

   We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
   lists are likely too long.  (Well, with ideographic languages like Chinese,
   n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
   either.)

   The units are characters in the current encoding.  Not just bytes,
   because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */

/* Each message is represented by its index in the message list.  */
typedef unsigned int index_ty;

/* An index list has its allocated size and length tacked at the front.
   The indices are sorted in ascending order.  */
typedef index_ty *index_list_ty;
#define IL_ALLOCATED 0
#define IL_LENGTH 1

/* Create a new index list containing only a given index.  */
static inline index_list_ty
new_index (index_ty idx)
{
  index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
  list[IL_ALLOCATED] = 1;
  list[IL_LENGTH] = 1;
  list[2] = idx;

  return list;
}

/* Add a given index, greater or equal than all previous indices, to an
   index list.
   Return the new index list, if it had to be reallocated, or NULL if it
   didn't change.  */
static inline index_list_ty
addlast_index (index_list_ty list, index_ty idx)
{
  index_list_ty result;
  size_t length = list[IL_LENGTH];

  /* Look whether it should be inserted.  */
  if (length > 0 && list[2 + (length - 1)] == idx)
    return NULL;

  /* Now make room for one more list element.  */
  result = NULL;
  if (length == list[IL_ALLOCATED])
    {
      size_t new_allocated = 2 * length - (length >> 6);
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
      list[IL_ALLOCATED] = new_allocated;
      result = list;
    }
  list[2 + length] = idx;
  list[IL_LENGTH] = length + 1;
  return result;
}

/* Add a given index to an index list.
   Return the new index list, if it had to be reallocated, or NULL if it
   didn't change.  */
static inline index_list_ty
add_index (index_list_ty list, index_ty idx)
{
  index_list_ty result;
  size_t length = list[IL_LENGTH];

  /* Look where it should be inserted.  */
  size_t lo = 0;
  size_t hi = length;
  while (lo < hi)
    {
      /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
      size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
      index_ty val = list[2 + mid];
      if (val < idx)
	lo = mid + 1;
      else if (val > idx)
	hi = mid;
      else
	return NULL;
    }

  /* Now make room for one more list element.  */
  result = NULL;
  if (length == list[IL_ALLOCATED])
    {
      size_t new_allocated = 2 * length - (length >> 6);
      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
      list[IL_ALLOCATED] = new_allocated;
      result = list;
    }
  list[IL_LENGTH] = length + 1;
  for (; length > hi; length--)
    list[2 + length] = list[1 + length];
  list[2 + length] = idx;
  return result;
}

/* We use 4-grams, therefore strings with less than 4 characters cannot be
   handled through the 4-grams table and need to be handled specially.
   Since every character occupies at most 4 bytes (see po-charset.c),
   this means the size of such short strings is bounded by:  */
#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)

/* Such short strings are handled by direct comparison with all messages
   of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
   fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
   fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
   limit the search to lengths l' in the range
     l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
   Thus we need the list of the short strings up to length:  */
#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))

/* A fuzzy index contains a hash table mapping all n-grams to their
   occurrences list.  */
struct message_fuzzy_index_ty
{
  message_ty **messages;
  character_iterator_t iterator;
  hash_table gram4;
  size_t firstfew;
  message_list_ty *short_messages[SHORT_MSG_MAX + 1];
};

/* Allocate a fuzzy index corresponding to a given list of messages.
   The list of messages and the msgctxt and msgid fields of the messages
   inside it must not be modified while the returned fuzzy index is in use.  */
message_fuzzy_index_ty *
message_fuzzy_index_alloc (const message_list_ty *mlp,
			   const char *canon_charset)
{
  message_fuzzy_index_ty *findex =
    (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
  size_t count = mlp->nitems;
  size_t j;
  size_t l;

  findex->messages = mlp->item;
  findex->iterator = po_charset_character_iterator (canon_charset);

  /* Setup hash table.  */
  if (hash_init (&findex->gram4, 10 * count) < 0)
    xalloc_die ();
  for (j = 0; j < count; j++)
    {
      message_ty *mp = mlp->item[j];

      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
	{
	  const char *str = mp->msgid;

	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
	  const char *p0 = str;
	  if (*p0 != '\0')
	    {
	      const char *p1 = p0 + findex->iterator (p0);
	      if (*p1 != '\0')
		{
		  const char *p2 = p1 + findex->iterator (p1);
		  if (*p2 != '\0')
		    {
		      const char *p3 = p2 + findex->iterator (p2);
		      if (*p3 != '\0')
			{
			  const char *p4 = p3 + findex->iterator (p3);
			  for (;;)
			    {
			      /* The segment from p0 to p4 is a 4-gram of
				 characters.  Add a hash table entry that maps
				 it to the index j, or extend the existing
				 hash table entry accordingly.  */
			      void *found;

			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
						   &found) == 0)
				{
				  index_list_ty list = (index_list_ty) found;
				  list = addlast_index (list, j);
				  if (list != NULL)
				    hash_set_value (&findex->gram4, p0, p4 - p0,
						    list);
				}
			      else
				hash_insert_entry (&findex->gram4, p0, p4 - p0,
						   new_index (j));

			      /* Advance.  */
			      if (*p4 == '\0')
				break;
			      p0 = p1;
			      p1 = p2;
			      p2 = p3;
			      p3 = p4;
			      p4 = p4 + findex->iterator (p4);
			    }
			}
		    }
		}
	    }
	}
    }

  /* Shrink memory used by the hash table.  */
  {
    void *iter;
    const void *key;
    size_t keylen;
    void **valuep;

    iter = NULL;
    while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
	   == 0)
      {
	index_list_ty list = (index_list_ty) *valuep;
	index_ty length = list[IL_LENGTH];

	if (length < list[IL_ALLOCATED])
	  {
	    list[IL_ALLOCATED] = length;
	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
	  }
      }
  }

  findex->firstfew = (int) sqrt ((double) count);
  if (findex->firstfew < 10)
    findex->firstfew = 10;

  /* Setup lists of short messages.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    findex->short_messages[l] = message_list_alloc (false);
  for (j = 0; j < count; j++)
    {
      message_ty *mp = mlp->item[j];

      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
	{
	  const char *str = mp->msgid;
	  size_t len = strlen (str);

	  if (len <= SHORT_MSG_MAX)
	    message_list_append (findex->short_messages[len], mp);
	}
    }

  /* Shrink memory used by the lists of short messages.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    {
      message_list_ty *mlp = findex->short_messages[l];

      if (mlp->nitems < mlp->nitems_max)
	{
	  mlp->nitems_max = mlp->nitems;
	  mlp->item =
	    (message_ty **)
	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
	}
    }

  return findex;
}

/* An index with multiplicity.  */
struct mult_index
{
  index_ty index;
  unsigned int count;
};

/* A list of indices with multiplicity.
   The indices are sorted in ascending order.  */
struct mult_index_list
{
  struct mult_index *item;
  size_t nitems;
  size_t nitems_max;
  /* Work area.  */
  struct mult_index *item2;
  size_t nitems2_max;
};

/* Initialize an empty list of indices with multiplicity.  */
static inline void
mult_index_list_init (struct mult_index_list *accu)
{
  accu->nitems = 0;
  accu->nitems_max = 0;
  accu->item = NULL;
  accu->nitems2_max = 0;
  accu->item2 = NULL;
}

/* Add an index list to a list of indices with multiplicity.  */
static inline void
mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
{
  size_t len1 = accu->nitems;
  size_t len2 = list[IL_LENGTH];
  size_t need = len1 + len2;
  struct mult_index *ptr1;
  struct mult_index *ptr1_end;
  index_ty *ptr2;
  index_ty *ptr2_end;
  struct mult_index *destptr;

  /* Make the work area large enough.  */
  if (accu->nitems2_max < need)
    {
      size_t new_max = 2 * accu->nitems2_max + 1;

      if (new_max < need)
	new_max = need;
      if (accu->item2 != NULL)
	free (accu->item2);
      accu->item2 =
	(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
      accu->nitems2_max = new_max;
    }

  /* Make a linear pass through accu and list simultaneously.  */
  ptr1 = accu->item;
  ptr1_end = ptr1 + len1;
  ptr2 = list + 2;
  ptr2_end = ptr2 + len2;
  destptr = accu->item2;
  while (ptr1 < ptr1_end && ptr2 < ptr2_end)
    {
      if (ptr1->index < *ptr2)
	{
	  *destptr = *ptr1;
	  ptr1++;
	}
      else if (ptr1->index > *ptr2)
	{
	  destptr->index = *ptr2;
	  destptr->count = 1;
	  ptr2++;
	}
      else /* ptr1->index == list[2 + i2] */
	{
	  destptr->index = ptr1->index;
	  destptr->count = ptr1->count + 1;
	  ptr1++;
	  ptr2++;
	}
      destptr++;
    }
  while (ptr1 < ptr1_end)
    {
      *destptr = *ptr1;
      ptr1++;
      destptr++;
    }
  while (ptr2 < ptr2_end)
    {
      destptr->index = *ptr2;
      destptr->count = 1;
      ptr2++;
      destptr++;
    }

  /* Swap accu->item and accu->item2.  */
  {
    struct mult_index *dest = accu->item2;
    size_t dest_max = accu->nitems2_max;

    accu->item2 = accu->item;
    accu->nitems2_max = accu->nitems_max;

    accu->item = dest;
    accu->nitems = destptr - dest;
    accu->nitems_max = dest_max;
  }
}

/* Compares two indices with multiplicity, according to their multiplicity.  */
static int
mult_index_compare (const void *p1, const void *p2)
{
  const struct mult_index *ptr1 = (const struct mult_index *) p1;
  const struct mult_index *ptr2 = (const struct mult_index *) p2;

  if (ptr1->count < ptr2->count)
    return 1;
  if (ptr1->count > ptr2->count)
    return -1;
  /* For reproduceable results, also take into account the indices.  */
  if (ptr1->index > ptr2->index)
    return 1;
  if (ptr1->index < ptr2->index)
    return -1;
  return 0;
}

/* Sort a list of indices with multiplicity according to decreasing
   multiplicity.  */
static inline void
mult_index_list_sort (struct mult_index_list *accu)
{
  if (accu->nitems > 1)
    qsort (accu->item, accu->nitems, sizeof (struct mult_index),
	   mult_index_compare);
}

/* Frees a list of indices with multiplicity.  */
static inline void
mult_index_list_free (struct mult_index_list *accu)
{
  if (accu->item != NULL)
    free (accu->item);
  if (accu->item2 != NULL)
    free (accu->item2);
}

/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
   The match does not need to be optimal.  */
message_ty *
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
			    const char *msgctxt, const char *msgid)
{
  const char *str = msgid;

  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
  const char *p0 = str;
  if (*p0 != '\0')
    {
      const char *p1 = p0 + findex->iterator (p0);
      if (*p1 != '\0')
	{
	  const char *p2 = p1 + findex->iterator (p1);
	  if (*p2 != '\0')
	    {
	      const char *p3 = p2 + findex->iterator (p2);
	      if (*p3 != '\0')
		{
		  const char *p4 = p3 + findex->iterator (p3);
		  struct mult_index_list accu;

		  mult_index_list_init (&accu);
		  for (;;)
		    {
		      /* The segment from p0 to p4 is a 4-gram of
			 characters.  Get the hash table entry containing
			 a list of indices, and add it to the accu.  */
		      void *found;

		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
					   &found) == 0)
			{
			  index_list_ty list = (index_list_ty) found;
			  mult_index_list_accumulate (&accu, list);
			}

		      /* Advance.  */
		      if (*p4 == '\0')
			break;
		      p0 = p1;
		      p1 = p2;
		      p2 = p3;
		      p3 = p4;
		      p4 = p4 + findex->iterator (p4);
		    }

		  /* Sort in decreasing count order.  */
		  mult_index_list_sort (&accu);

		  /* Take the first few messages from this sorted list, and
		     maximize the fstrcmp() result.  */
		  {
		    size_t count;
		    struct mult_index *ptr;
		    message_ty *best_mp;
		    double best_weight;

		    count = findex->firstfew;
		    if (count > accu.nitems)
		      count = accu.nitems;

		    best_weight = FUZZY_THRESHOLD;
		    best_mp = NULL;
		    for (ptr = accu.item; count > 0; ptr++, count--)
		      {
			message_ty *mp = findex->messages[ptr->index];
			double weight =
			  fuzzy_search_goal_function (mp, msgctxt, msgid);

			if (weight > best_weight)
			  {
			    best_weight = weight;
			    best_mp = mp;
			  }
		      }

		    mult_index_list_free (&accu);

		    return best_mp;
		  }
		}
	    }
	}
    }

  /* The string had less than 4 characters.  */
  {
    size_t l = strlen (str);
    size_t lmin, lmax;
    message_ty *best_mp;
    double best_weight;

    if (!(l <= SHORT_STRING_MAX_BYTES))
      abort ();

    /* Walk through those short messages which have an appropriate length.
       See the comment before SHORT_MSG_MAX.  */
    lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
    lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
    if (!(lmax <= SHORT_MSG_MAX))
      abort ();

    best_weight = FUZZY_THRESHOLD;
    best_mp = NULL;
    for (l = lmin; l <= lmax; l++)
      {
	message_list_ty *mlp = findex->short_messages[l];
	size_t j;

	for (j = 0; j < mlp->nitems; j++)
	  {
	    message_ty *mp = mlp->item[j];
	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);

	    if (weight > best_weight)
	      {
		best_weight = weight;
		best_mp = mp;
	      }
	  }
      }

    return best_mp;
  }
}

/* Free a fuzzy index.  */
void
message_fuzzy_index_free (message_fuzzy_index_ty *findex)
{
  size_t l;
  void *iter;
  const void *key;
  size_t keylen;
  void *data;

  /* Free the short lists.  */
  for (l = 0; l <= SHORT_MSG_MAX; l++)
    message_list_free (findex->short_messages[l], 1);

  /* Free the index lists occurring as values in the hash tables.  */
  iter = NULL;
  while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
    free ((index_list_ty *) data);
  /* Free the hash table itself.  */
  hash_destroy (&findex->gram4);

  free (findex);
}