1/* Fast fuzzy searching among messages.
2   Copyright (C) 2006 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program; if not, write to the Free Software Foundation,
17   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19#ifdef HAVE_CONFIG_H
20# include <config.h>
21#endif
22
23/* Specification.  */
24#include "msgl-fsearch.h"
25
26#include <math.h>
27#include <stdlib.h>
28
29#include "xalloc.h"
30#include "po-charset.h"
31
32
33/* Fuzzy searching of L strings in a large set of N messages (assuming
34   they have all the same small size) takes O(L * N) when using repeated
35   fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
36
37   So we preprocess the set of N messages, yielding a data structure
38   that allows to select the similar messages for any given string, and
39   apply fstrcmp() only to the search result.  This allows to reduce the
40   time to something between O(L * 1) and O(L * N) - depending on the
41   structure of the strings.  In the example with L = 800 and N = 69000,
42   memory use increases by a factor of 2 and the time decreases from
43   9068 s to 230 s.
44
45   The data structure is a hash table mapping each n-gram (here with n=4)
46   occurring in the message strings to the list of messages that contains
47   it.  For example, if the message list is
48      [0] "close"
49      [1] "losers"
50   the index is a hash table mapping
51      "clos" -> { 0 }
52      "lose" -> { 0, 1 }
53      "oser" -> { 1 }
54      "sers" -> { 1 }
55   Searching the similar messages to, say, "closest", is done by looking up
56   all its 4-grams in the hash table and summing up the results:
57      "clos" -> { 0 }
58      "lose" -> { 0, 1 }
59      "oses" -> {}
60      "sest" -> {}
61   => total: { 2x0, 1x1 }
62   The list is sorted according to decreasing frequency: { 0, 1, ... }
63   and only the first few messages from this frequency list are passed to
64   fstrcmp().
65
66   The n-gram similarity and the fstrcmp() similarity are quite different
67   metrics.  For example, "close" and "c l o s e" have no n-grams in common
68   (even for n as small as n = 2); however, fstrcmp() will find that they
69   are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
70   "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
71   only 0.02.  Also, repeated n-grams don't have the same effect on the
72   two similarity measures.  But all this doesn't matter much in practice.
73
74   We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
75   lists are likely too long.  (Well, with ideographic languages like Chinese,
76   n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
77   either.)
78
79   The units are characters in the current encoding.  Not just bytes,
80   because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
81
82/* Each message is represented by its index in the message list.  */
83typedef unsigned int index_ty;
84
85/* An index list has its allocated size and length tacked at the front.
86   The indices are sorted in ascending order.  */
87typedef index_ty *index_list_ty;
88#define IL_ALLOCATED 0
89#define IL_LENGTH 1
90
91/* Create a new index list containing only a given index.  */
92static inline index_list_ty
93new_index (index_ty idx)
94{
95  index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
96  list[IL_ALLOCATED] = 1;
97  list[IL_LENGTH] = 1;
98  list[2] = idx;
99
100  return list;
101}
102
103/* Add a given index, greater or equal than all previous indices, to an
104   index list.
105   Return the new index list, if it had to be reallocated, or NULL if it
106   didn't change.  */
107static inline index_list_ty
108addlast_index (index_list_ty list, index_ty idx)
109{
110  index_list_ty result;
111  size_t length = list[IL_LENGTH];
112
113  /* Look whether it should be inserted.  */
114  if (length > 0 && list[2 + (length - 1)] == idx)
115    return NULL;
116
117  /* Now make room for one more list element.  */
118  result = NULL;
119  if (length == list[IL_ALLOCATED])
120    {
121      size_t new_allocated = 2 * length - (length >> 6);
122      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
123      list[IL_ALLOCATED] = new_allocated;
124      result = list;
125    }
126  list[2 + length] = idx;
127  list[IL_LENGTH] = length + 1;
128  return result;
129}
130
131/* Add a given index to an index list.
132   Return the new index list, if it had to be reallocated, or NULL if it
133   didn't change.  */
134static inline index_list_ty
135add_index (index_list_ty list, index_ty idx)
136{
137  index_list_ty result;
138  size_t length = list[IL_LENGTH];
139
140  /* Look where it should be inserted.  */
141  size_t lo = 0;
142  size_t hi = length;
143  while (lo < hi)
144    {
145      /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
146      size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
147      index_ty val = list[2 + mid];
148      if (val < idx)
149	lo = mid + 1;
150      else if (val > idx)
151	hi = mid;
152      else
153	return NULL;
154    }
155
156  /* Now make room for one more list element.  */
157  result = NULL;
158  if (length == list[IL_ALLOCATED])
159    {
160      size_t new_allocated = 2 * length - (length >> 6);
161      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
162      list[IL_ALLOCATED] = new_allocated;
163      result = list;
164    }
165  list[IL_LENGTH] = length + 1;
166  for (; length > hi; length--)
167    list[2 + length] = list[1 + length];
168  list[2 + length] = idx;
169  return result;
170}
171
172/* We use 4-grams, therefore strings with less than 4 characters cannot be
173   handled through the 4-grams table and need to be handled specially.
174   Since every character occupies at most 4 bytes (see po-charset.c),
175   this means the size of such short strings is bounded by:  */
176#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
177#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
178
179/* Such short strings are handled by direct comparison with all messages
180   of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
181   fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
182   fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
183   limit the search to lengths l' in the range
184     l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
185   Thus we need the list of the short strings up to length:  */
186#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))
187
188/* A fuzzy index contains a hash table mapping all n-grams to their
189   occurrences list.  */
190struct message_fuzzy_index_ty
191{
192  message_ty **messages;
193  character_iterator_t iterator;
194  hash_table gram4;
195  size_t firstfew;
196  message_list_ty *short_messages[SHORT_MSG_MAX + 1];
197};
198
199/* Allocate a fuzzy index corresponding to a given list of messages.
200   The list of messages and the msgctxt and msgid fields of the messages
201   inside it must not be modified while the returned fuzzy index is in use.  */
202message_fuzzy_index_ty *
203message_fuzzy_index_alloc (const message_list_ty *mlp,
204			   const char *canon_charset)
205{
206  message_fuzzy_index_ty *findex =
207    (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
208  size_t count = mlp->nitems;
209  size_t j;
210  size_t l;
211
212  findex->messages = mlp->item;
213  findex->iterator = po_charset_character_iterator (canon_charset);
214
215  /* Setup hash table.  */
216  if (hash_init (&findex->gram4, 10 * count) < 0)
217    xalloc_die ();
218  for (j = 0; j < count; j++)
219    {
220      message_ty *mp = mlp->item[j];
221
222      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
223	{
224	  const char *str = mp->msgid;
225
226	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
227	  const char *p0 = str;
228	  if (*p0 != '\0')
229	    {
230	      const char *p1 = p0 + findex->iterator (p0);
231	      if (*p1 != '\0')
232		{
233		  const char *p2 = p1 + findex->iterator (p1);
234		  if (*p2 != '\0')
235		    {
236		      const char *p3 = p2 + findex->iterator (p2);
237		      if (*p3 != '\0')
238			{
239			  const char *p4 = p3 + findex->iterator (p3);
240			  for (;;)
241			    {
242			      /* The segment from p0 to p4 is a 4-gram of
243				 characters.  Add a hash table entry that maps
244				 it to the index j, or extend the existing
245				 hash table entry accordingly.  */
246			      void *found;
247
248			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
249						   &found) == 0)
250				{
251				  index_list_ty list = (index_list_ty) found;
252				  list = addlast_index (list, j);
253				  if (list != NULL)
254				    hash_set_value (&findex->gram4, p0, p4 - p0,
255						    list);
256				}
257			      else
258				hash_insert_entry (&findex->gram4, p0, p4 - p0,
259						   new_index (j));
260
261			      /* Advance.  */
262			      if (*p4 == '\0')
263				break;
264			      p0 = p1;
265			      p1 = p2;
266			      p2 = p3;
267			      p3 = p4;
268			      p4 = p4 + findex->iterator (p4);
269			    }
270			}
271		    }
272		}
273	    }
274	}
275    }
276
277  /* Shrink memory used by the hash table.  */
278  {
279    void *iter;
280    const void *key;
281    size_t keylen;
282    void **valuep;
283
284    iter = NULL;
285    while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
286	   == 0)
287      {
288	index_list_ty list = (index_list_ty) *valuep;
289	index_ty length = list[IL_LENGTH];
290
291	if (length < list[IL_ALLOCATED])
292	  {
293	    list[IL_ALLOCATED] = length;
294	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
295	  }
296      }
297  }
298
299  findex->firstfew = (int) sqrt ((double) count);
300  if (findex->firstfew < 10)
301    findex->firstfew = 10;
302
303  /* Setup lists of short messages.  */
304  for (l = 0; l <= SHORT_MSG_MAX; l++)
305    findex->short_messages[l] = message_list_alloc (false);
306  for (j = 0; j < count; j++)
307    {
308      message_ty *mp = mlp->item[j];
309
310      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
311	{
312	  const char *str = mp->msgid;
313	  size_t len = strlen (str);
314
315	  if (len <= SHORT_MSG_MAX)
316	    message_list_append (findex->short_messages[len], mp);
317	}
318    }
319
320  /* Shrink memory used by the lists of short messages.  */
321  for (l = 0; l <= SHORT_MSG_MAX; l++)
322    {
323      message_list_ty *mlp = findex->short_messages[l];
324
325      if (mlp->nitems < mlp->nitems_max)
326	{
327	  mlp->nitems_max = mlp->nitems;
328	  mlp->item =
329	    (message_ty **)
330	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
331	}
332    }
333
334  return findex;
335}
336
337/* An index with multiplicity.  */
338struct mult_index
339{
340  index_ty index;
341  unsigned int count;
342};
343
344/* A list of indices with multiplicity.
345   The indices are sorted in ascending order.  */
346struct mult_index_list
347{
348  struct mult_index *item;
349  size_t nitems;
350  size_t nitems_max;
351  /* Work area.  */
352  struct mult_index *item2;
353  size_t nitems2_max;
354};
355
356/* Initialize an empty list of indices with multiplicity.  */
357static inline void
358mult_index_list_init (struct mult_index_list *accu)
359{
360  accu->nitems = 0;
361  accu->nitems_max = 0;
362  accu->item = NULL;
363  accu->nitems2_max = 0;
364  accu->item2 = NULL;
365}
366
367/* Add an index list to a list of indices with multiplicity.  */
368static inline void
369mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
370{
371  size_t len1 = accu->nitems;
372  size_t len2 = list[IL_LENGTH];
373  size_t need = len1 + len2;
374  struct mult_index *ptr1;
375  struct mult_index *ptr1_end;
376  index_ty *ptr2;
377  index_ty *ptr2_end;
378  struct mult_index *destptr;
379
380  /* Make the work area large enough.  */
381  if (accu->nitems2_max < need)
382    {
383      size_t new_max = 2 * accu->nitems2_max + 1;
384
385      if (new_max < need)
386	new_max = need;
387      if (accu->item2 != NULL)
388	free (accu->item2);
389      accu->item2 =
390	(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
391      accu->nitems2_max = new_max;
392    }
393
394  /* Make a linear pass through accu and list simultaneously.  */
395  ptr1 = accu->item;
396  ptr1_end = ptr1 + len1;
397  ptr2 = list + 2;
398  ptr2_end = ptr2 + len2;
399  destptr = accu->item2;
400  while (ptr1 < ptr1_end && ptr2 < ptr2_end)
401    {
402      if (ptr1->index < *ptr2)
403	{
404	  *destptr = *ptr1;
405	  ptr1++;
406	}
407      else if (ptr1->index > *ptr2)
408	{
409	  destptr->index = *ptr2;
410	  destptr->count = 1;
411	  ptr2++;
412	}
413      else /* ptr1->index == list[2 + i2] */
414	{
415	  destptr->index = ptr1->index;
416	  destptr->count = ptr1->count + 1;
417	  ptr1++;
418	  ptr2++;
419	}
420      destptr++;
421    }
422  while (ptr1 < ptr1_end)
423    {
424      *destptr = *ptr1;
425      ptr1++;
426      destptr++;
427    }
428  while (ptr2 < ptr2_end)
429    {
430      destptr->index = *ptr2;
431      destptr->count = 1;
432      ptr2++;
433      destptr++;
434    }
435
436  /* Swap accu->item and accu->item2.  */
437  {
438    struct mult_index *dest = accu->item2;
439    size_t dest_max = accu->nitems2_max;
440
441    accu->item2 = accu->item;
442    accu->nitems2_max = accu->nitems_max;
443
444    accu->item = dest;
445    accu->nitems = destptr - dest;
446    accu->nitems_max = dest_max;
447  }
448}
449
450/* Compares two indices with multiplicity, according to their multiplicity.  */
451static int
452mult_index_compare (const void *p1, const void *p2)
453{
454  const struct mult_index *ptr1 = (const struct mult_index *) p1;
455  const struct mult_index *ptr2 = (const struct mult_index *) p2;
456
457  if (ptr1->count < ptr2->count)
458    return 1;
459  if (ptr1->count > ptr2->count)
460    return -1;
461  /* For reproduceable results, also take into account the indices.  */
462  if (ptr1->index > ptr2->index)
463    return 1;
464  if (ptr1->index < ptr2->index)
465    return -1;
466  return 0;
467}
468
469/* Sort a list of indices with multiplicity according to decreasing
470   multiplicity.  */
471static inline void
472mult_index_list_sort (struct mult_index_list *accu)
473{
474  if (accu->nitems > 1)
475    qsort (accu->item, accu->nitems, sizeof (struct mult_index),
476	   mult_index_compare);
477}
478
479/* Frees a list of indices with multiplicity.  */
480static inline void
481mult_index_list_free (struct mult_index_list *accu)
482{
483  if (accu->item != NULL)
484    free (accu->item);
485  if (accu->item2 != NULL)
486    free (accu->item2);
487}
488
489/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
490   The match does not need to be optimal.  */
491message_ty *
492message_fuzzy_index_search (message_fuzzy_index_ty *findex,
493			    const char *msgctxt, const char *msgid)
494{
495  const char *str = msgid;
496
497  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
498  const char *p0 = str;
499  if (*p0 != '\0')
500    {
501      const char *p1 = p0 + findex->iterator (p0);
502      if (*p1 != '\0')
503	{
504	  const char *p2 = p1 + findex->iterator (p1);
505	  if (*p2 != '\0')
506	    {
507	      const char *p3 = p2 + findex->iterator (p2);
508	      if (*p3 != '\0')
509		{
510		  const char *p4 = p3 + findex->iterator (p3);
511		  struct mult_index_list accu;
512
513		  mult_index_list_init (&accu);
514		  for (;;)
515		    {
516		      /* The segment from p0 to p4 is a 4-gram of
517			 characters.  Get the hash table entry containing
518			 a list of indices, and add it to the accu.  */
519		      void *found;
520
521		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
522					   &found) == 0)
523			{
524			  index_list_ty list = (index_list_ty) found;
525			  mult_index_list_accumulate (&accu, list);
526			}
527
528		      /* Advance.  */
529		      if (*p4 == '\0')
530			break;
531		      p0 = p1;
532		      p1 = p2;
533		      p2 = p3;
534		      p3 = p4;
535		      p4 = p4 + findex->iterator (p4);
536		    }
537
538		  /* Sort in decreasing count order.  */
539		  mult_index_list_sort (&accu);
540
541		  /* Take the first few messages from this sorted list, and
542		     maximize the fstrcmp() result.  */
543		  {
544		    size_t count;
545		    struct mult_index *ptr;
546		    message_ty *best_mp;
547		    double best_weight;
548
549		    count = findex->firstfew;
550		    if (count > accu.nitems)
551		      count = accu.nitems;
552
553		    best_weight = FUZZY_THRESHOLD;
554		    best_mp = NULL;
555		    for (ptr = accu.item; count > 0; ptr++, count--)
556		      {
557			message_ty *mp = findex->messages[ptr->index];
558			double weight =
559			  fuzzy_search_goal_function (mp, msgctxt, msgid);
560
561			if (weight > best_weight)
562			  {
563			    best_weight = weight;
564			    best_mp = mp;
565			  }
566		      }
567
568		    mult_index_list_free (&accu);
569
570		    return best_mp;
571		  }
572		}
573	    }
574	}
575    }
576
577  /* The string had less than 4 characters.  */
578  {
579    size_t l = strlen (str);
580    size_t lmin, lmax;
581    message_ty *best_mp;
582    double best_weight;
583
584    if (!(l <= SHORT_STRING_MAX_BYTES))
585      abort ();
586
587    /* Walk through those short messages which have an appropriate length.
588       See the comment before SHORT_MSG_MAX.  */
589    lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
590    lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
591    if (!(lmax <= SHORT_MSG_MAX))
592      abort ();
593
594    best_weight = FUZZY_THRESHOLD;
595    best_mp = NULL;
596    for (l = lmin; l <= lmax; l++)
597      {
598	message_list_ty *mlp = findex->short_messages[l];
599	size_t j;
600
601	for (j = 0; j < mlp->nitems; j++)
602	  {
603	    message_ty *mp = mlp->item[j];
604	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
605
606	    if (weight > best_weight)
607	      {
608		best_weight = weight;
609		best_mp = mp;
610	      }
611	  }
612      }
613
614    return best_mp;
615  }
616}
617
618/* Free a fuzzy index.  */
619void
620message_fuzzy_index_free (message_fuzzy_index_ty *findex)
621{
622  size_t l;
623  void *iter;
624  const void *key;
625  size_t keylen;
626  void *data;
627
628  /* Free the short lists.  */
629  for (l = 0; l <= SHORT_MSG_MAX; l++)
630    message_list_free (findex->short_messages[l], 1);
631
632  /* Free the index lists occurring as values in the hash tables.  */
633  iter = NULL;
634  while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
635    free ((index_list_ty *) data);
636  /* Free the hash table itself.  */
637  hash_destroy (&findex->gram4);
638
639  free (findex);
640}
641