• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/src/
1/* Fast fuzzy searching among messages.
2   Copyright (C) 2006 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22/* Specification.  */
23#include "msgl-fsearch.h"
24
25#include <math.h>
26#include <stdlib.h>
27
28#include "xalloc.h"
29#include "po-charset.h"
30
31
32/* Fuzzy searching of L strings in a large set of N messages (assuming
33   they have all the same small size) takes O(L * N) when using repeated
34   fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
35
36   So we preprocess the set of N messages, yielding a data structure
37   that allows to select the similar messages for any given string, and
38   apply fstrcmp() only to the search result.  This allows to reduce the
39   time to something between O(L * 1) and O(L * N) - depending on the
40   structure of the strings.  In the example with L = 800 and N = 69000,
41   memory use increases by a factor of 2 and the time decreases from
42   9068 s to 230 s.
43
44   The data structure is a hash table mapping each n-gram (here with n=4)
45   occurring in the message strings to the list of messages that contains
46   it.  For example, if the message list is
47      [0] "close"
48      [1] "losers"
49   the index is a hash table mapping
50      "clos" -> { 0 }
51      "lose" -> { 0, 1 }
52      "oser" -> { 1 }
53      "sers" -> { 1 }
54   Searching the similar messages to, say, "closest", is done by looking up
55   all its 4-grams in the hash table and summing up the results:
56      "clos" -> { 0 }
57      "lose" -> { 0, 1 }
58      "oses" -> {}
59      "sest" -> {}
60   => total: { 2x0, 1x1 }
61   The list is sorted according to decreasing frequency: { 0, 1, ... }
62   and only the first few messages from this frequency list are passed to
63   fstrcmp().
64
65   The n-gram similarity and the fstrcmp() similarity are quite different
66   metrics.  For example, "close" and "c l o s e" have no n-grams in common
67   (even for n as small as n = 2); however, fstrcmp() will find that they
68   are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
69   "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
70   only 0.02.  Also, repeated n-grams don't have the same effect on the
71   two similarity measures.  But all this doesn't matter much in practice.
72
73   We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
74   lists are likely too long.  (Well, with ideographic languages like Chinese,
75   n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
76   either.)
77
78   The units are characters in the current encoding.  Not just bytes,
79   because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
80
81/* Each message is represented by its index in the message list.  */
82typedef unsigned int index_ty;
83
84/* An index list has its allocated size and length tacked at the front.
85   The indices are sorted in ascending order.  */
86typedef index_ty *index_list_ty;
87#define IL_ALLOCATED 0
88#define IL_LENGTH 1
89
90/* Create a new index list containing only a given index.  */
91static inline index_list_ty
92new_index (index_ty idx)
93{
94  index_ty *list = XNMALLOC (2 + 1, index_ty);
95  list[IL_ALLOCATED] = 1;
96  list[IL_LENGTH] = 1;
97  list[2] = idx;
98
99  return list;
100}
101
102/* Add a given index, greater or equal than all previous indices, to an
103   index list.
104   Return the new index list, if it had to be reallocated, or NULL if it
105   didn't change.  */
106static inline index_list_ty
107addlast_index (index_list_ty list, index_ty idx)
108{
109  index_list_ty result;
110  size_t length = list[IL_LENGTH];
111
112  /* Look whether it should be inserted.  */
113  if (length > 0 && list[2 + (length - 1)] == idx)
114    return NULL;
115
116  /* Now make room for one more list element.  */
117  result = NULL;
118  if (length == list[IL_ALLOCATED])
119    {
120      size_t new_allocated = 2 * length - (length >> 6);
121      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
122      list[IL_ALLOCATED] = new_allocated;
123      result = list;
124    }
125  list[2 + length] = idx;
126  list[IL_LENGTH] = length + 1;
127  return result;
128}
129
130/* Add a given index to an index list.
131   Return the new index list, if it had to be reallocated, or NULL if it
132   didn't change.  */
133static inline index_list_ty
134add_index (index_list_ty list, index_ty idx)
135{
136  index_list_ty result;
137  size_t length = list[IL_LENGTH];
138
139  /* Look where it should be inserted.  */
140  size_t lo = 0;
141  size_t hi = length;
142  while (lo < hi)
143    {
144      /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
145      size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
146      index_ty val = list[2 + mid];
147      if (val < idx)
148	lo = mid + 1;
149      else if (val > idx)
150	hi = mid;
151      else
152	return NULL;
153    }
154
155  /* Now make room for one more list element.  */
156  result = NULL;
157  if (length == list[IL_ALLOCATED])
158    {
159      size_t new_allocated = 2 * length - (length >> 6);
160      list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
161      list[IL_ALLOCATED] = new_allocated;
162      result = list;
163    }
164  list[IL_LENGTH] = length + 1;
165  for (; length > hi; length--)
166    list[2 + length] = list[1 + length];
167  list[2 + length] = idx;
168  return result;
169}
170
171/* We use 4-grams, therefore strings with less than 4 characters cannot be
172   handled through the 4-grams table and need to be handled specially.
173   Since every character occupies at most 4 bytes (see po-charset.c),
174   this means the size of such short strings is bounded by:  */
175#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
176#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
177
178/* Such short strings are handled by direct comparison with all messages
179   of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
180   fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
181   fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
182   limit the search to lengths l' in the range
183     l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
184   Thus we need the list of the short strings up to length:  */
185#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
186
187/* A fuzzy index contains a hash table mapping all n-grams to their
188   occurrences list.  */
189struct message_fuzzy_index_ty
190{
191  message_ty **messages;
192  character_iterator_t iterator;
193  hash_table gram4;
194  size_t firstfew;
195  message_list_ty *short_messages[SHORT_MSG_MAX + 1];
196};
197
198/* Allocate a fuzzy index corresponding to a given list of messages.
199   The list of messages and the msgctxt and msgid fields of the messages
200   inside it must not be modified while the returned fuzzy index is in use.  */
201message_fuzzy_index_ty *
202message_fuzzy_index_alloc (const message_list_ty *mlp,
203			   const char *canon_charset)
204{
205  message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
206  size_t count = mlp->nitems;
207  size_t j;
208  size_t l;
209
210  findex->messages = mlp->item;
211  findex->iterator = po_charset_character_iterator (canon_charset);
212
213  /* Setup hash table.  */
214  if (hash_init (&findex->gram4, 10 * count) < 0)
215    xalloc_die ();
216  for (j = 0; j < count; j++)
217    {
218      message_ty *mp = mlp->item[j];
219
220      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
221	{
222	  const char *str = mp->msgid;
223
224	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
225	  const char *p0 = str;
226	  if (*p0 != '\0')
227	    {
228	      const char *p1 = p0 + findex->iterator (p0);
229	      if (*p1 != '\0')
230		{
231		  const char *p2 = p1 + findex->iterator (p1);
232		  if (*p2 != '\0')
233		    {
234		      const char *p3 = p2 + findex->iterator (p2);
235		      if (*p3 != '\0')
236			{
237			  const char *p4 = p3 + findex->iterator (p3);
238			  for (;;)
239			    {
240			      /* The segment from p0 to p4 is a 4-gram of
241				 characters.  Add a hash table entry that maps
242				 it to the index j, or extend the existing
243				 hash table entry accordingly.  */
244			      void *found;
245
246			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
247						   &found) == 0)
248				{
249				  index_list_ty list = (index_list_ty) found;
250				  list = addlast_index (list, j);
251				  if (list != NULL)
252				    hash_set_value (&findex->gram4, p0, p4 - p0,
253						    list);
254				}
255			      else
256				hash_insert_entry (&findex->gram4, p0, p4 - p0,
257						   new_index (j));
258
259			      /* Advance.  */
260			      if (*p4 == '\0')
261				break;
262			      p0 = p1;
263			      p1 = p2;
264			      p2 = p3;
265			      p3 = p4;
266			      p4 = p4 + findex->iterator (p4);
267			    }
268			}
269		    }
270		}
271	    }
272	}
273    }
274
275  /* Shrink memory used by the hash table.  */
276  {
277    void *iter;
278    const void *key;
279    size_t keylen;
280    void **valuep;
281
282    iter = NULL;
283    while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
284	   == 0)
285      {
286	index_list_ty list = (index_list_ty) *valuep;
287	index_ty length = list[IL_LENGTH];
288
289	if (length < list[IL_ALLOCATED])
290	  {
291	    list[IL_ALLOCATED] = length;
292	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
293	  }
294      }
295  }
296
297  findex->firstfew = (int) sqrt ((double) count);
298  if (findex->firstfew < 10)
299    findex->firstfew = 10;
300
301  /* Setup lists of short messages.  */
302  for (l = 0; l <= SHORT_MSG_MAX; l++)
303    findex->short_messages[l] = message_list_alloc (false);
304  for (j = 0; j < count; j++)
305    {
306      message_ty *mp = mlp->item[j];
307
308      if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
309	{
310	  const char *str = mp->msgid;
311	  size_t len = strlen (str);
312
313	  if (len <= SHORT_MSG_MAX)
314	    message_list_append (findex->short_messages[len], mp);
315	}
316    }
317
318  /* Shrink memory used by the lists of short messages.  */
319  for (l = 0; l <= SHORT_MSG_MAX; l++)
320    {
321      message_list_ty *mlp = findex->short_messages[l];
322
323      if (mlp->nitems < mlp->nitems_max)
324	{
325	  mlp->nitems_max = mlp->nitems;
326	  mlp->item =
327	    (message_ty **)
328	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
329	}
330    }
331
332  return findex;
333}
334
335/* An index with multiplicity.  */
336struct mult_index
337{
338  index_ty index;
339  unsigned int count;
340};
341
342/* A list of indices with multiplicity.
343   The indices are sorted in ascending order.  */
344struct mult_index_list
345{
346  struct mult_index *item;
347  size_t nitems;
348  size_t nitems_max;
349  /* Work area.  */
350  struct mult_index *item2;
351  size_t nitems2_max;
352};
353
354/* Initialize an empty list of indices with multiplicity.  */
355static inline void
356mult_index_list_init (struct mult_index_list *accu)
357{
358  accu->nitems = 0;
359  accu->nitems_max = 0;
360  accu->item = NULL;
361  accu->nitems2_max = 0;
362  accu->item2 = NULL;
363}
364
365/* Add an index list to a list of indices with multiplicity.  */
366static inline void
367mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
368{
369  size_t len1 = accu->nitems;
370  size_t len2 = list[IL_LENGTH];
371  size_t need = len1 + len2;
372  struct mult_index *ptr1;
373  struct mult_index *ptr1_end;
374  index_ty *ptr2;
375  index_ty *ptr2_end;
376  struct mult_index *destptr;
377
378  /* Make the work area large enough.  */
379  if (accu->nitems2_max < need)
380    {
381      size_t new_max = 2 * accu->nitems2_max + 1;
382
383      if (new_max < need)
384	new_max = need;
385      if (accu->item2 != NULL)
386	free (accu->item2);
387      accu->item2 = XNMALLOC (new_max, struct mult_index);
388      accu->nitems2_max = new_max;
389    }
390
391  /* Make a linear pass through accu and list simultaneously.  */
392  ptr1 = accu->item;
393  ptr1_end = ptr1 + len1;
394  ptr2 = list + 2;
395  ptr2_end = ptr2 + len2;
396  destptr = accu->item2;
397  while (ptr1 < ptr1_end && ptr2 < ptr2_end)
398    {
399      if (ptr1->index < *ptr2)
400	{
401	  *destptr = *ptr1;
402	  ptr1++;
403	}
404      else if (ptr1->index > *ptr2)
405	{
406	  destptr->index = *ptr2;
407	  destptr->count = 1;
408	  ptr2++;
409	}
410      else /* ptr1->index == list[2 + i2] */
411	{
412	  destptr->index = ptr1->index;
413	  destptr->count = ptr1->count + 1;
414	  ptr1++;
415	  ptr2++;
416	}
417      destptr++;
418    }
419  while (ptr1 < ptr1_end)
420    {
421      *destptr = *ptr1;
422      ptr1++;
423      destptr++;
424    }
425  while (ptr2 < ptr2_end)
426    {
427      destptr->index = *ptr2;
428      destptr->count = 1;
429      ptr2++;
430      destptr++;
431    }
432
433  /* Swap accu->item and accu->item2.  */
434  {
435    struct mult_index *dest = accu->item2;
436    size_t dest_max = accu->nitems2_max;
437
438    accu->item2 = accu->item;
439    accu->nitems2_max = accu->nitems_max;
440
441    accu->item = dest;
442    accu->nitems = destptr - dest;
443    accu->nitems_max = dest_max;
444  }
445}
446
447/* Compares two indices with multiplicity, according to their multiplicity.  */
448static int
449mult_index_compare (const void *p1, const void *p2)
450{
451  const struct mult_index *ptr1 = (const struct mult_index *) p1;
452  const struct mult_index *ptr2 = (const struct mult_index *) p2;
453
454  if (ptr1->count < ptr2->count)
455    return 1;
456  if (ptr1->count > ptr2->count)
457    return -1;
458  /* For reproduceable results, also take into account the indices.  */
459  if (ptr1->index > ptr2->index)
460    return 1;
461  if (ptr1->index < ptr2->index)
462    return -1;
463  return 0;
464}
465
466/* Sort a list of indices with multiplicity according to decreasing
467   multiplicity.  */
468static inline void
469mult_index_list_sort (struct mult_index_list *accu)
470{
471  if (accu->nitems > 1)
472    qsort (accu->item, accu->nitems, sizeof (struct mult_index),
473	   mult_index_compare);
474}
475
476/* Frees a list of indices with multiplicity.  */
477static inline void
478mult_index_list_free (struct mult_index_list *accu)
479{
480  if (accu->item != NULL)
481    free (accu->item);
482  if (accu->item2 != NULL)
483    free (accu->item2);
484}
485
486/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
487   The match does not need to be optimal.  */
488message_ty *
489message_fuzzy_index_search (message_fuzzy_index_ty *findex,
490			    const char *msgctxt, const char *msgid)
491{
492  const char *str = msgid;
493
494  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
495  const char *p0 = str;
496  if (*p0 != '\0')
497    {
498      const char *p1 = p0 + findex->iterator (p0);
499      if (*p1 != '\0')
500	{
501	  const char *p2 = p1 + findex->iterator (p1);
502	  if (*p2 != '\0')
503	    {
504	      const char *p3 = p2 + findex->iterator (p2);
505	      if (*p3 != '\0')
506		{
507		  const char *p4 = p3 + findex->iterator (p3);
508		  struct mult_index_list accu;
509
510		  mult_index_list_init (&accu);
511		  for (;;)
512		    {
513		      /* The segment from p0 to p4 is a 4-gram of
514			 characters.  Get the hash table entry containing
515			 a list of indices, and add it to the accu.  */
516		      void *found;
517
518		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
519					   &found) == 0)
520			{
521			  index_list_ty list = (index_list_ty) found;
522			  mult_index_list_accumulate (&accu, list);
523			}
524
525		      /* Advance.  */
526		      if (*p4 == '\0')
527			break;
528		      p0 = p1;
529		      p1 = p2;
530		      p2 = p3;
531		      p3 = p4;
532		      p4 = p4 + findex->iterator (p4);
533		    }
534
535		  /* Sort in decreasing count order.  */
536		  mult_index_list_sort (&accu);
537
538		  /* Take the first few messages from this sorted list, and
539		     maximize the fstrcmp() result.  */
540		  {
541		    size_t count;
542		    struct mult_index *ptr;
543		    message_ty *best_mp;
544		    double best_weight;
545
546		    count = findex->firstfew;
547		    if (count > accu.nitems)
548		      count = accu.nitems;
549
550		    best_weight = FUZZY_THRESHOLD;
551		    best_mp = NULL;
552		    for (ptr = accu.item; count > 0; ptr++, count--)
553		      {
554			message_ty *mp = findex->messages[ptr->index];
555			double weight =
556			  fuzzy_search_goal_function (mp, msgctxt, msgid);
557
558			if (weight > best_weight)
559			  {
560			    best_weight = weight;
561			    best_mp = mp;
562			  }
563		      }
564
565		    mult_index_list_free (&accu);
566
567		    return best_mp;
568		  }
569		}
570	    }
571	}
572    }
573
574  /* The string had less than 4 characters.  */
575  {
576    size_t l = strlen (str);
577    size_t lmin, lmax;
578    message_ty *best_mp;
579    double best_weight;
580
581    if (!(l <= SHORT_STRING_MAX_BYTES))
582      abort ();
583
584    /* Walk through those short messages which have an appropriate length.
585       See the comment before SHORT_MSG_MAX.  */
586    lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
587    lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
588    if (!(lmax <= SHORT_MSG_MAX))
589      abort ();
590
591    best_weight = FUZZY_THRESHOLD;
592    best_mp = NULL;
593    for (l = lmin; l <= lmax; l++)
594      {
595	message_list_ty *mlp = findex->short_messages[l];
596	size_t j;
597
598	for (j = 0; j < mlp->nitems; j++)
599	  {
600	    message_ty *mp = mlp->item[j];
601	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
602
603	    if (weight > best_weight)
604	      {
605		best_weight = weight;
606		best_mp = mp;
607	      }
608	  }
609      }
610
611    return best_mp;
612  }
613}
614
615/* Free a fuzzy index.  */
616void
617message_fuzzy_index_free (message_fuzzy_index_ty *findex)
618{
619  size_t l;
620  void *iter;
621  const void *key;
622  size_t keylen;
623  void *data;
624
625  /* Free the short lists.  */
626  for (l = 0; l <= SHORT_MSG_MAX; l++)
627    message_list_free (findex->short_messages[l], 1);
628
629  /* Free the index lists occurring as values in the hash tables.  */
630  iter = NULL;
631  while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
632    free ((index_list_ty *) data);
633  /* Free the hash table itself.  */
634  hash_destroy (&findex->gram4);
635
636  free (findex);
637}
638