1/* Fast fuzzy searching among messages. 2 Copyright (C) 2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2006. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19#ifdef HAVE_CONFIG_H 20# include <config.h> 21#endif 22 23/* Specification. */ 24#include "msgl-fsearch.h" 25 26#include <math.h> 27#include <stdlib.h> 28 29#include "xalloc.h" 30#include "po-charset.h" 31 32 33/* Fuzzy searching of L strings in a large set of N messages (assuming 34 they have all the same small size) takes O(L * N) when using repeated 35 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow. 36 37 So we preprocess the set of N messages, yielding a data structure 38 that allows to select the similar messages for any given string, and 39 apply fstrcmp() only to the search result. This allows to reduce the 40 time to something between O(L * 1) and O(L * N) - depending on the 41 structure of the strings. In the example with L = 800 and N = 69000, 42 memory use increases by a factor of 2 and the time decreases from 43 9068 s to 230 s. 44 45 The data structure is a hash table mapping each n-gram (here with n=4) 46 occurring in the message strings to the list of messages that contains 47 it. For example, if the message list is 48 [0] "close" 49 [1] "losers" 50 the index is a hash table mapping 51 "clos" -> { 0 } 52 "lose" -> { 0, 1 } 53 "oser" -> { 1 } 54 "sers" -> { 1 } 55 Searching the similar messages to, say, "closest", is done by looking up 56 all its 4-grams in the hash table and summing up the results: 57 "clos" -> { 0 } 58 "lose" -> { 0, 1 } 59 "oses" -> {} 60 "sest" -> {} 61 => total: { 2x0, 1x1 } 62 The list is sorted according to decreasing frequency: { 0, 1, ... } 63 and only the first few messages from this frequency list are passed to 64 fstrcmp(). 65 66 The n-gram similarity and the fstrcmp() similarity are quite different 67 metrics. For example, "close" and "c l o s e" have no n-grams in common 68 (even for n as small as n = 2); however, fstrcmp() will find that they 69 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and 70 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is 71 only 0.02. Also, repeated n-grams don't have the same effect on the 72 two similarity measures. But all this doesn't matter much in practice. 73 74 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence 75 lists are likely too long. (Well, with ideographic languages like Chinese, 76 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case 77 either.) 78 79 The units are characters in the current encoding. Not just bytes, 80 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */ 81 82/* Each message is represented by its index in the message list. */ 83typedef unsigned int index_ty; 84 85/* An index list has its allocated size and length tacked at the front. 86 The indices are sorted in ascending order. */ 87typedef index_ty *index_list_ty; 88#define IL_ALLOCATED 0 89#define IL_LENGTH 1 90 91/* Create a new index list containing only a given index. */ 92static inline index_list_ty 93new_index (index_ty idx) 94{ 95 index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty)); 96 list[IL_ALLOCATED] = 1; 97 list[IL_LENGTH] = 1; 98 list[2] = idx; 99 100 return list; 101} 102 103/* Add a given index, greater or equal than all previous indices, to an 104 index list. 105 Return the new index list, if it had to be reallocated, or NULL if it 106 didn't change. */ 107static inline index_list_ty 108addlast_index (index_list_ty list, index_ty idx) 109{ 110 index_list_ty result; 111 size_t length = list[IL_LENGTH]; 112 113 /* Look whether it should be inserted. */ 114 if (length > 0 && list[2 + (length - 1)] == idx) 115 return NULL; 116 117 /* Now make room for one more list element. */ 118 result = NULL; 119 if (length == list[IL_ALLOCATED]) 120 { 121 size_t new_allocated = 2 * length - (length >> 6); 122 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); 123 list[IL_ALLOCATED] = new_allocated; 124 result = list; 125 } 126 list[2 + length] = idx; 127 list[IL_LENGTH] = length + 1; 128 return result; 129} 130 131/* Add a given index to an index list. 132 Return the new index list, if it had to be reallocated, or NULL if it 133 didn't change. */ 134static inline index_list_ty 135add_index (index_list_ty list, index_ty idx) 136{ 137 index_list_ty result; 138 size_t length = list[IL_LENGTH]; 139 140 /* Look where it should be inserted. */ 141 size_t lo = 0; 142 size_t hi = length; 143 while (lo < hi) 144 { 145 /* Here we know that idx must be inserted at a position >= lo, <= hi. */ 146 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */ 147 index_ty val = list[2 + mid]; 148 if (val < idx) 149 lo = mid + 1; 150 else if (val > idx) 151 hi = mid; 152 else 153 return NULL; 154 } 155 156 /* Now make room for one more list element. */ 157 result = NULL; 158 if (length == list[IL_ALLOCATED]) 159 { 160 size_t new_allocated = 2 * length - (length >> 6); 161 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); 162 list[IL_ALLOCATED] = new_allocated; 163 result = list; 164 } 165 list[IL_LENGTH] = length + 1; 166 for (; length > hi; length--) 167 list[2 + length] = list[1 + length]; 168 list[2 + length] = idx; 169 return result; 170} 171 172/* We use 4-grams, therefore strings with less than 4 characters cannot be 173 handled through the 4-grams table and need to be handled specially. 174 Since every character occupies at most 4 bytes (see po-charset.c), 175 this means the size of such short strings is bounded by: */ 176#define SHORT_STRING_MAX_CHARACTERS (4 - 1) 177#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4) 178 179/* Such short strings are handled by direct comparison with all messages 180 of appropriate size. Note that for two strings of length 0 <= l1 <= l2, 181 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in 182 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l 183 limit the search to lengths l' in the range 184 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1) 185 Thus we need the list of the short strings up to length: */ 186#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1)) 187 188/* A fuzzy index contains a hash table mapping all n-grams to their 189 occurrences list. */ 190struct message_fuzzy_index_ty 191{ 192 message_ty **messages; 193 character_iterator_t iterator; 194 hash_table gram4; 195 size_t firstfew; 196 message_list_ty *short_messages[SHORT_MSG_MAX + 1]; 197}; 198 199/* Allocate a fuzzy index corresponding to a given list of messages. 200 The list of messages and the msgctxt and msgid fields of the messages 201 inside it must not be modified while the returned fuzzy index is in use. */ 202message_fuzzy_index_ty * 203message_fuzzy_index_alloc (const message_list_ty *mlp, 204 const char *canon_charset) 205{ 206 message_fuzzy_index_ty *findex = 207 (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty)); 208 size_t count = mlp->nitems; 209 size_t j; 210 size_t l; 211 212 findex->messages = mlp->item; 213 findex->iterator = po_charset_character_iterator (canon_charset); 214 215 /* Setup hash table. */ 216 if (hash_init (&findex->gram4, 10 * count) < 0) 217 xalloc_die (); 218 for (j = 0; j < count; j++) 219 { 220 message_ty *mp = mlp->item[j]; 221 222 if (mp->msgstr != NULL && mp->msgstr[0] != '\0') 223 { 224 const char *str = mp->msgid; 225 226 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ 227 const char *p0 = str; 228 if (*p0 != '\0') 229 { 230 const char *p1 = p0 + findex->iterator (p0); 231 if (*p1 != '\0') 232 { 233 const char *p2 = p1 + findex->iterator (p1); 234 if (*p2 != '\0') 235 { 236 const char *p3 = p2 + findex->iterator (p2); 237 if (*p3 != '\0') 238 { 239 const char *p4 = p3 + findex->iterator (p3); 240 for (;;) 241 { 242 /* The segment from p0 to p4 is a 4-gram of 243 characters. Add a hash table entry that maps 244 it to the index j, or extend the existing 245 hash table entry accordingly. */ 246 void *found; 247 248 if (hash_find_entry (&findex->gram4, p0, p4 - p0, 249 &found) == 0) 250 { 251 index_list_ty list = (index_list_ty) found; 252 list = addlast_index (list, j); 253 if (list != NULL) 254 hash_set_value (&findex->gram4, p0, p4 - p0, 255 list); 256 } 257 else 258 hash_insert_entry (&findex->gram4, p0, p4 - p0, 259 new_index (j)); 260 261 /* Advance. */ 262 if (*p4 == '\0') 263 break; 264 p0 = p1; 265 p1 = p2; 266 p2 = p3; 267 p3 = p4; 268 p4 = p4 + findex->iterator (p4); 269 } 270 } 271 } 272 } 273 } 274 } 275 } 276 277 /* Shrink memory used by the hash table. */ 278 { 279 void *iter; 280 const void *key; 281 size_t keylen; 282 void **valuep; 283 284 iter = NULL; 285 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep) 286 == 0) 287 { 288 index_list_ty list = (index_list_ty) *valuep; 289 index_ty length = list[IL_LENGTH]; 290 291 if (length < list[IL_ALLOCATED]) 292 { 293 list[IL_ALLOCATED] = length; 294 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty)); 295 } 296 } 297 } 298 299 findex->firstfew = (int) sqrt ((double) count); 300 if (findex->firstfew < 10) 301 findex->firstfew = 10; 302 303 /* Setup lists of short messages. */ 304 for (l = 0; l <= SHORT_MSG_MAX; l++) 305 findex->short_messages[l] = message_list_alloc (false); 306 for (j = 0; j < count; j++) 307 { 308 message_ty *mp = mlp->item[j]; 309 310 if (mp->msgstr != NULL && mp->msgstr[0] != '\0') 311 { 312 const char *str = mp->msgid; 313 size_t len = strlen (str); 314 315 if (len <= SHORT_MSG_MAX) 316 message_list_append (findex->short_messages[len], mp); 317 } 318 } 319 320 /* Shrink memory used by the lists of short messages. */ 321 for (l = 0; l <= SHORT_MSG_MAX; l++) 322 { 323 message_list_ty *mlp = findex->short_messages[l]; 324 325 if (mlp->nitems < mlp->nitems_max) 326 { 327 mlp->nitems_max = mlp->nitems; 328 mlp->item = 329 (message_ty **) 330 xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *)); 331 } 332 } 333 334 return findex; 335} 336 337/* An index with multiplicity. */ 338struct mult_index 339{ 340 index_ty index; 341 unsigned int count; 342}; 343 344/* A list of indices with multiplicity. 345 The indices are sorted in ascending order. */ 346struct mult_index_list 347{ 348 struct mult_index *item; 349 size_t nitems; 350 size_t nitems_max; 351 /* Work area. */ 352 struct mult_index *item2; 353 size_t nitems2_max; 354}; 355 356/* Initialize an empty list of indices with multiplicity. */ 357static inline void 358mult_index_list_init (struct mult_index_list *accu) 359{ 360 accu->nitems = 0; 361 accu->nitems_max = 0; 362 accu->item = NULL; 363 accu->nitems2_max = 0; 364 accu->item2 = NULL; 365} 366 367/* Add an index list to a list of indices with multiplicity. */ 368static inline void 369mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list) 370{ 371 size_t len1 = accu->nitems; 372 size_t len2 = list[IL_LENGTH]; 373 size_t need = len1 + len2; 374 struct mult_index *ptr1; 375 struct mult_index *ptr1_end; 376 index_ty *ptr2; 377 index_ty *ptr2_end; 378 struct mult_index *destptr; 379 380 /* Make the work area large enough. */ 381 if (accu->nitems2_max < need) 382 { 383 size_t new_max = 2 * accu->nitems2_max + 1; 384 385 if (new_max < need) 386 new_max = need; 387 if (accu->item2 != NULL) 388 free (accu->item2); 389 accu->item2 = 390 (struct mult_index *) xmalloc (new_max * sizeof (struct mult_index)); 391 accu->nitems2_max = new_max; 392 } 393 394 /* Make a linear pass through accu and list simultaneously. */ 395 ptr1 = accu->item; 396 ptr1_end = ptr1 + len1; 397 ptr2 = list + 2; 398 ptr2_end = ptr2 + len2; 399 destptr = accu->item2; 400 while (ptr1 < ptr1_end && ptr2 < ptr2_end) 401 { 402 if (ptr1->index < *ptr2) 403 { 404 *destptr = *ptr1; 405 ptr1++; 406 } 407 else if (ptr1->index > *ptr2) 408 { 409 destptr->index = *ptr2; 410 destptr->count = 1; 411 ptr2++; 412 } 413 else /* ptr1->index == list[2 + i2] */ 414 { 415 destptr->index = ptr1->index; 416 destptr->count = ptr1->count + 1; 417 ptr1++; 418 ptr2++; 419 } 420 destptr++; 421 } 422 while (ptr1 < ptr1_end) 423 { 424 *destptr = *ptr1; 425 ptr1++; 426 destptr++; 427 } 428 while (ptr2 < ptr2_end) 429 { 430 destptr->index = *ptr2; 431 destptr->count = 1; 432 ptr2++; 433 destptr++; 434 } 435 436 /* Swap accu->item and accu->item2. */ 437 { 438 struct mult_index *dest = accu->item2; 439 size_t dest_max = accu->nitems2_max; 440 441 accu->item2 = accu->item; 442 accu->nitems2_max = accu->nitems_max; 443 444 accu->item = dest; 445 accu->nitems = destptr - dest; 446 accu->nitems_max = dest_max; 447 } 448} 449 450/* Compares two indices with multiplicity, according to their multiplicity. */ 451static int 452mult_index_compare (const void *p1, const void *p2) 453{ 454 const struct mult_index *ptr1 = (const struct mult_index *) p1; 455 const struct mult_index *ptr2 = (const struct mult_index *) p2; 456 457 if (ptr1->count < ptr2->count) 458 return 1; 459 if (ptr1->count > ptr2->count) 460 return -1; 461 /* For reproduceable results, also take into account the indices. */ 462 if (ptr1->index > ptr2->index) 463 return 1; 464 if (ptr1->index < ptr2->index) 465 return -1; 466 return 0; 467} 468 469/* Sort a list of indices with multiplicity according to decreasing 470 multiplicity. */ 471static inline void 472mult_index_list_sort (struct mult_index_list *accu) 473{ 474 if (accu->nitems > 1) 475 qsort (accu->item, accu->nitems, sizeof (struct mult_index), 476 mult_index_compare); 477} 478 479/* Frees a list of indices with multiplicity. */ 480static inline void 481mult_index_list_free (struct mult_index_list *accu) 482{ 483 if (accu->item != NULL) 484 free (accu->item); 485 if (accu->item2 != NULL) 486 free (accu->item2); 487} 488 489/* Find a good match for the given msgctxt and msgid in the given fuzzy index. 490 The match does not need to be optimal. */ 491message_ty * 492message_fuzzy_index_search (message_fuzzy_index_ty *findex, 493 const char *msgctxt, const char *msgid) 494{ 495 const char *str = msgid; 496 497 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ 498 const char *p0 = str; 499 if (*p0 != '\0') 500 { 501 const char *p1 = p0 + findex->iterator (p0); 502 if (*p1 != '\0') 503 { 504 const char *p2 = p1 + findex->iterator (p1); 505 if (*p2 != '\0') 506 { 507 const char *p3 = p2 + findex->iterator (p2); 508 if (*p3 != '\0') 509 { 510 const char *p4 = p3 + findex->iterator (p3); 511 struct mult_index_list accu; 512 513 mult_index_list_init (&accu); 514 for (;;) 515 { 516 /* The segment from p0 to p4 is a 4-gram of 517 characters. Get the hash table entry containing 518 a list of indices, and add it to the accu. */ 519 void *found; 520 521 if (hash_find_entry (&findex->gram4, p0, p4 - p0, 522 &found) == 0) 523 { 524 index_list_ty list = (index_list_ty) found; 525 mult_index_list_accumulate (&accu, list); 526 } 527 528 /* Advance. */ 529 if (*p4 == '\0') 530 break; 531 p0 = p1; 532 p1 = p2; 533 p2 = p3; 534 p3 = p4; 535 p4 = p4 + findex->iterator (p4); 536 } 537 538 /* Sort in decreasing count order. */ 539 mult_index_list_sort (&accu); 540 541 /* Take the first few messages from this sorted list, and 542 maximize the fstrcmp() result. */ 543 { 544 size_t count; 545 struct mult_index *ptr; 546 message_ty *best_mp; 547 double best_weight; 548 549 count = findex->firstfew; 550 if (count > accu.nitems) 551 count = accu.nitems; 552 553 best_weight = FUZZY_THRESHOLD; 554 best_mp = NULL; 555 for (ptr = accu.item; count > 0; ptr++, count--) 556 { 557 message_ty *mp = findex->messages[ptr->index]; 558 double weight = 559 fuzzy_search_goal_function (mp, msgctxt, msgid); 560 561 if (weight > best_weight) 562 { 563 best_weight = weight; 564 best_mp = mp; 565 } 566 } 567 568 mult_index_list_free (&accu); 569 570 return best_mp; 571 } 572 } 573 } 574 } 575 } 576 577 /* The string had less than 4 characters. */ 578 { 579 size_t l = strlen (str); 580 size_t lmin, lmax; 581 message_ty *best_mp; 582 double best_weight; 583 584 if (!(l <= SHORT_STRING_MAX_BYTES)) 585 abort (); 586 587 /* Walk through those short messages which have an appropriate length. 588 See the comment before SHORT_MSG_MAX. */ 589 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1)); 590 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1)); 591 if (!(lmax <= SHORT_MSG_MAX)) 592 abort (); 593 594 best_weight = FUZZY_THRESHOLD; 595 best_mp = NULL; 596 for (l = lmin; l <= lmax; l++) 597 { 598 message_list_ty *mlp = findex->short_messages[l]; 599 size_t j; 600 601 for (j = 0; j < mlp->nitems; j++) 602 { 603 message_ty *mp = mlp->item[j]; 604 double weight = fuzzy_search_goal_function (mp, msgctxt, msgid); 605 606 if (weight > best_weight) 607 { 608 best_weight = weight; 609 best_mp = mp; 610 } 611 } 612 } 613 614 return best_mp; 615 } 616} 617 618/* Free a fuzzy index. */ 619void 620message_fuzzy_index_free (message_fuzzy_index_ty *findex) 621{ 622 size_t l; 623 void *iter; 624 const void *key; 625 size_t keylen; 626 void *data; 627 628 /* Free the short lists. */ 629 for (l = 0; l <= SHORT_MSG_MAX; l++) 630 message_list_free (findex->short_messages[l], 1); 631 632 /* Free the index lists occurring as values in the hash tables. */ 633 iter = NULL; 634 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0) 635 free ((index_list_ty *) data); 636 /* Free the hash table itself. */ 637 hash_destroy (&findex->gram4); 638 639 free (findex); 640} 641