1/* Fast fuzzy searching among messages. 2 Copyright (C) 2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2006. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#ifdef HAVE_CONFIG_H 19# include <config.h> 20#endif 21 22/* Specification. */ 23#include "msgl-fsearch.h" 24 25#include <math.h> 26#include <stdlib.h> 27 28#include "xalloc.h" 29#include "po-charset.h" 30 31 32/* Fuzzy searching of L strings in a large set of N messages (assuming 33 they have all the same small size) takes O(L * N) when using repeated 34 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow. 35 36 So we preprocess the set of N messages, yielding a data structure 37 that allows to select the similar messages for any given string, and 38 apply fstrcmp() only to the search result. This allows to reduce the 39 time to something between O(L * 1) and O(L * N) - depending on the 40 structure of the strings. In the example with L = 800 and N = 69000, 41 memory use increases by a factor of 2 and the time decreases from 42 9068 s to 230 s. 43 44 The data structure is a hash table mapping each n-gram (here with n=4) 45 occurring in the message strings to the list of messages that contains 46 it. For example, if the message list is 47 [0] "close" 48 [1] "losers" 49 the index is a hash table mapping 50 "clos" -> { 0 } 51 "lose" -> { 0, 1 } 52 "oser" -> { 1 } 53 "sers" -> { 1 } 54 Searching the similar messages to, say, "closest", is done by looking up 55 all its 4-grams in the hash table and summing up the results: 56 "clos" -> { 0 } 57 "lose" -> { 0, 1 } 58 "oses" -> {} 59 "sest" -> {} 60 => total: { 2x0, 1x1 } 61 The list is sorted according to decreasing frequency: { 0, 1, ... } 62 and only the first few messages from this frequency list are passed to 63 fstrcmp(). 64 65 The n-gram similarity and the fstrcmp() similarity are quite different 66 metrics. For example, "close" and "c l o s e" have no n-grams in common 67 (even for n as small as n = 2); however, fstrcmp() will find that they 68 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and 69 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is 70 only 0.02. Also, repeated n-grams don't have the same effect on the 71 two similarity measures. But all this doesn't matter much in practice. 72 73 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence 74 lists are likely too long. (Well, with ideographic languages like Chinese, 75 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case 76 either.) 77 78 The units are characters in the current encoding. Not just bytes, 79 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */ 80 81/* Each message is represented by its index in the message list. */ 82typedef unsigned int index_ty; 83 84/* An index list has its allocated size and length tacked at the front. 85 The indices are sorted in ascending order. */ 86typedef index_ty *index_list_ty; 87#define IL_ALLOCATED 0 88#define IL_LENGTH 1 89 90/* Create a new index list containing only a given index. */ 91static inline index_list_ty 92new_index (index_ty idx) 93{ 94 index_ty *list = XNMALLOC (2 + 1, index_ty); 95 list[IL_ALLOCATED] = 1; 96 list[IL_LENGTH] = 1; 97 list[2] = idx; 98 99 return list; 100} 101 102/* Add a given index, greater or equal than all previous indices, to an 103 index list. 104 Return the new index list, if it had to be reallocated, or NULL if it 105 didn't change. */ 106static inline index_list_ty 107addlast_index (index_list_ty list, index_ty idx) 108{ 109 index_list_ty result; 110 size_t length = list[IL_LENGTH]; 111 112 /* Look whether it should be inserted. */ 113 if (length > 0 && list[2 + (length - 1)] == idx) 114 return NULL; 115 116 /* Now make room for one more list element. */ 117 result = NULL; 118 if (length == list[IL_ALLOCATED]) 119 { 120 size_t new_allocated = 2 * length - (length >> 6); 121 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); 122 list[IL_ALLOCATED] = new_allocated; 123 result = list; 124 } 125 list[2 + length] = idx; 126 list[IL_LENGTH] = length + 1; 127 return result; 128} 129 130/* Add a given index to an index list. 131 Return the new index list, if it had to be reallocated, or NULL if it 132 didn't change. */ 133static inline index_list_ty 134add_index (index_list_ty list, index_ty idx) 135{ 136 index_list_ty result; 137 size_t length = list[IL_LENGTH]; 138 139 /* Look where it should be inserted. */ 140 size_t lo = 0; 141 size_t hi = length; 142 while (lo < hi) 143 { 144 /* Here we know that idx must be inserted at a position >= lo, <= hi. */ 145 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */ 146 index_ty val = list[2 + mid]; 147 if (val < idx) 148 lo = mid + 1; 149 else if (val > idx) 150 hi = mid; 151 else 152 return NULL; 153 } 154 155 /* Now make room for one more list element. */ 156 result = NULL; 157 if (length == list[IL_ALLOCATED]) 158 { 159 size_t new_allocated = 2 * length - (length >> 6); 160 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); 161 list[IL_ALLOCATED] = new_allocated; 162 result = list; 163 } 164 list[IL_LENGTH] = length + 1; 165 for (; length > hi; length--) 166 list[2 + length] = list[1 + length]; 167 list[2 + length] = idx; 168 return result; 169} 170 171/* We use 4-grams, therefore strings with less than 4 characters cannot be 172 handled through the 4-grams table and need to be handled specially. 173 Since every character occupies at most 4 bytes (see po-charset.c), 174 this means the size of such short strings is bounded by: */ 175#define SHORT_STRING_MAX_CHARACTERS (4 - 1) 176#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4) 177 178/* Such short strings are handled by direct comparison with all messages 179 of appropriate size. Note that for two strings of length 0 <= l1 <= l2, 180 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in 181 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l 182 limit the search to lengths l' in the range 183 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1) 184 Thus we need the list of the short strings up to length: */ 185#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1)) 186 187/* A fuzzy index contains a hash table mapping all n-grams to their 188 occurrences list. */ 189struct message_fuzzy_index_ty 190{ 191 message_ty **messages; 192 character_iterator_t iterator; 193 hash_table gram4; 194 size_t firstfew; 195 message_list_ty *short_messages[SHORT_MSG_MAX + 1]; 196}; 197 198/* Allocate a fuzzy index corresponding to a given list of messages. 199 The list of messages and the msgctxt and msgid fields of the messages 200 inside it must not be modified while the returned fuzzy index is in use. */ 201message_fuzzy_index_ty * 202message_fuzzy_index_alloc (const message_list_ty *mlp, 203 const char *canon_charset) 204{ 205 message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty); 206 size_t count = mlp->nitems; 207 size_t j; 208 size_t l; 209 210 findex->messages = mlp->item; 211 findex->iterator = po_charset_character_iterator (canon_charset); 212 213 /* Setup hash table. */ 214 if (hash_init (&findex->gram4, 10 * count) < 0) 215 xalloc_die (); 216 for (j = 0; j < count; j++) 217 { 218 message_ty *mp = mlp->item[j]; 219 220 if (mp->msgstr != NULL && mp->msgstr[0] != '\0') 221 { 222 const char *str = mp->msgid; 223 224 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ 225 const char *p0 = str; 226 if (*p0 != '\0') 227 { 228 const char *p1 = p0 + findex->iterator (p0); 229 if (*p1 != '\0') 230 { 231 const char *p2 = p1 + findex->iterator (p1); 232 if (*p2 != '\0') 233 { 234 const char *p3 = p2 + findex->iterator (p2); 235 if (*p3 != '\0') 236 { 237 const char *p4 = p3 + findex->iterator (p3); 238 for (;;) 239 { 240 /* The segment from p0 to p4 is a 4-gram of 241 characters. Add a hash table entry that maps 242 it to the index j, or extend the existing 243 hash table entry accordingly. */ 244 void *found; 245 246 if (hash_find_entry (&findex->gram4, p0, p4 - p0, 247 &found) == 0) 248 { 249 index_list_ty list = (index_list_ty) found; 250 list = addlast_index (list, j); 251 if (list != NULL) 252 hash_set_value (&findex->gram4, p0, p4 - p0, 253 list); 254 } 255 else 256 hash_insert_entry (&findex->gram4, p0, p4 - p0, 257 new_index (j)); 258 259 /* Advance. */ 260 if (*p4 == '\0') 261 break; 262 p0 = p1; 263 p1 = p2; 264 p2 = p3; 265 p3 = p4; 266 p4 = p4 + findex->iterator (p4); 267 } 268 } 269 } 270 } 271 } 272 } 273 } 274 275 /* Shrink memory used by the hash table. */ 276 { 277 void *iter; 278 const void *key; 279 size_t keylen; 280 void **valuep; 281 282 iter = NULL; 283 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep) 284 == 0) 285 { 286 index_list_ty list = (index_list_ty) *valuep; 287 index_ty length = list[IL_LENGTH]; 288 289 if (length < list[IL_ALLOCATED]) 290 { 291 list[IL_ALLOCATED] = length; 292 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty)); 293 } 294 } 295 } 296 297 findex->firstfew = (int) sqrt ((double) count); 298 if (findex->firstfew < 10) 299 findex->firstfew = 10; 300 301 /* Setup lists of short messages. */ 302 for (l = 0; l <= SHORT_MSG_MAX; l++) 303 findex->short_messages[l] = message_list_alloc (false); 304 for (j = 0; j < count; j++) 305 { 306 message_ty *mp = mlp->item[j]; 307 308 if (mp->msgstr != NULL && mp->msgstr[0] != '\0') 309 { 310 const char *str = mp->msgid; 311 size_t len = strlen (str); 312 313 if (len <= SHORT_MSG_MAX) 314 message_list_append (findex->short_messages[len], mp); 315 } 316 } 317 318 /* Shrink memory used by the lists of short messages. */ 319 for (l = 0; l <= SHORT_MSG_MAX; l++) 320 { 321 message_list_ty *mlp = findex->short_messages[l]; 322 323 if (mlp->nitems < mlp->nitems_max) 324 { 325 mlp->nitems_max = mlp->nitems; 326 mlp->item = 327 (message_ty **) 328 xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *)); 329 } 330 } 331 332 return findex; 333} 334 335/* An index with multiplicity. */ 336struct mult_index 337{ 338 index_ty index; 339 unsigned int count; 340}; 341 342/* A list of indices with multiplicity. 343 The indices are sorted in ascending order. */ 344struct mult_index_list 345{ 346 struct mult_index *item; 347 size_t nitems; 348 size_t nitems_max; 349 /* Work area. */ 350 struct mult_index *item2; 351 size_t nitems2_max; 352}; 353 354/* Initialize an empty list of indices with multiplicity. */ 355static inline void 356mult_index_list_init (struct mult_index_list *accu) 357{ 358 accu->nitems = 0; 359 accu->nitems_max = 0; 360 accu->item = NULL; 361 accu->nitems2_max = 0; 362 accu->item2 = NULL; 363} 364 365/* Add an index list to a list of indices with multiplicity. */ 366static inline void 367mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list) 368{ 369 size_t len1 = accu->nitems; 370 size_t len2 = list[IL_LENGTH]; 371 size_t need = len1 + len2; 372 struct mult_index *ptr1; 373 struct mult_index *ptr1_end; 374 index_ty *ptr2; 375 index_ty *ptr2_end; 376 struct mult_index *destptr; 377 378 /* Make the work area large enough. */ 379 if (accu->nitems2_max < need) 380 { 381 size_t new_max = 2 * accu->nitems2_max + 1; 382 383 if (new_max < need) 384 new_max = need; 385 if (accu->item2 != NULL) 386 free (accu->item2); 387 accu->item2 = XNMALLOC (new_max, struct mult_index); 388 accu->nitems2_max = new_max; 389 } 390 391 /* Make a linear pass through accu and list simultaneously. */ 392 ptr1 = accu->item; 393 ptr1_end = ptr1 + len1; 394 ptr2 = list + 2; 395 ptr2_end = ptr2 + len2; 396 destptr = accu->item2; 397 while (ptr1 < ptr1_end && ptr2 < ptr2_end) 398 { 399 if (ptr1->index < *ptr2) 400 { 401 *destptr = *ptr1; 402 ptr1++; 403 } 404 else if (ptr1->index > *ptr2) 405 { 406 destptr->index = *ptr2; 407 destptr->count = 1; 408 ptr2++; 409 } 410 else /* ptr1->index == list[2 + i2] */ 411 { 412 destptr->index = ptr1->index; 413 destptr->count = ptr1->count + 1; 414 ptr1++; 415 ptr2++; 416 } 417 destptr++; 418 } 419 while (ptr1 < ptr1_end) 420 { 421 *destptr = *ptr1; 422 ptr1++; 423 destptr++; 424 } 425 while (ptr2 < ptr2_end) 426 { 427 destptr->index = *ptr2; 428 destptr->count = 1; 429 ptr2++; 430 destptr++; 431 } 432 433 /* Swap accu->item and accu->item2. */ 434 { 435 struct mult_index *dest = accu->item2; 436 size_t dest_max = accu->nitems2_max; 437 438 accu->item2 = accu->item; 439 accu->nitems2_max = accu->nitems_max; 440 441 accu->item = dest; 442 accu->nitems = destptr - dest; 443 accu->nitems_max = dest_max; 444 } 445} 446 447/* Compares two indices with multiplicity, according to their multiplicity. */ 448static int 449mult_index_compare (const void *p1, const void *p2) 450{ 451 const struct mult_index *ptr1 = (const struct mult_index *) p1; 452 const struct mult_index *ptr2 = (const struct mult_index *) p2; 453 454 if (ptr1->count < ptr2->count) 455 return 1; 456 if (ptr1->count > ptr2->count) 457 return -1; 458 /* For reproduceable results, also take into account the indices. */ 459 if (ptr1->index > ptr2->index) 460 return 1; 461 if (ptr1->index < ptr2->index) 462 return -1; 463 return 0; 464} 465 466/* Sort a list of indices with multiplicity according to decreasing 467 multiplicity. */ 468static inline void 469mult_index_list_sort (struct mult_index_list *accu) 470{ 471 if (accu->nitems > 1) 472 qsort (accu->item, accu->nitems, sizeof (struct mult_index), 473 mult_index_compare); 474} 475 476/* Frees a list of indices with multiplicity. */ 477static inline void 478mult_index_list_free (struct mult_index_list *accu) 479{ 480 if (accu->item != NULL) 481 free (accu->item); 482 if (accu->item2 != NULL) 483 free (accu->item2); 484} 485 486/* Find a good match for the given msgctxt and msgid in the given fuzzy index. 487 The match does not need to be optimal. */ 488message_ty * 489message_fuzzy_index_search (message_fuzzy_index_ty *findex, 490 const char *msgctxt, const char *msgid) 491{ 492 const char *str = msgid; 493 494 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ 495 const char *p0 = str; 496 if (*p0 != '\0') 497 { 498 const char *p1 = p0 + findex->iterator (p0); 499 if (*p1 != '\0') 500 { 501 const char *p2 = p1 + findex->iterator (p1); 502 if (*p2 != '\0') 503 { 504 const char *p3 = p2 + findex->iterator (p2); 505 if (*p3 != '\0') 506 { 507 const char *p4 = p3 + findex->iterator (p3); 508 struct mult_index_list accu; 509 510 mult_index_list_init (&accu); 511 for (;;) 512 { 513 /* The segment from p0 to p4 is a 4-gram of 514 characters. Get the hash table entry containing 515 a list of indices, and add it to the accu. */ 516 void *found; 517 518 if (hash_find_entry (&findex->gram4, p0, p4 - p0, 519 &found) == 0) 520 { 521 index_list_ty list = (index_list_ty) found; 522 mult_index_list_accumulate (&accu, list); 523 } 524 525 /* Advance. */ 526 if (*p4 == '\0') 527 break; 528 p0 = p1; 529 p1 = p2; 530 p2 = p3; 531 p3 = p4; 532 p4 = p4 + findex->iterator (p4); 533 } 534 535 /* Sort in decreasing count order. */ 536 mult_index_list_sort (&accu); 537 538 /* Take the first few messages from this sorted list, and 539 maximize the fstrcmp() result. */ 540 { 541 size_t count; 542 struct mult_index *ptr; 543 message_ty *best_mp; 544 double best_weight; 545 546 count = findex->firstfew; 547 if (count > accu.nitems) 548 count = accu.nitems; 549 550 best_weight = FUZZY_THRESHOLD; 551 best_mp = NULL; 552 for (ptr = accu.item; count > 0; ptr++, count--) 553 { 554 message_ty *mp = findex->messages[ptr->index]; 555 double weight = 556 fuzzy_search_goal_function (mp, msgctxt, msgid); 557 558 if (weight > best_weight) 559 { 560 best_weight = weight; 561 best_mp = mp; 562 } 563 } 564 565 mult_index_list_free (&accu); 566 567 return best_mp; 568 } 569 } 570 } 571 } 572 } 573 574 /* The string had less than 4 characters. */ 575 { 576 size_t l = strlen (str); 577 size_t lmin, lmax; 578 message_ty *best_mp; 579 double best_weight; 580 581 if (!(l <= SHORT_STRING_MAX_BYTES)) 582 abort (); 583 584 /* Walk through those short messages which have an appropriate length. 585 See the comment before SHORT_MSG_MAX. */ 586 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1)); 587 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1)); 588 if (!(lmax <= SHORT_MSG_MAX)) 589 abort (); 590 591 best_weight = FUZZY_THRESHOLD; 592 best_mp = NULL; 593 for (l = lmin; l <= lmax; l++) 594 { 595 message_list_ty *mlp = findex->short_messages[l]; 596 size_t j; 597 598 for (j = 0; j < mlp->nitems; j++) 599 { 600 message_ty *mp = mlp->item[j]; 601 double weight = fuzzy_search_goal_function (mp, msgctxt, msgid); 602 603 if (weight > best_weight) 604 { 605 best_weight = weight; 606 best_mp = mp; 607 } 608 } 609 } 610 611 return best_mp; 612 } 613} 614 615/* Free a fuzzy index. */ 616void 617message_fuzzy_index_free (message_fuzzy_index_ty *findex) 618{ 619 size_t l; 620 void *iter; 621 const void *key; 622 size_t keylen; 623 void *data; 624 625 /* Free the short lists. */ 626 for (l = 0; l <= SHORT_MSG_MAX; l++) 627 message_list_free (findex->short_messages[l], 1); 628 629 /* Free the index lists occurring as values in the hash tables. */ 630 iter = NULL; 631 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0) 632 free ((index_list_ty *) data); 633 /* Free the hash table itself. */ 634 hash_destroy (&findex->gram4); 635 636 free (findex); 637} 638