1/* Functions to make fuzzy comparisons between strings 2 Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2 of the License, or (at 7 your option) any later version. 8 9 This program is distributed in the hope that it will be useful, but 10 WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 18 19 Derived from GNU diff 2.7, analyze.c et al. 20 21 The basic idea is to consider two strings as similar if, when 22 transforming the first string into the second string through a 23 sequence of edits (inserts and deletes of one character each), 24 this sequence is short - or equivalently, if the ordered list 25 of characters that are untouched by these edits is long. For a 26 good introduction to the subject, read about the "Levenshtein 27 distance" in Wikipedia. 28 29 The basic algorithm is described in: 30 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 31 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 32 see especially section 4.2, which describes the variation used below. 33 34 The basic algorithm was independently discovered as described in: 35 "Algorithms for Approximate String Matching", E. Ukkonen, 36 Information and Control Vol. 64, 1985, pp. 100-118. 37 38 Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE 39 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 40 at the price of producing suboptimal output for large inputs with 41 many differences. 42 43 Modified to work on strings rather than files 44 by Peter Miller <pmiller@agso.gov.au>, October 1995 */ 45 46#include <config.h> 47 48/* Specification. */ 49#include "fstrcmp.h" 50 51#include <string.h> 52#include <stdio.h> 53#include <stdlib.h> 54#include <limits.h> 55 56#include "lock.h" 57#include "tls.h" 58#include "xalloc.h" 59 60#ifndef uintptr_t 61# define uintptr_t unsigned long 62#endif 63 64 65/* 66 * Context of comparison operation. 67 */ 68struct context 69{ 70 /* 71 * Data on one input string being compared. 72 */ 73 struct string_data 74 { 75 /* The string to be compared. */ 76 const char *data; 77 78 /* The length of the string to be compared. */ 79 int data_length; 80 81 /* The number of characters inserted or deleted. */ 82 int edit_count; 83 } 84 string[2]; 85 86 #ifdef MINUS_H_FLAG 87 88 /* This corresponds to the diff -H flag. With this heuristic, for 89 strings with a constant small density of changes, the algorithm is 90 linear in the strings size. This is unlikely in typical uses of 91 fstrcmp, and so is usually compiled out. Besides, there is no 92 interface to set it true. */ 93 int heuristic; 94 95 #endif 96 97 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the 98 point furthest along the given diagonal in the forward search of the 99 edit matrix. */ 100 int *fdiag; 101 102 /* Vector, indexed by diagonal, containing the X coordinate of the point 103 furthest along the given diagonal in the backward search of the edit 104 matrix. */ 105 int *bdiag; 106 107 /* Edit scripts longer than this are too expensive to compute. */ 108 int too_expensive; 109 110 /* Snakes bigger than this are considered `big'. */ 111 #define SNAKE_LIMIT 20 112}; 113 114struct partition 115{ 116 /* Midpoints of this partition. */ 117 int xmid, ymid; 118 119 /* Nonzero if low half will be analyzed minimally. */ 120 int lo_minimal; 121 122 /* Likewise for high half. */ 123 int hi_minimal; 124}; 125 126 127/* NAME 128 diag - find diagonal path 129 130 SYNOPSIS 131 int diag(int xoff, int xlim, int yoff, int ylim, int minimal, 132 struct partition *part, struct context *ctxt); 133 134 DESCRIPTION 135 Find the midpoint of the shortest edit script for a specified 136 portion of the two strings. 137 138 Scan from the beginnings of the strings, and simultaneously from 139 the ends, doing a breadth-first search through the space of 140 edit-sequence. When the two searches meet, we have found the 141 midpoint of the shortest edit sequence. 142 143 If MINIMAL is nonzero, find the minimal edit script regardless 144 of expense. Otherwise, if the search is too expensive, use 145 heuristics to stop the search and report a suboptimal answer. 146 147 RETURNS 148 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal 149 number XMID - YMID equals the number of inserted characters 150 minus the number of deleted characters (counting only characters 151 before the midpoint). Return the approximate edit cost; this is 152 the total number of characters inserted or deleted (counting 153 only characters before the midpoint), unless a heuristic is used 154 to terminate the search prematurely. 155 156 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script 157 for the left half of the partition is known; similarly for 158 PART->RIGHT_MINIMAL. 159 160 CAVEAT 161 This function assumes that the first characters of the specified 162 portions of the two strings do not match, and likewise that the 163 last characters do not match. The caller must trim matching 164 characters from the beginning and end of the portions it is 165 going to specify. 166 167 If we return the "wrong" partitions, the worst this can do is 168 cause suboptimal diff output. It cannot cause incorrect diff 169 output. */ 170 171static int 172diag (int xoff, int xlim, int yoff, int ylim, int minimal, 173 struct partition *part, struct context *ctxt) 174{ 175 int *const fd = ctxt->fdiag; /* Give the compiler a chance. */ 176 int *const bd = ctxt->bdiag; /* Additional help for the compiler. */ 177 const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */ 178 const char *const yv = ctxt->string[1].data; /* And more and more . . . */ 179 const int dmin = xoff - ylim; /* Minimum valid diagonal. */ 180 const int dmax = xlim - yoff; /* Maximum valid diagonal. */ 181 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ 182 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 183 int fmin = fmid; 184 int fmax = fmid; /* Limits of top-down search. */ 185 int bmin = bmid; 186 int bmax = bmid; /* Limits of bottom-up search. */ 187 int c; /* Cost. */ 188 int odd = (fmid - bmid) & 1; 189 190 /* 191 * True if southeast corner is on an odd diagonal with respect 192 * to the northwest. 193 */ 194 fd[fmid] = xoff; 195 bd[bmid] = xlim; 196 for (c = 1;; ++c) 197 { 198 int d; /* Active diagonal. */ 199 int big_snake; 200 201 big_snake = 0; 202 /* Extend the top-down search by an edit step in each diagonal. */ 203 if (fmin > dmin) 204 fd[--fmin - 1] = -1; 205 else 206 ++fmin; 207 if (fmax < dmax) 208 fd[++fmax + 1] = -1; 209 else 210 --fmax; 211 for (d = fmax; d >= fmin; d -= 2) 212 { 213 int x; 214 int y; 215 int oldx; 216 int tlo; 217 int thi; 218 219 tlo = fd[d - 1], 220 thi = fd[d + 1]; 221 222 if (tlo >= thi) 223 x = tlo + 1; 224 else 225 x = thi; 226 oldx = x; 227 y = x - d; 228 while (x < xlim && y < ylim && xv[x] == yv[y]) 229 { 230 ++x; 231 ++y; 232 } 233 if (x - oldx > SNAKE_LIMIT) 234 big_snake = 1; 235 fd[d] = x; 236 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 237 { 238 part->xmid = x; 239 part->ymid = y; 240 part->lo_minimal = part->hi_minimal = 1; 241 return 2 * c - 1; 242 } 243 } 244 /* Similarly extend the bottom-up search. */ 245 if (bmin > dmin) 246 bd[--bmin - 1] = INT_MAX; 247 else 248 ++bmin; 249 if (bmax < dmax) 250 bd[++bmax + 1] = INT_MAX; 251 else 252 --bmax; 253 for (d = bmax; d >= bmin; d -= 2) 254 { 255 int x; 256 int y; 257 int oldx; 258 int tlo; 259 int thi; 260 261 tlo = bd[d - 1], 262 thi = bd[d + 1]; 263 if (tlo < thi) 264 x = tlo; 265 else 266 x = thi - 1; 267 oldx = x; 268 y = x - d; 269 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) 270 { 271 --x; 272 --y; 273 } 274 if (oldx - x > SNAKE_LIMIT) 275 big_snake = 1; 276 bd[d] = x; 277 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 278 { 279 part->xmid = x; 280 part->ymid = y; 281 part->lo_minimal = part->hi_minimal = 1; 282 return 2 * c; 283 } 284 } 285 286 if (minimal) 287 continue; 288 289#ifdef MINUS_H_FLAG 290 /* Heuristic: check occasionally for a diagonal that has made lots 291 of progress compared with the edit distance. If we have any 292 such, find the one that has made the most progress and return 293 it as if it had succeeded. 294 295 With this heuristic, for strings with a constant small density 296 of changes, the algorithm is linear in the strings size. */ 297 if (c > 200 && big_snake && ctxt->heuristic) 298 { 299 int best; 300 301 best = 0; 302 for (d = fmax; d >= fmin; d -= 2) 303 { 304 int dd; 305 int x; 306 int y; 307 int v; 308 309 dd = d - fmid; 310 x = fd[d]; 311 y = x - d; 312 v = (x - xoff) * 2 - dd; 313 314 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 315 { 316 if 317 ( 318 v > best 319 && 320 xoff + SNAKE_LIMIT <= x 321 && 322 x < xlim 323 && 324 yoff + SNAKE_LIMIT <= y 325 && 326 y < ylim 327 ) 328 { 329 /* We have a good enough best diagonal; now insist 330 that it end with a significant snake. */ 331 int k; 332 333 for (k = 1; xv[x - k] == yv[y - k]; k++) 334 { 335 if (k == SNAKE_LIMIT) 336 { 337 best = v; 338 part->xmid = x; 339 part->ymid = y; 340 break; 341 } 342 } 343 } 344 } 345 } 346 if (best > 0) 347 { 348 part->lo_minimal = 1; 349 part->hi_minimal = 0; 350 return 2 * c - 1; 351 } 352 best = 0; 353 for (d = bmax; d >= bmin; d -= 2) 354 { 355 int dd; 356 int x; 357 int y; 358 int v; 359 360 dd = d - bmid; 361 x = bd[d]; 362 y = x - d; 363 v = (xlim - x) * 2 + dd; 364 365 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 366 { 367 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && 368 yoff < y && y <= ylim - SNAKE_LIMIT) 369 { 370 /* We have a good enough best diagonal; now insist 371 that it end with a significant snake. */ 372 int k; 373 374 for (k = 0; xv[x + k] == yv[y + k]; k++) 375 { 376 if (k == SNAKE_LIMIT - 1) 377 { 378 best = v; 379 part->xmid = x; 380 part->ymid = y; 381 break; 382 } 383 } 384 } 385 } 386 } 387 if (best > 0) 388 { 389 part->lo_minimal = 0; 390 part->hi_minimal = 1; 391 return 2 * c - 1; 392 } 393 } 394#endif /* MINUS_H_FLAG */ 395 396 /* Heuristic: if we've gone well beyond the call of duty, give up 397 and report halfway between our best results so far. */ 398 if (c >= ctxt->too_expensive) 399 { 400 int fxybest; 401 int fxbest; 402 int bxybest; 403 int bxbest; 404 405 /* Pacify `gcc -Wall'. */ 406 fxbest = 0; 407 bxbest = 0; 408 409 /* Find forward diagonal that maximizes X + Y. */ 410 fxybest = -1; 411 for (d = fmax; d >= fmin; d -= 2) 412 { 413 int x; 414 int y; 415 416 x = fd[d] < xlim ? fd[d] : xlim; 417 y = x - d; 418 419 if (ylim < y) 420 { 421 x = ylim + d; 422 y = ylim; 423 } 424 if (fxybest < x + y) 425 { 426 fxybest = x + y; 427 fxbest = x; 428 } 429 } 430 /* Find backward diagonal that minimizes X + Y. */ 431 bxybest = INT_MAX; 432 for (d = bmax; d >= bmin; d -= 2) 433 { 434 int x; 435 int y; 436 437 x = xoff > bd[d] ? xoff : bd[d]; 438 y = x - d; 439 440 if (y < yoff) 441 { 442 x = yoff + d; 443 y = yoff; 444 } 445 if (x + y < bxybest) 446 { 447 bxybest = x + y; 448 bxbest = x; 449 } 450 } 451 /* Use the better of the two diagonals. */ 452 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 453 { 454 part->xmid = fxbest; 455 part->ymid = fxybest - fxbest; 456 part->lo_minimal = 1; 457 part->hi_minimal = 0; 458 } 459 else 460 { 461 part->xmid = bxbest; 462 part->ymid = bxybest - bxbest; 463 part->lo_minimal = 0; 464 part->hi_minimal = 1; 465 } 466 return 2 * c - 1; 467 } 468 } 469} 470 471 472/* NAME 473 compareseq - find edit sequence 474 475 SYNOPSIS 476 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, 477 struct context *ctxt); 478 479 DESCRIPTION 480 Compare in detail contiguous subsequences of the two strings 481 which are known, as a whole, to match each other. 482 483 The subsequence of string 0 is [XOFF, XLIM) and likewise for 484 string 1. 485 486 Note that XLIM, YLIM are exclusive bounds. All character 487 numbers are origin-0. 488 489 If MINIMAL is nonzero, find a minimal difference no matter how 490 expensive it is. */ 491 492static void 493compareseq (int xoff, int xlim, int yoff, int ylim, int minimal, 494 struct context *ctxt) 495{ 496 const char *const xv = ctxt->string[0].data; /* Help the compiler. */ 497 const char *const yv = ctxt->string[1].data; 498 499 /* Slide down the bottom initial diagonal. */ 500 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) 501 { 502 ++xoff; 503 ++yoff; 504 } 505 506 /* Slide up the top initial diagonal. */ 507 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) 508 { 509 --xlim; 510 --ylim; 511 } 512 513 /* Handle simple cases. */ 514 if (xoff == xlim) 515 { 516 while (yoff < ylim) 517 { 518 ctxt->string[1].edit_count++; 519 ++yoff; 520 } 521 } 522 else if (yoff == ylim) 523 { 524 while (xoff < xlim) 525 { 526 ctxt->string[0].edit_count++; 527 ++xoff; 528 } 529 } 530 else 531 { 532 int c; 533 struct partition part; 534 535 /* Find a point of correspondence in the middle of the strings. */ 536 c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt); 537 if (c == 1) 538 { 539#if 0 540 /* This should be impossible, because it implies that one of 541 the two subsequences is empty, and that case was handled 542 above without calling `diag'. Let's verify that this is 543 true. */ 544 abort (); 545#else 546 /* The two subsequences differ by a single insert or delete; 547 record it and we are done. */ 548 if (part.xmid - part.ymid < xoff - yoff) 549 ctxt->string[1].edit_count++; 550 else 551 ctxt->string[0].edit_count++; 552#endif 553 } 554 else 555 { 556 /* Use the partitions to split this problem into subproblems. */ 557 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); 558 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); 559 } 560 } 561} 562 563 564/* Because fstrcmp is typically called multiple times, attempt to minimize 565 the number of memory allocations performed. Thus, let a call reuse the 566 memory already allocated by the previous call, if it is sufficient. 567 To make it multithread-safe, without need for a lock that protects the 568 already allocated memory, store the allocated memory per thread. Free 569 it only when the thread exits. */ 570 571static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ 572static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ 573 574static void 575keys_init (void) 576{ 577 gl_tls_key_init (buffer_key, free); 578 gl_tls_key_init (bufmax_key, NULL); 579 /* The per-thread initial values are NULL and 0, respectively. */ 580} 581 582/* Ensure that keys_init is called once only. */ 583gl_once_define(static, keys_init_once) 584 585 586/* NAME 587 fstrcmp - fuzzy string compare 588 589 SYNOPSIS 590 double fstrcmp(const char *, const char *); 591 592 DESCRIPTION 593 The fstrcmp function may be used to compare two string for 594 similarity. It is very useful in reducing "cascade" or 595 "secondary" errors in compilers or other situations where 596 symbol tables occur. 597 598 RETURNS 599 double; 0 if the strings are entirly dissimilar, 1 if the 600 strings are identical, and a number in between if they are 601 similar. */ 602 603double 604fstrcmp (const char *string1, const char *string2) 605{ 606 struct context ctxt; 607 int i; 608 609 size_t fdiag_len; 610 int *buffer; 611 size_t bufmax; 612 613 /* set the info for each string. */ 614 ctxt.string[0].data = string1; 615 ctxt.string[0].data_length = strlen (string1); 616 ctxt.string[1].data = string2; 617 ctxt.string[1].data_length = strlen (string2); 618 619 /* short-circuit obvious comparisons */ 620 if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0) 621 return 1.0; 622 if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0) 623 return 0.0; 624 625 /* Set TOO_EXPENSIVE to be approximate square root of input size, 626 bounded below by 256. */ 627 ctxt.too_expensive = 1; 628 for (i = ctxt.string[0].data_length + ctxt.string[1].data_length; 629 i != 0; 630 i >>= 2) 631 ctxt.too_expensive <<= 1; 632 if (ctxt.too_expensive < 256) 633 ctxt.too_expensive = 256; 634 635 /* Allocate memory for fdiag and bdiag from a thread-local pool. */ 636 fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3; 637 gl_once (keys_init_once, keys_init); 638 buffer = (int *) gl_tls_get (buffer_key); 639 bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); 640 if (fdiag_len > bufmax) 641 { 642 /* Need more memory. */ 643 bufmax = 2 * bufmax; 644 if (fdiag_len > bufmax) 645 bufmax = fdiag_len; 646 /* Calling xrealloc would be a waste: buffer's contents does not need 647 to be preserved. */ 648 if (buffer != NULL) 649 free (buffer); 650 buffer = (int *) xmalloc (bufmax * (2 * sizeof (int))); 651 gl_tls_set (buffer_key, buffer); 652 gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); 653 } 654 ctxt.fdiag = buffer + ctxt.string[1].data_length + 1; 655 ctxt.bdiag = ctxt.fdiag + fdiag_len; 656 657 /* Now do the main comparison algorithm */ 658 ctxt.string[0].edit_count = 0; 659 ctxt.string[1].edit_count = 0; 660 compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0, 661 &ctxt); 662 663 /* The result is 664 ((number of chars in common) / (average length of the strings)). 665 This is admittedly biased towards finding that the strings are 666 similar, however it does produce meaningful results. */ 667 return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length 668 - ctxt.string[1].edit_count - ctxt.string[0].edit_count) 669 / (ctxt.string[0].data_length + ctxt.string[1].data_length)); 670} 671