1/* Functions to make fuzzy comparisons between strings
2   Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2 of the License, or (at
7   your option) any later version.
8
9   This program is distributed in the hope that it will be useful, but
10   WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software
16   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18
19   Derived from GNU diff 2.7, analyze.c et al.
20
21   The basic idea is to consider two strings as similar if, when
22   transforming the first string into the second string through a
23   sequence of edits (inserts and deletes of one character each),
24   this sequence is short - or equivalently, if the ordered list
25   of characters that are untouched by these edits is long.  For a
26   good introduction to the subject, read about the "Levenshtein
27   distance" in Wikipedia.
28
29   The basic algorithm is described in:
30   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32   see especially section 4.2, which describes the variation used below.
33
34   The basic algorithm was independently discovered as described in:
35   "Algorithms for Approximate String Matching", E. Ukkonen,
36   Information and Control Vol. 64, 1985, pp. 100-118.
37
38   Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40   at the price of producing suboptimal output for large inputs with
41   many differences.
42
43   Modified to work on strings rather than files
44   by Peter Miller <pmiller@agso.gov.au>, October 1995 */
45
46#include <config.h>
47
48/* Specification.  */
49#include "fstrcmp.h"
50
51#include <string.h>
52#include <stdio.h>
53#include <stdlib.h>
54#include <limits.h>
55
56#include "lock.h"
57#include "tls.h"
58#include "xalloc.h"
59
60#ifndef uintptr_t
61# define uintptr_t unsigned long
62#endif
63
64
65/*
66 * Context of comparison operation.
67 */
68struct context
69{
70  /*
71   * Data on one input string being compared.
72   */
73  struct string_data
74  {
75    /* The string to be compared. */
76    const char *data;
77
78    /* The length of the string to be compared. */
79    int data_length;
80
81    /* The number of characters inserted or deleted. */
82    int edit_count;
83  }
84  string[2];
85
86  #ifdef MINUS_H_FLAG
87
88  /* This corresponds to the diff -H flag.  With this heuristic, for
89     strings with a constant small density of changes, the algorithm is
90     linear in the strings size.  This is unlikely in typical uses of
91     fstrcmp, and so is usually compiled out.  Besides, there is no
92     interface to set it true.  */
93  int heuristic;
94
95  #endif
96
97  /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98     point furthest along the given diagonal in the forward search of the
99     edit matrix.  */
100  int *fdiag;
101
102  /* Vector, indexed by diagonal, containing the X coordinate of the point
103     furthest along the given diagonal in the backward search of the edit
104     matrix.  */
105  int *bdiag;
106
107  /* Edit scripts longer than this are too expensive to compute.  */
108  int too_expensive;
109
110  /* Snakes bigger than this are considered `big'.  */
111  #define SNAKE_LIMIT	20
112};
113
114struct partition
115{
116  /* Midpoints of this partition.  */
117  int xmid, ymid;
118
119  /* Nonzero if low half will be analyzed minimally.  */
120  int lo_minimal;
121
122  /* Likewise for high half.  */
123  int hi_minimal;
124};
125
126
127/* NAME
128	diag - find diagonal path
129
130   SYNOPSIS
131	int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132		 struct partition *part, struct context *ctxt);
133
134   DESCRIPTION
135	Find the midpoint of the shortest edit script for a specified
136	portion of the two strings.
137
138	Scan from the beginnings of the strings, and simultaneously from
139	the ends, doing a breadth-first search through the space of
140	edit-sequence.  When the two searches meet, we have found the
141	midpoint of the shortest edit sequence.
142
143	If MINIMAL is nonzero, find the minimal edit script regardless
144	of expense.  Otherwise, if the search is too expensive, use
145	heuristics to stop the search and report a suboptimal answer.
146
147   RETURNS
148	Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
149	number XMID - YMID equals the number of inserted characters
150	minus the number of deleted characters (counting only characters
151	before the midpoint).  Return the approximate edit cost; this is
152	the total number of characters inserted or deleted (counting
153	only characters before the midpoint), unless a heuristic is used
154	to terminate the search prematurely.
155
156	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157	for the left half of the partition is known; similarly for
158	PART->RIGHT_MINIMAL.
159
160   CAVEAT
161	This function assumes that the first characters of the specified
162	portions of the two strings do not match, and likewise that the
163	last characters do not match.  The caller must trim matching
164	characters from the beginning and end of the portions it is
165	going to specify.
166
167	If we return the "wrong" partitions, the worst this can do is
168	cause suboptimal diff output.  It cannot cause incorrect diff
169	output.  */
170
171static int
172diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173      struct partition *part, struct context *ctxt)
174{
175  int *const fd = ctxt->fdiag;	/* Give the compiler a chance. */
176  int *const bd = ctxt->bdiag;	/* Additional help for the compiler. */
177  const char *const xv = ctxt->string[0].data;	/* Still more help for the compiler. */
178  const char *const yv = ctxt->string[1].data;	/* And more and more . . . */
179  const int dmin = xoff - ylim;	/* Minimum valid diagonal. */
180  const int dmax = xlim - yoff;	/* Maximum valid diagonal. */
181  const int fmid = xoff - yoff;	/* Center diagonal of top-down search. */
182  const int bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
183  int fmin = fmid;
184  int fmax = fmid;		/* Limits of top-down search. */
185  int bmin = bmid;
186  int bmax = bmid;		/* Limits of bottom-up search. */
187  int c;			/* Cost. */
188  int odd = (fmid - bmid) & 1;
189
190  /*
191   * True if southeast corner is on an odd diagonal with respect
192   * to the northwest.
193   */
194  fd[fmid] = xoff;
195  bd[bmid] = xlim;
196  for (c = 1;; ++c)
197    {
198      int d;			/* Active diagonal. */
199      int big_snake;
200
201      big_snake = 0;
202      /* Extend the top-down search by an edit step in each diagonal. */
203      if (fmin > dmin)
204	fd[--fmin - 1] = -1;
205      else
206	++fmin;
207      if (fmax < dmax)
208	fd[++fmax + 1] = -1;
209      else
210	--fmax;
211      for (d = fmax; d >= fmin; d -= 2)
212	{
213	  int x;
214	  int y;
215	  int oldx;
216	  int tlo;
217	  int thi;
218
219	  tlo = fd[d - 1],
220	    thi = fd[d + 1];
221
222	  if (tlo >= thi)
223	    x = tlo + 1;
224	  else
225	    x = thi;
226	  oldx = x;
227	  y = x - d;
228	  while (x < xlim && y < ylim && xv[x] == yv[y])
229	    {
230	      ++x;
231	      ++y;
232	    }
233	  if (x - oldx > SNAKE_LIMIT)
234	    big_snake = 1;
235	  fd[d] = x;
236	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237	    {
238	      part->xmid = x;
239	      part->ymid = y;
240	      part->lo_minimal = part->hi_minimal = 1;
241	      return 2 * c - 1;
242	    }
243	}
244      /* Similarly extend the bottom-up search.  */
245      if (bmin > dmin)
246	bd[--bmin - 1] = INT_MAX;
247      else
248	++bmin;
249      if (bmax < dmax)
250	bd[++bmax + 1] = INT_MAX;
251      else
252	--bmax;
253      for (d = bmax; d >= bmin; d -= 2)
254	{
255	  int x;
256	  int y;
257	  int oldx;
258	  int tlo;
259	  int thi;
260
261	  tlo = bd[d - 1],
262	    thi = bd[d + 1];
263	  if (tlo < thi)
264	    x = tlo;
265	  else
266	    x = thi - 1;
267	  oldx = x;
268	  y = x - d;
269	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
270	    {
271	      --x;
272	      --y;
273	    }
274	  if (oldx - x > SNAKE_LIMIT)
275	    big_snake = 1;
276	  bd[d] = x;
277	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
278	    {
279	      part->xmid = x;
280	      part->ymid = y;
281	      part->lo_minimal = part->hi_minimal = 1;
282	      return 2 * c;
283	    }
284	}
285
286      if (minimal)
287	continue;
288
289#ifdef MINUS_H_FLAG
290      /* Heuristic: check occasionally for a diagonal that has made lots
291         of progress compared with the edit distance.  If we have any
292         such, find the one that has made the most progress and return
293         it as if it had succeeded.
294
295         With this heuristic, for strings with a constant small density
296         of changes, the algorithm is linear in the strings size.  */
297      if (c > 200 && big_snake && ctxt->heuristic)
298	{
299	  int best;
300
301	  best = 0;
302	  for (d = fmax; d >= fmin; d -= 2)
303	    {
304	      int dd;
305	      int x;
306	      int y;
307	      int v;
308
309	      dd = d - fmid;
310	      x = fd[d];
311	      y = x - d;
312	      v = (x - xoff) * 2 - dd;
313
314	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
315		{
316		  if
317		    (
318		      v > best
319		      &&
320		      xoff + SNAKE_LIMIT <= x
321		      &&
322		      x < xlim
323		      &&
324		      yoff + SNAKE_LIMIT <= y
325		      &&
326		      y < ylim
327		    )
328		    {
329		      /* We have a good enough best diagonal; now insist
330			 that it end with a significant snake.  */
331		      int k;
332
333		      for (k = 1; xv[x - k] == yv[y - k]; k++)
334			{
335			  if (k == SNAKE_LIMIT)
336			    {
337			      best = v;
338			      part->xmid = x;
339			      part->ymid = y;
340			      break;
341			    }
342			}
343		    }
344		}
345	    }
346	  if (best > 0)
347	    {
348	      part->lo_minimal = 1;
349	      part->hi_minimal = 0;
350	      return 2 * c - 1;
351	    }
352	  best = 0;
353	  for (d = bmax; d >= bmin; d -= 2)
354	    {
355	      int dd;
356	      int x;
357	      int y;
358	      int v;
359
360	      dd = d - bmid;
361	      x = bd[d];
362	      y = x - d;
363	      v = (xlim - x) * 2 + dd;
364
365	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
366		{
367		  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368		      yoff < y && y <= ylim - SNAKE_LIMIT)
369		    {
370		      /* We have a good enough best diagonal; now insist
371			 that it end with a significant snake.  */
372		      int k;
373
374		      for (k = 0; xv[x + k] == yv[y + k]; k++)
375			{
376			  if (k == SNAKE_LIMIT - 1)
377			    {
378			      best = v;
379			      part->xmid = x;
380			      part->ymid = y;
381			      break;
382			    }
383			}
384		    }
385		}
386	    }
387	  if (best > 0)
388	    {
389	      part->lo_minimal = 0;
390	      part->hi_minimal = 1;
391	      return 2 * c - 1;
392	    }
393	}
394#endif /* MINUS_H_FLAG */
395
396      /* Heuristic: if we've gone well beyond the call of duty, give up
397	 and report halfway between our best results so far.  */
398      if (c >= ctxt->too_expensive)
399	{
400	  int fxybest;
401	  int fxbest;
402	  int bxybest;
403	  int bxbest;
404
405	  /* Pacify `gcc -Wall'. */
406	  fxbest = 0;
407	  bxbest = 0;
408
409	  /* Find forward diagonal that maximizes X + Y.  */
410	  fxybest = -1;
411	  for (d = fmax; d >= fmin; d -= 2)
412	    {
413	      int x;
414	      int y;
415
416	      x = fd[d] < xlim ? fd[d] : xlim;
417	      y = x - d;
418
419	      if (ylim < y)
420		{
421		  x = ylim + d;
422		  y = ylim;
423		}
424	      if (fxybest < x + y)
425		{
426		  fxybest = x + y;
427		  fxbest = x;
428		}
429	    }
430	  /* Find backward diagonal that minimizes X + Y.  */
431	  bxybest = INT_MAX;
432	  for (d = bmax; d >= bmin; d -= 2)
433	    {
434	      int x;
435	      int y;
436
437	      x = xoff > bd[d] ? xoff : bd[d];
438	      y = x - d;
439
440	      if (y < yoff)
441		{
442		  x = yoff + d;
443		  y = yoff;
444		}
445	      if (x + y < bxybest)
446		{
447		  bxybest = x + y;
448		  bxbest = x;
449		}
450	    }
451	  /* Use the better of the two diagonals.  */
452	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
453	    {
454	      part->xmid = fxbest;
455	      part->ymid = fxybest - fxbest;
456	      part->lo_minimal = 1;
457	      part->hi_minimal = 0;
458	    }
459	  else
460	    {
461	      part->xmid = bxbest;
462	      part->ymid = bxybest - bxbest;
463	      part->lo_minimal = 0;
464	      part->hi_minimal = 1;
465	    }
466	  return 2 * c - 1;
467	}
468    }
469}
470
471
472/* NAME
473	compareseq - find edit sequence
474
475   SYNOPSIS
476	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477			struct context *ctxt);
478
479   DESCRIPTION
480	Compare in detail contiguous subsequences of the two strings
481	which are known, as a whole, to match each other.
482
483	The subsequence of string 0 is [XOFF, XLIM) and likewise for
484	string 1.
485
486	Note that XLIM, YLIM are exclusive bounds.  All character
487	numbers are origin-0.
488
489	If MINIMAL is nonzero, find a minimal difference no matter how
490	expensive it is.  */
491
492static void
493compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494	    struct context *ctxt)
495{
496  const char *const xv = ctxt->string[0].data;	/* Help the compiler.  */
497  const char *const yv = ctxt->string[1].data;
498
499  /* Slide down the bottom initial diagonal. */
500  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501    {
502      ++xoff;
503      ++yoff;
504    }
505
506  /* Slide up the top initial diagonal. */
507  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508    {
509      --xlim;
510      --ylim;
511    }
512
513  /* Handle simple cases. */
514  if (xoff == xlim)
515    {
516      while (yoff < ylim)
517	{
518	  ctxt->string[1].edit_count++;
519	  ++yoff;
520	}
521    }
522  else if (yoff == ylim)
523    {
524      while (xoff < xlim)
525	{
526	  ctxt->string[0].edit_count++;
527	  ++xoff;
528	}
529    }
530  else
531    {
532      int c;
533      struct partition part;
534
535      /* Find a point of correspondence in the middle of the strings.  */
536      c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
537      if (c == 1)
538	{
539#if 0
540	  /* This should be impossible, because it implies that one of
541	     the two subsequences is empty, and that case was handled
542	     above without calling `diag'.  Let's verify that this is
543	     true.  */
544	  abort ();
545#else
546	  /* The two subsequences differ by a single insert or delete;
547	     record it and we are done.  */
548	  if (part.xmid - part.ymid < xoff - yoff)
549	    ctxt->string[1].edit_count++;
550	  else
551	    ctxt->string[0].edit_count++;
552#endif
553	}
554      else
555	{
556	  /* Use the partitions to split this problem into subproblems.  */
557	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
559	}
560    }
561}
562
563
564/* Because fstrcmp is typically called multiple times, attempt to minimize
565   the number of memory allocations performed.  Thus, let a call reuse the
566   memory already allocated by the previous call, if it is sufficient.
567   To make it multithread-safe, without need for a lock that protects the
568   already allocated memory, store the allocated memory per thread.  Free
569   it only when the thread exits.  */
570
571static gl_tls_key_t buffer_key;	/* TLS key for a 'int *' */
572static gl_tls_key_t bufmax_key;	/* TLS key for a 'size_t' */
573
574static void
575keys_init (void)
576{
577  gl_tls_key_init (buffer_key, free);
578  gl_tls_key_init (bufmax_key, NULL);
579  /* The per-thread initial values are NULL and 0, respectively.  */
580}
581
582/* Ensure that keys_init is called once only.  */
583gl_once_define(static, keys_init_once)
584
585
586/* NAME
587	fstrcmp - fuzzy string compare
588
589   SYNOPSIS
590	double fstrcmp(const char *, const char *);
591
592   DESCRIPTION
593	The fstrcmp function may be used to compare two string for
594	similarity.  It is very useful in reducing "cascade" or
595	"secondary" errors in compilers or other situations where
596	symbol tables occur.
597
598   RETURNS
599	double; 0 if the strings are entirly dissimilar, 1 if the
600	strings are identical, and a number in between if they are
601	similar.  */
602
603double
604fstrcmp (const char *string1, const char *string2)
605{
606  struct context ctxt;
607  int i;
608
609  size_t fdiag_len;
610  int *buffer;
611  size_t bufmax;
612
613  /* set the info for each string.  */
614  ctxt.string[0].data = string1;
615  ctxt.string[0].data_length = strlen (string1);
616  ctxt.string[1].data = string2;
617  ctxt.string[1].data_length = strlen (string2);
618
619  /* short-circuit obvious comparisons */
620  if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
621    return 1.0;
622  if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
623    return 0.0;
624
625  /* Set TOO_EXPENSIVE to be approximate square root of input size,
626     bounded below by 256.  */
627  ctxt.too_expensive = 1;
628  for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
629       i != 0;
630       i >>= 2)
631    ctxt.too_expensive <<= 1;
632  if (ctxt.too_expensive < 256)
633    ctxt.too_expensive = 256;
634
635  /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
636  fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637  gl_once (keys_init_once, keys_init);
638  buffer = (int *) gl_tls_get (buffer_key);
639  bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640  if (fdiag_len > bufmax)
641    {
642      /* Need more memory.  */
643      bufmax = 2 * bufmax;
644      if (fdiag_len > bufmax)
645	bufmax = fdiag_len;
646      /* Calling xrealloc would be a waste: buffer's contents does not need
647	 to be preserved.  */
648      if (buffer != NULL)
649	free (buffer);
650      buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651      gl_tls_set (buffer_key, buffer);
652      gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
653    }
654  ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655  ctxt.bdiag = ctxt.fdiag + fdiag_len;
656
657  /* Now do the main comparison algorithm */
658  ctxt.string[0].edit_count = 0;
659  ctxt.string[1].edit_count = 0;
660  compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
661	      &ctxt);
662
663  /* The result is
664	((number of chars in common) / (average length of the strings)).
665     This is admittedly biased towards finding that the strings are
666     similar, however it does produce meaningful results.  */
667  return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668		    - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669	  / (ctxt.string[0].data_length + ctxt.string[1].data_length));
670}
671