1/* Analyze differences between two vectors. 2 3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006, 2007 Free 4 Software Foundation, Inc. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19 20/* The basic idea is to consider two vectors as similar if, when 21 transforming the first vector into the second vector through a 22 sequence of edits (inserts and deletes of one element each), 23 this sequence is short - or equivalently, if the ordered list 24 of elements that are untouched by these edits is long. For a 25 good introduction to the subject, read about the "Levenshtein 26 distance" in Wikipedia. 27 28 The basic algorithm is described in: 29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 31 see especially section 4.2, which describes the variation used below. 32 33 The basic algorithm was independently discovered as described in: 34 "Algorithms for Approximate String Matching", E. Ukkonen, 35 Information and Control Vol. 64, 1985, pp. 100-118. 36 37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE 38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 39 at the price of producing suboptimal output for large inputs with 40 many differences. */ 41 42/* Before including this file, you need to define: 43 ELEMENT The element type of the vectors being compared. 44 EQUAL A two-argument macro that tests two elements for 45 equality. 46 OFFSET A signed integer type sufficient to hold the 47 difference between two indices. Usually 48 something like ssize_t. 49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'. 50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff]. 51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff]. 52 USE_HEURISTIC (Optional) Define if you want to support the 53 heuristic for large vectors. */ 54 55/* Maximum value of type OFFSET. */ 56#define OFFSET_MAX \ 57 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1) 58 59/* Use this to suppress gcc's `...may be used before initialized' warnings. */ 60#ifndef IF_LINT 61# ifdef lint 62# define IF_LINT(Code) Code 63# else 64# define IF_LINT(Code) /* empty */ 65# endif 66#endif 67 68/* 69 * Context of comparison operation. 70 */ 71struct context 72{ 73 /* Vectors being compared. */ 74 const ELEMENT *xvec; 75 const ELEMENT *yvec; 76 77 /* Extra fields. */ 78 EXTRA_CONTEXT_FIELDS 79 80 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point 81 furthest along the given diagonal in the forward search of the edit 82 matrix. */ 83 OFFSET *fdiag; 84 85 /* Vector, indexed by diagonal, containing the X coordinate of the point 86 furthest along the given diagonal in the backward search of the edit 87 matrix. */ 88 OFFSET *bdiag; 89 90 #ifdef USE_HEURISTIC 91 /* This corresponds to the diff -H flag. With this heuristic, for 92 vectors with a constant small density of changes, the algorithm is 93 linear in the vectors size. */ 94 bool heuristic; 95 #endif 96 97 /* Edit scripts longer than this are too expensive to compute. */ 98 OFFSET too_expensive; 99 100 /* Snakes bigger than this are considered `big'. */ 101 #define SNAKE_LIMIT 20 102}; 103 104struct partition 105{ 106 /* Midpoints of this partition. */ 107 OFFSET xmid; 108 OFFSET ymid; 109 110 /* True if low half will be analyzed minimally. */ 111 bool lo_minimal; 112 113 /* Likewise for high half. */ 114 bool hi_minimal; 115}; 116 117 118/* Find the midpoint of the shortest edit script for a specified portion 119 of the two vectors. 120 121 Scan from the beginnings of the vectors, and simultaneously from the ends, 122 doing a breadth-first search through the space of edit-sequence. 123 When the two searches meet, we have found the midpoint of the shortest 124 edit sequence. 125 126 If FIND_MINIMAL is true, find the minimal edit script regardless of 127 expense. Otherwise, if the search is too expensive, use heuristics to 128 stop the search and report a suboptimal answer. 129 130 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number 131 XMID - YMID equals the number of inserted elements minus the number 132 of deleted elements (counting only elements before the midpoint). 133 134 Set PART->lo_minimal to true iff the minimal edit script for the 135 left half of the partition is known; similarly for PART->hi_minimal. 136 137 This function assumes that the first elements of the specified portions 138 of the two vectors do not match, and likewise that the last elements do not 139 match. The caller must trim matching elements from the beginning and end 140 of the portions it is going to specify. 141 142 If we return the "wrong" partitions, the worst this can do is cause 143 suboptimal diff output. It cannot cause incorrect diff output. */ 144 145static void 146diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, 147 struct partition *part, struct context *ctxt) 148{ 149 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */ 150 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */ 151 const ELEMENT *const xv = ctxt->xvec; /* Still more help for the compiler. */ 152 const ELEMENT *const yv = ctxt->yvec; /* And more and more . . . */ 153 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */ 154 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */ 155 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */ 156 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 157 OFFSET fmin = fmid; 158 OFFSET fmax = fmid; /* Limits of top-down search. */ 159 OFFSET bmin = bmid; 160 OFFSET bmax = bmid; /* Limits of bottom-up search. */ 161 OFFSET c; /* Cost. */ 162 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 163 diagonal with respect to the northwest. */ 164 165 fd[fmid] = xoff; 166 bd[bmid] = xlim; 167 168 for (c = 1;; ++c) 169 { 170 OFFSET d; /* Active diagonal. */ 171 bool big_snake = false; 172 173 /* Extend the top-down search by an edit step in each diagonal. */ 174 if (fmin > dmin) 175 fd[--fmin - 1] = -1; 176 else 177 ++fmin; 178 if (fmax < dmax) 179 fd[++fmax + 1] = -1; 180 else 181 --fmax; 182 for (d = fmax; d >= fmin; d -= 2) 183 { 184 OFFSET x; 185 OFFSET y; 186 OFFSET tlo = fd[d - 1]; 187 OFFSET thi = fd[d + 1]; 188 OFFSET x0 = tlo < thi ? thi : tlo + 1; 189 190 for (x = x0, y = x0 - d; 191 x < xlim && y < ylim && EQUAL (xv[x], yv[y]); 192 x++, y++) 193 continue; 194 if (x - x0 > SNAKE_LIMIT) 195 big_snake = true; 196 fd[d] = x; 197 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 198 { 199 part->xmid = x; 200 part->ymid = y; 201 part->lo_minimal = part->hi_minimal = true; 202 return; 203 } 204 } 205 206 /* Similarly extend the bottom-up search. */ 207 if (bmin > dmin) 208 bd[--bmin - 1] = OFFSET_MAX; 209 else 210 ++bmin; 211 if (bmax < dmax) 212 bd[++bmax + 1] = OFFSET_MAX; 213 else 214 --bmax; 215 for (d = bmax; d >= bmin; d -= 2) 216 { 217 OFFSET x; 218 OFFSET y; 219 OFFSET tlo = bd[d - 1]; 220 OFFSET thi = bd[d + 1]; 221 OFFSET x0 = tlo < thi ? tlo : thi - 1; 222 223 for (x = x0, y = x0 - d; 224 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]); 225 x--, y--) 226 continue; 227 if (x0 - x > SNAKE_LIMIT) 228 big_snake = true; 229 bd[d] = x; 230 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 231 { 232 part->xmid = x; 233 part->ymid = y; 234 part->lo_minimal = part->hi_minimal = true; 235 return; 236 } 237 } 238 239 if (find_minimal) 240 continue; 241 242#ifdef USE_HEURISTIC 243 /* Heuristic: check occasionally for a diagonal that has made lots 244 of progress compared with the edit distance. If we have any 245 such, find the one that has made the most progress and return it 246 as if it had succeeded. 247 248 With this heuristic, for vectors with a constant small density 249 of changes, the algorithm is linear in the vector size. */ 250 251 if (200 < c && big_snake && ctxt->heuristic) 252 { 253 OFFSET best = 0; 254 255 for (d = fmax; d >= fmin; d -= 2) 256 { 257 OFFSET dd = d - fmid; 258 OFFSET x = fd[d]; 259 OFFSET y = x - d; 260 OFFSET v = (x - xoff) * 2 - dd; 261 262 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 263 { 264 if (v > best 265 && xoff + SNAKE_LIMIT <= x && x < xlim 266 && yoff + SNAKE_LIMIT <= y && y < ylim) 267 { 268 /* We have a good enough best diagonal; now insist 269 that it end with a significant snake. */ 270 int k; 271 272 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++) 273 if (k == SNAKE_LIMIT) 274 { 275 best = v; 276 part->xmid = x; 277 part->ymid = y; 278 break; 279 } 280 } 281 } 282 } 283 if (best > 0) 284 { 285 part->lo_minimal = true; 286 part->hi_minimal = false; 287 return; 288 } 289 290 best = 0; 291 for (d = bmax; d >= bmin; d -= 2) 292 { 293 OFFSET dd = d - bmid; 294 OFFSET x = bd[d]; 295 OFFSET y = x - d; 296 OFFSET v = (xlim - x) * 2 + dd; 297 298 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 299 { 300 if (v > best 301 && xoff < x && x <= xlim - SNAKE_LIMIT 302 && yoff < y && y <= ylim - SNAKE_LIMIT) 303 { 304 /* We have a good enough best diagonal; now insist 305 that it end with a significant snake. */ 306 int k; 307 308 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++) 309 if (k == SNAKE_LIMIT - 1) 310 { 311 best = v; 312 part->xmid = x; 313 part->ymid = y; 314 break; 315 } 316 } 317 } 318 } 319 if (best > 0) 320 { 321 part->lo_minimal = false; 322 part->hi_minimal = true; 323 return; 324 } 325 } 326#endif /* USE_HEURISTIC */ 327 328 /* Heuristic: if we've gone well beyond the call of duty, give up 329 and report halfway between our best results so far. */ 330 if (c >= ctxt->too_expensive) 331 { 332 OFFSET fxybest; 333 OFFSET fxbest IF_LINT (= 0); 334 OFFSET bxybest; 335 OFFSET bxbest IF_LINT (= 0); 336 337 /* Find forward diagonal that maximizes X + Y. */ 338 fxybest = -1; 339 for (d = fmax; d >= fmin; d -= 2) 340 { 341 OFFSET x = MIN (fd[d], xlim); 342 OFFSET y = x - d; 343 if (ylim < y) 344 { 345 x = ylim + d; 346 y = ylim; 347 } 348 if (fxybest < x + y) 349 { 350 fxybest = x + y; 351 fxbest = x; 352 } 353 } 354 355 /* Find backward diagonal that minimizes X + Y. */ 356 bxybest = OFFSET_MAX; 357 for (d = bmax; d >= bmin; d -= 2) 358 { 359 OFFSET x = MAX (xoff, bd[d]); 360 OFFSET y = x - d; 361 if (y < yoff) 362 { 363 x = yoff + d; 364 y = yoff; 365 } 366 if (x + y < bxybest) 367 { 368 bxybest = x + y; 369 bxbest = x; 370 } 371 } 372 373 /* Use the better of the two diagonals. */ 374 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 375 { 376 part->xmid = fxbest; 377 part->ymid = fxybest - fxbest; 378 part->lo_minimal = true; 379 part->hi_minimal = false; 380 } 381 else 382 { 383 part->xmid = bxbest; 384 part->ymid = bxybest - bxbest; 385 part->lo_minimal = false; 386 part->hi_minimal = true; 387 } 388 return; 389 } 390 } 391} 392 393 394/* Compare in detail contiguous subsequences of the two vectors 395 which are known, as a whole, to match each other. 396 397 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1. 398 399 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors 400 are origin-0. 401 402 If FIND_MINIMAL, find a minimal difference no matter how 403 expensive it is. 404 405 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. */ 406 407static void 408compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, 409 bool find_minimal, struct context *ctxt) 410{ 411 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */ 412 ELEMENT const *yv = ctxt->yvec; 413 414 /* Slide down the bottom initial diagonal. */ 415 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff])) 416 { 417 xoff++; 418 yoff++; 419 } 420 421 /* Slide up the top initial diagonal. */ 422 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1])) 423 { 424 xlim--; 425 ylim--; 426 } 427 428 /* Handle simple cases. */ 429 if (xoff == xlim) 430 while (yoff < ylim) 431 { 432 NOTE_INSERT (ctxt, yoff); 433 yoff++; 434 } 435 else if (yoff == ylim) 436 while (xoff < xlim) 437 { 438 NOTE_DELETE (ctxt, xoff); 439 xoff++; 440 } 441 else 442 { 443 struct partition part; 444 445 /* Find a point of correspondence in the middle of the vectors. */ 446 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt); 447 448 /* Use the partitions to split this problem into subproblems. */ 449 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); 450 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); 451 } 452} 453 454#undef ELEMENT 455#undef EQUAL 456#undef OFFSET 457#undef EXTRA_CONTEXT_FIELDS 458#undef NOTE_DELETE 459#undef NOTE_INSERT 460#undef USE_HEURISTIC 461#undef OFFSET_MAX 462