1/* Analyze file differences for GNU DIFF.
2
3   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4   2004 Free Software Foundation, Inc.
5
6   This file is part of GNU DIFF.
7
8   GNU DIFF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU DIFF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation,
21   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22
23/* The basic algorithm is described in:
24   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26   see especially section 4.2, which describes the variation used below.
27   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29   at the price of producing suboptimal output for large inputs with
30   many differences.
31
32   The basic algorithm was independently discovered as described in:
33   "Algorithms for Approximate String Matching", E. Ukkonen,
34   Information and Control Vol. 64, 1985, pp. 100-118.  */
35
36#include "diff.h"
37#include <cmpbuf.h>
38#include <error.h>
39#include <file-type.h>
40#include <xalloc.h>
41
42static lin *xvec, *yvec;	/* Vectors being compared. */
43static lin *fdiag;		/* Vector, indexed by diagonal, containing
44				   1 + the X coordinate of the point furthest
45				   along the given diagonal in the forward
46				   search of the edit matrix. */
47static lin *bdiag;		/* Vector, indexed by diagonal, containing
48				   the X coordinate of the point furthest
49				   along the given diagonal in the backward
50				   search of the edit matrix. */
51static lin too_expensive;	/* Edit scripts longer than this are too
52				   expensive to compute.  */
53
54#define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
55
56struct partition
57{
58  lin xmid, ymid;	/* Midpoints of this partition.  */
59  bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
60  bool hi_minimal;	/* Likewise for high half.  */
61};
62
63/* Find the midpoint of the shortest edit script for a specified
64   portion of the two files.
65
66   Scan from the beginnings of the files, and simultaneously from the ends,
67   doing a breadth-first search through the space of edit-sequence.
68   When the two searches meet, we have found the midpoint of the shortest
69   edit sequence.
70
71   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72   of expense.  Otherwise, if the search is too expensive, use
73   heuristics to stop the search and report a suboptimal answer.
74
75   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
76   XMID - YMID equals the number of inserted lines minus the number
77   of deleted lines (counting only lines before the midpoint).
78
79   Set PART->lo_minimal to true iff the minimal edit script for the
80   left half of the partition is known; similarly for PART->hi_minimal.
81
82   This function assumes that the first lines of the specified portions
83   of the two files do not match, and likewise that the last lines do not
84   match.  The caller must trim matching lines from the beginning and end
85   of the portions it is going to specify.
86
87   If we return the "wrong" partitions,
88   the worst this can do is cause suboptimal diff output.
89   It cannot cause incorrect diff output.  */
90
91static void
92diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93      struct partition *part)
94{
95  lin *const fd = fdiag;	/* Give the compiler a chance. */
96  lin *const bd = bdiag;	/* Additional help for the compiler. */
97  lin const *const xv = xvec;	/* Still more help for the compiler. */
98  lin const *const yv = yvec;	/* And more and more . . . */
99  lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
100  lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
101  lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
102  lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
103  lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
104  lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
105  lin c;			/* Cost. */
106  bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
107				   diagonal with respect to the northwest. */
108
109  fd[fmid] = xoff;
110  bd[bmid] = xlim;
111
112  for (c = 1;; ++c)
113    {
114      lin d;			/* Active diagonal. */
115      bool big_snake = false;
116
117      /* Extend the top-down search by an edit step in each diagonal. */
118      fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119      fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120      for (d = fmax; d >= fmin; d -= 2)
121	{
122	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123
124	  if (tlo >= thi)
125	    x = tlo + 1;
126	  else
127	    x = thi;
128	  oldx = x;
129	  y = x - d;
130	  while (x < xlim && y < ylim && xv[x] == yv[y])
131	    ++x, ++y;
132	  if (x - oldx > SNAKE_LIMIT)
133	    big_snake = true;
134	  fd[d] = x;
135	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136	    {
137	      part->xmid = x;
138	      part->ymid = y;
139	      part->lo_minimal = part->hi_minimal = true;
140	      return;
141	    }
142	}
143
144      /* Similarly extend the bottom-up search.  */
145      bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146      bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147      for (d = bmax; d >= bmin; d -= 2)
148	{
149	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150
151	  if (tlo < thi)
152	    x = tlo;
153	  else
154	    x = thi - 1;
155	  oldx = x;
156	  y = x - d;
157	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158	    --x, --y;
159	  if (oldx - x > SNAKE_LIMIT)
160	    big_snake = true;
161	  bd[d] = x;
162	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163	    {
164	      part->xmid = x;
165	      part->ymid = y;
166	      part->lo_minimal = part->hi_minimal = true;
167	      return;
168	    }
169	}
170
171      if (find_minimal)
172	continue;
173
174      /* Heuristic: check occasionally for a diagonal that has made
175	 lots of progress compared with the edit distance.
176	 If we have any such, find the one that has made the most
177	 progress and return it as if it had succeeded.
178
179	 With this heuristic, for files with a constant small density
180	 of changes, the algorithm is linear in the file size.  */
181
182      if (200 < c && big_snake && speed_large_files)
183	{
184	  lin best = 0;
185
186	  for (d = fmax; d >= fmin; d -= 2)
187	    {
188	      lin dd = d - fmid;
189	      lin x = fd[d];
190	      lin y = x - d;
191	      lin v = (x - xoff) * 2 - dd;
192	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193		{
194		  if (v > best
195		      && xoff + SNAKE_LIMIT <= x && x < xlim
196		      && yoff + SNAKE_LIMIT <= y && y < ylim)
197		    {
198		      /* We have a good enough best diagonal;
199			 now insist that it end with a significant snake.  */
200		      int k;
201
202		      for (k = 1; xv[x - k] == yv[y - k]; k++)
203			if (k == SNAKE_LIMIT)
204			  {
205			    best = v;
206			    part->xmid = x;
207			    part->ymid = y;
208			    break;
209			  }
210		    }
211		}
212	    }
213	  if (best > 0)
214	    {
215	      part->lo_minimal = true;
216	      part->hi_minimal = false;
217	      return;
218	    }
219
220	  best = 0;
221	  for (d = bmax; d >= bmin; d -= 2)
222	    {
223	      lin dd = d - bmid;
224	      lin x = bd[d];
225	      lin y = x - d;
226	      lin v = (xlim - x) * 2 + dd;
227	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228		{
229		  if (v > best
230		      && xoff < x && x <= xlim - SNAKE_LIMIT
231		      && yoff < y && y <= ylim - SNAKE_LIMIT)
232		    {
233		      /* We have a good enough best diagonal;
234			 now insist that it end with a significant snake.  */
235		      int k;
236
237		      for (k = 0; xv[x + k] == yv[y + k]; k++)
238			if (k == SNAKE_LIMIT - 1)
239			  {
240			    best = v;
241			    part->xmid = x;
242			    part->ymid = y;
243			    break;
244			  }
245		    }
246		}
247	    }
248	  if (best > 0)
249	    {
250	      part->lo_minimal = false;
251	      part->hi_minimal = true;
252	      return;
253	    }
254	}
255
256      /* Heuristic: if we've gone well beyond the call of duty,
257	 give up and report halfway between our best results so far.  */
258      if (c >= too_expensive)
259	{
260	  lin fxybest, fxbest;
261	  lin bxybest, bxbest;
262
263	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
264
265	  /* Find forward diagonal that maximizes X + Y.  */
266	  fxybest = -1;
267	  for (d = fmax; d >= fmin; d -= 2)
268	    {
269	      lin x = MIN (fd[d], xlim);
270	      lin y = x - d;
271	      if (ylim < y)
272		x = ylim + d, y = ylim;
273	      if (fxybest < x + y)
274		{
275		  fxybest = x + y;
276		  fxbest = x;
277		}
278	    }
279
280	  /* Find backward diagonal that minimizes X + Y.  */
281	  bxybest = LIN_MAX;
282	  for (d = bmax; d >= bmin; d -= 2)
283	    {
284	      lin x = MAX (xoff, bd[d]);
285	      lin y = x - d;
286	      if (y < yoff)
287		x = yoff + d, y = yoff;
288	      if (x + y < bxybest)
289		{
290		  bxybest = x + y;
291		  bxbest = x;
292		}
293	    }
294
295	  /* Use the better of the two diagonals.  */
296	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297	    {
298	      part->xmid = fxbest;
299	      part->ymid = fxybest - fxbest;
300	      part->lo_minimal = true;
301	      part->hi_minimal = false;
302	    }
303	  else
304	    {
305	      part->xmid = bxbest;
306	      part->ymid = bxybest - bxbest;
307	      part->lo_minimal = false;
308	      part->hi_minimal = true;
309	    }
310	  return;
311	}
312    }
313}
314
315/* Compare in detail contiguous subsequences of the two files
316   which are known, as a whole, to match each other.
317
318   The results are recorded in the vectors files[N].changed, by
319   storing 1 in the element for each line that is an insertion or deletion.
320
321   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322
323   Note that XLIM, YLIM are exclusive bounds.
324   All line numbers are origin-0 and discarded lines are not counted.
325
326   If FIND_MINIMAL, find a minimal difference no matter how
327   expensive it is.  */
328
329static void
330compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331{
332  lin const *xv = xvec; /* Help the compiler.  */
333  lin const *yv = yvec;
334
335  /* Slide down the bottom initial diagonal. */
336  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337    ++xoff, ++yoff;
338  /* Slide up the top initial diagonal. */
339  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340    --xlim, --ylim;
341
342  /* Handle simple cases. */
343  if (xoff == xlim)
344    while (yoff < ylim)
345      files[1].changed[files[1].realindexes[yoff++]] = 1;
346  else if (yoff == ylim)
347    while (xoff < xlim)
348      files[0].changed[files[0].realindexes[xoff++]] = 1;
349  else
350    {
351      struct partition part;
352
353      /* Find a point of correspondence in the middle of the files.  */
354      diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355
356      /* Use the partitions to split this problem into subproblems.  */
357      compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358      compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359    }
360}
361
362/* Discard lines from one file that have no matches in the other file.
363
364   A line which is discarded will not be considered by the actual
365   comparison algorithm; it will be as if that line were not in the file.
366   The file's `realindexes' table maps virtual line numbers
367   (which don't count the discarded lines) into real line numbers;
368   this is how the actual comparison algorithm produces results
369   that are comprehensible when the discarded lines are counted.
370
371   When we discard a line, we also mark it as a deletion or insertion
372   so that it will be printed in the output.  */
373
374static void
375discard_confusing_lines (struct file_data filevec[])
376{
377  int f;
378  lin i;
379  char *discarded[2];
380  lin *equiv_count[2];
381  lin *p;
382
383  /* Allocate our results.  */
384  p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385	       * (2 * sizeof *p));
386  for (f = 0; f < 2; f++)
387    {
388      filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
389      filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
390    }
391
392  /* Set up equiv_count[F][I] as the number of lines in file F
393     that fall in equivalence class I.  */
394
395  p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396  equiv_count[0] = p;
397  equiv_count[1] = p + filevec[0].equiv_max;
398
399  for (i = 0; i < filevec[0].buffered_lines; ++i)
400    ++equiv_count[0][filevec[0].equivs[i]];
401  for (i = 0; i < filevec[1].buffered_lines; ++i)
402    ++equiv_count[1][filevec[1].equivs[i]];
403
404  /* Set up tables of which lines are going to be discarded.  */
405
406  discarded[0] = zalloc (filevec[0].buffered_lines
407			 + filevec[1].buffered_lines);
408  discarded[1] = discarded[0] + filevec[0].buffered_lines;
409
410  /* Mark to be discarded each line that matches no line of the other file.
411     If a line matches many lines, mark it as provisionally discardable.  */
412
413  for (f = 0; f < 2; f++)
414    {
415      size_t end = filevec[f].buffered_lines;
416      char *discards = discarded[f];
417      lin *counts = equiv_count[1 - f];
418      lin *equivs = filevec[f].equivs;
419      size_t many = 5;
420      size_t tem = end / 64;
421
422      /* Multiply MANY by approximate square root of number of lines.
423	 That is the threshold for provisionally discardable lines.  */
424      while ((tem = tem >> 2) > 0)
425	many *= 2;
426
427      for (i = 0; i < end; i++)
428	{
429	  lin nmatch;
430	  if (equivs[i] == 0)
431	    continue;
432	  nmatch = counts[equivs[i]];
433	  if (nmatch == 0)
434	    discards[i] = 1;
435	  else if (nmatch > many)
436	    discards[i] = 2;
437	}
438    }
439
440  /* Don't really discard the provisional lines except when they occur
441     in a run of discardables, with nonprovisionals at the beginning
442     and end.  */
443
444  for (f = 0; f < 2; f++)
445    {
446      lin end = filevec[f].buffered_lines;
447      register char *discards = discarded[f];
448
449      for (i = 0; i < end; i++)
450	{
451	  /* Cancel provisional discards not in middle of run of discards.  */
452	  if (discards[i] == 2)
453	    discards[i] = 0;
454	  else if (discards[i] != 0)
455	    {
456	      /* We have found a nonprovisional discard.  */
457	      register lin j;
458	      lin length;
459	      lin provisional = 0;
460
461	      /* Find end of this run of discardable lines.
462		 Count how many are provisionally discardable.  */
463	      for (j = i; j < end; j++)
464		{
465		  if (discards[j] == 0)
466		    break;
467		  if (discards[j] == 2)
468		    ++provisional;
469		}
470
471	      /* Cancel provisional discards at end, and shrink the run.  */
472	      while (j > i && discards[j - 1] == 2)
473		discards[--j] = 0, --provisional;
474
475	      /* Now we have the length of a run of discardable lines
476		 whose first and last are not provisional.  */
477	      length = j - i;
478
479	      /* If 1/4 of the lines in the run are provisional,
480		 cancel discarding of all provisional lines in the run.  */
481	      if (provisional * 4 > length)
482		{
483		  while (j > i)
484		    if (discards[--j] == 2)
485		      discards[j] = 0;
486		}
487	      else
488		{
489		  register lin consec;
490		  lin minimum = 1;
491		  lin tem = length >> 2;
492
493		  /* MINIMUM is approximate square root of LENGTH/4.
494		     A subrun of two or more provisionals can stand
495		     when LENGTH is at least 16.
496		     A subrun of 4 or more can stand when LENGTH >= 64.  */
497		  while (0 < (tem >>= 2))
498		    minimum <<= 1;
499		  minimum++;
500
501		  /* Cancel any subrun of MINIMUM or more provisionals
502		     within the larger run.  */
503		  for (j = 0, consec = 0; j < length; j++)
504		    if (discards[i + j] != 2)
505		      consec = 0;
506		    else if (minimum == ++consec)
507		      /* Back up to start of subrun, to cancel it all.  */
508		      j -= consec;
509		    else if (minimum < consec)
510		      discards[i + j] = 0;
511
512		  /* Scan from beginning of run
513		     until we find 3 or more nonprovisionals in a row
514		     or until the first nonprovisional at least 8 lines in.
515		     Until that point, cancel any provisionals.  */
516		  for (j = 0, consec = 0; j < length; j++)
517		    {
518		      if (j >= 8 && discards[i + j] == 1)
519			break;
520		      if (discards[i + j] == 2)
521			consec = 0, discards[i + j] = 0;
522		      else if (discards[i + j] == 0)
523			consec = 0;
524		      else
525			consec++;
526		      if (consec == 3)
527			break;
528		    }
529
530		  /* I advances to the last line of the run.  */
531		  i += length - 1;
532
533		  /* Same thing, from end.  */
534		  for (j = 0, consec = 0; j < length; j++)
535		    {
536		      if (j >= 8 && discards[i - j] == 1)
537			break;
538		      if (discards[i - j] == 2)
539			consec = 0, discards[i - j] = 0;
540		      else if (discards[i - j] == 0)
541			consec = 0;
542		      else
543			consec++;
544		      if (consec == 3)
545			break;
546		    }
547		}
548	    }
549	}
550    }
551
552  /* Actually discard the lines. */
553  for (f = 0; f < 2; f++)
554    {
555      char *discards = discarded[f];
556      lin end = filevec[f].buffered_lines;
557      lin j = 0;
558      for (i = 0; i < end; ++i)
559	if (minimal || discards[i] == 0)
560	  {
561	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
562	    filevec[f].realindexes[j++] = i;
563	  }
564	else
565	  filevec[f].changed[i] = 1;
566      filevec[f].nondiscarded_lines = j;
567    }
568
569  free (discarded[0]);
570  free (equiv_count[0]);
571}
572
573/* Adjust inserts/deletes of identical lines to join changes
574   as much as possible.
575
576   We do something when a run of changed lines include a
577   line at one end and have an excluded, identical line at the other.
578   We are free to choose which identical line is included.
579   `compareseq' usually chooses the one at the beginning,
580   but usually it is cleaner to consider the following identical line
581   to be the "change".  */
582
583static void
584shift_boundaries (struct file_data filevec[])
585{
586  int f;
587
588  for (f = 0; f < 2; f++)
589    {
590      char *changed = filevec[f].changed;
591      char *other_changed = filevec[1 - f].changed;
592      lin const *equivs = filevec[f].equivs;
593      lin i = 0;
594      lin j = 0;
595      lin i_end = filevec[f].buffered_lines;
596
597      while (1)
598	{
599	  lin runlength, start, corresponding;
600
601	  /* Scan forwards to find beginning of another run of changes.
602	     Also keep track of the corresponding point in the other file.  */
603
604	  while (i < i_end && !changed[i])
605	    {
606	      while (other_changed[j++])
607		continue;
608	      i++;
609	    }
610
611	  if (i == i_end)
612	    break;
613
614	  start = i;
615
616	  /* Find the end of this run of changes.  */
617
618	  while (changed[++i])
619	    continue;
620	  while (other_changed[j])
621	    j++;
622
623	  do
624	    {
625	      /* Record the length of this run of changes, so that
626		 we can later determine whether the run has grown.  */
627	      runlength = i - start;
628
629	      /* Move the changed region back, so long as the
630		 previous unchanged line matches the last changed one.
631		 This merges with previous changed regions.  */
632
633	      while (start && equivs[start - 1] == equivs[i - 1])
634		{
635		  changed[--start] = 1;
636		  changed[--i] = 0;
637		  while (changed[start - 1])
638		    start--;
639		  while (other_changed[--j])
640		    continue;
641		}
642
643	      /* Set CORRESPONDING to the end of the changed run, at the last
644		 point where it corresponds to a changed run in the other file.
645		 CORRESPONDING == I_END means no such point has been found.  */
646	      corresponding = other_changed[j - 1] ? i : i_end;
647
648	      /* Move the changed region forward, so long as the
649		 first changed line matches the following unchanged one.
650		 This merges with following changed regions.
651		 Do this second, so that if there are no merges,
652		 the changed region is moved forward as far as possible.  */
653
654	      while (i != i_end && equivs[start] == equivs[i])
655		{
656		  changed[start++] = 0;
657		  changed[i++] = 1;
658		  while (changed[i])
659		    i++;
660		  while (other_changed[++j])
661		    corresponding = i;
662		}
663	    }
664	  while (runlength != i - start);
665
666	  /* If possible, move the fully-merged run of changes
667	     back to a corresponding run in the other file.  */
668
669	  while (corresponding < i)
670	    {
671	      changed[--start] = 1;
672	      changed[--i] = 0;
673	      while (other_changed[--j])
674		continue;
675	    }
676	}
677    }
678}
679
680/* Cons an additional entry onto the front of an edit script OLD.
681   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682   DELETED is the number of lines deleted here from file 0.
683   INSERTED is the number of lines inserted here in file 1.
684
685   If DELETED is 0 then LINE0 is the number of the line before
686   which the insertion was done; vice versa for INSERTED and LINE1.  */
687
688static struct change *
689add_change (lin line0, lin line1, lin deleted, lin inserted,
690	    struct change *old)
691{
692  struct change *new = xmalloc (sizeof *new);
693
694  new->line0 = line0;
695  new->line1 = line1;
696  new->inserted = inserted;
697  new->deleted = deleted;
698  new->link = old;
699  return new;
700}
701
702/* Scan the tables of which lines are inserted and deleted,
703   producing an edit script in reverse order.  */
704
705static struct change *
706build_reverse_script (struct file_data const filevec[])
707{
708  struct change *script = 0;
709  char *changed0 = filevec[0].changed;
710  char *changed1 = filevec[1].changed;
711  lin len0 = filevec[0].buffered_lines;
712  lin len1 = filevec[1].buffered_lines;
713
714  /* Note that changedN[len0] does exist, and is 0.  */
715
716  lin i0 = 0, i1 = 0;
717
718  while (i0 < len0 || i1 < len1)
719    {
720      if (changed0[i0] | changed1[i1])
721	{
722	  lin line0 = i0, line1 = i1;
723
724	  /* Find # lines changed here in each file.  */
725	  while (changed0[i0]) ++i0;
726	  while (changed1[i1]) ++i1;
727
728	  /* Record this change.  */
729	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730	}
731
732      /* We have reached lines in the two files that match each other.  */
733      i0++, i1++;
734    }
735
736  return script;
737}
738
739/* Scan the tables of which lines are inserted and deleted,
740   producing an edit script in forward order.  */
741
742static struct change *
743build_script (struct file_data const filevec[])
744{
745  struct change *script = 0;
746  char *changed0 = filevec[0].changed;
747  char *changed1 = filevec[1].changed;
748  lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749
750  /* Note that changedN[-1] does exist, and is 0.  */
751
752  while (i0 >= 0 || i1 >= 0)
753    {
754      if (changed0[i0 - 1] | changed1[i1 - 1])
755	{
756	  lin line0 = i0, line1 = i1;
757
758	  /* Find # lines changed here in each file.  */
759	  while (changed0[i0 - 1]) --i0;
760	  while (changed1[i1 - 1]) --i1;
761
762	  /* Record this change.  */
763	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764	}
765
766      /* We have reached lines in the two files that match each other.  */
767      i0--, i1--;
768    }
769
770  return script;
771}
772
773/* If CHANGES, briefly report that two files differed.
774   Return 2 if trouble, CHANGES otherwise.  */
775static int
776briefly_report (int changes, struct file_data const filevec[])
777{
778  if (changes)
779    {
780      char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781      char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782      message ("Files %s and %s differ\n", label0, label1);
783      if (! brief)
784	changes = 2;
785    }
786
787  return changes;
788}
789
790/* Report the differences of two files.  */
791int
792diff_2_files (struct comparison *cmp)
793{
794  lin diags;
795  int f;
796  struct change *e, *p;
797  struct change *script;
798  int changes;
799
800
801  /* If we have detected that either file is binary,
802     compare the two files as binary.  This can happen
803     only when the first chunk is read.
804     Also, --brief without any --ignore-* options means
805     we can speed things up by treating the files as binary.  */
806
807  if (read_files (cmp->file, files_can_be_treated_as_binary))
808    {
809      /* Files with different lengths must be different.  */
810      if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813	changes = 1;
814
815      /* Standard input equals itself.  */
816      else if (cmp->file[0].desc == cmp->file[1].desc)
817	changes = 0;
818
819      else
820	/* Scan both files, a buffer at a time, looking for a difference.  */
821	{
822	  /* Allocate same-sized buffers for both files.  */
823	  size_t lcm_max = PTRDIFF_MAX - 1;
824	  size_t buffer_size =
825	    buffer_lcm (sizeof (word),
826			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827				    STAT_BLOCKSIZE (cmp->file[1].stat),
828				    lcm_max),
829			lcm_max);
830	  for (f = 0; f < 2; f++)
831	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832
833	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834	    {
835	      /* Read a buffer's worth from both files.  */
836	      for (f = 0; f < 2; f++)
837		if (0 <= cmp->file[f].desc)
838		  file_block_read (&cmp->file[f],
839				   buffer_size - cmp->file[f].buffered);
840
841	      /* If the buffers differ, the files differ.  */
842	      if (cmp->file[0].buffered != cmp->file[1].buffered
843		  || memcmp (cmp->file[0].buffer,
844			     cmp->file[1].buffer,
845			     cmp->file[0].buffered))
846		{
847		  changes = 1;
848		  break;
849		}
850
851	      /* If we reach end of file, the files are the same.  */
852	      if (cmp->file[0].buffered != buffer_size)
853		{
854		  changes = 0;
855		  break;
856		}
857	    }
858	}
859
860      changes = briefly_report (changes, cmp->file);
861    }
862  else
863    {
864      /* Allocate vectors for the results of comparison:
865	 a flag for each line of each file, saying whether that line
866	 is an insertion or deletion.
867	 Allocate an extra element, always 0, at each end of each vector.  */
868
869      size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870      char *flag_space = zalloc (s);
871      cmp->file[0].changed = flag_space + 1;
872      cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873
874      /* Some lines are obviously insertions or deletions
875	 because they don't match anything.  Detect them now, and
876	 avoid even thinking about them in the main comparison algorithm.  */
877
878      discard_confusing_lines (cmp->file);
879
880      /* Now do the main comparison algorithm, considering just the
881	 undiscarded lines.  */
882
883      xvec = cmp->file[0].undiscarded;
884      yvec = cmp->file[1].undiscarded;
885      diags = (cmp->file[0].nondiscarded_lines
886	       + cmp->file[1].nondiscarded_lines + 3);
887      fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888      bdiag = fdiag + diags;
889      fdiag += cmp->file[1].nondiscarded_lines + 1;
890      bdiag += cmp->file[1].nondiscarded_lines + 1;
891
892      /* Set TOO_EXPENSIVE to be approximate square root of input size,
893	 bounded below by 256.  */
894      too_expensive = 1;
895      for (;  diags != 0;  diags >>= 2)
896	too_expensive <<= 1;
897      too_expensive = MAX (256, too_expensive);
898
899      files[0] = cmp->file[0];
900      files[1] = cmp->file[1];
901
902      compareseq (0, cmp->file[0].nondiscarded_lines,
903		  0, cmp->file[1].nondiscarded_lines, minimal);
904
905      free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906
907      /* Modify the results slightly to make them prettier
908	 in cases where that can validly be done.  */
909
910      shift_boundaries (cmp->file);
911
912      /* Get the results of comparison in the form of a chain
913	 of `struct change's -- an edit script.  */
914
915      if (output_style == OUTPUT_ED)
916	script = build_reverse_script (cmp->file);
917      else
918	script = build_script (cmp->file);
919
920      /* Set CHANGES if we had any diffs.
921	 If some changes are ignored, we must scan the script to decide.  */
922      if (ignore_blank_lines || ignore_regexp.fastmap)
923	{
924	  struct change *next = script;
925	  changes = 0;
926
927	  while (next && changes == 0)
928	    {
929	      struct change *this, *end;
930	      lin first0, last0, first1, last1;
931
932	      /* Find a set of changes that belong together.  */
933	      this = next;
934	      end = find_change (next);
935
936	      /* Disconnect them from the rest of the changes, making them
937		 a hunk, and remember the rest for next iteration.  */
938	      next = end->link;
939	      end->link = 0;
940
941	      /* Determine whether this hunk is really a difference.  */
942	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943		changes = 1;
944
945	      /* Reconnect the script so it will all be freed properly.  */
946	      end->link = next;
947	    }
948	}
949      else
950	changes = (script != 0);
951
952      if (brief)
953	changes = briefly_report (changes, cmp->file);
954      else
955	{
956	  if (changes | !no_diff_means_no_output)
957	    {
958	      /* Record info for starting up output,
959		 to be used if and when we have some output to print.  */
960	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961			    file_label[1] ? file_label[1] : cmp->file[1].name,
962			    cmp->parent != 0);
963
964	      switch (output_style)
965		{
966		case OUTPUT_CONTEXT:
967		  print_context_script (script, false);
968		  break;
969
970		case OUTPUT_UNIFIED:
971		  print_context_script (script, true);
972		  break;
973
974		case OUTPUT_ED:
975		  print_ed_script (script);
976		  break;
977
978		case OUTPUT_FORWARD_ED:
979		  pr_forward_ed_script (script);
980		  break;
981
982		case OUTPUT_RCS:
983		  print_rcs_script (script);
984		  break;
985
986		case OUTPUT_NORMAL:
987		  print_normal_script (script);
988		  break;
989
990		case OUTPUT_IFDEF:
991		  print_ifdef_script (script);
992		  break;
993
994		case OUTPUT_SDIFF:
995		  print_sdiff_script (script);
996		  break;
997
998		default:
999		  abort ();
1000		}
1001
1002	      finish_output ();
1003	    }
1004	}
1005
1006      free (cmp->file[0].undiscarded);
1007
1008      free (flag_space);
1009
1010      for (f = 0; f < 2; f++)
1011	{
1012	  free (cmp->file[f].equivs);
1013	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014	}
1015
1016      for (e = script; e; e = p)
1017	{
1018	  p = e->link;
1019	  free (e);
1020	}
1021
1022      if (! ROBUST_OUTPUT_STYLE (output_style))
1023	for (f = 0; f < 2; ++f)
1024	  if (cmp->file[f].missing_newline)
1025	    {
1026	      error (0, 0, "%s: %s\n",
1027		     file_label[f] ? file_label[f] : cmp->file[f].name,
1028		     _("No newline at end of file"));
1029	      changes = 2;
1030	    }
1031    }
1032
1033  if (cmp->file[0].buffer != cmp->file[1].buffer)
1034    free (cmp->file[0].buffer);
1035  free (cmp->file[1].buffer);
1036
1037  return changes;
1038}
1039