1/* Analyze file differences for GNU DIFF.
2   Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc.
3
4This file is part of GNU DIFF.
5
6GNU DIFF is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU DIFF is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14GNU General Public License for more details.
15
16*/
17
18/* The basic algorithm is described in:
19   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
20   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
21   see especially section 4.2, which describes the variation used below.
22   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
23   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
24   at the price of producing suboptimal output for large inputs with
25   many differences.
26
27   The basic algorithm was independently discovered as described in:
28   "Algorithms for Approximate String Matching", E. Ukkonen,
29   Information and Control Vol. 64, 1985, pp. 100-118.  */
30
31#include "diff.h"
32#include "cmpbuf.h"
33
34extern int no_discards;
35
36static int *xvec, *yvec;	/* Vectors being compared. */
37static int *fdiag;		/* Vector, indexed by diagonal, containing
38				   1 + the X coordinate of the point furthest
39				   along the given diagonal in the forward
40				   search of the edit matrix. */
41static int *bdiag;		/* Vector, indexed by diagonal, containing
42				   the X coordinate of the point furthest
43				   along the given diagonal in the backward
44				   search of the edit matrix. */
45static int too_expensive;	/* Edit scripts longer than this are too
46				   expensive to compute.  */
47
48#define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
49
50struct partition
51{
52  int xmid, ymid;	/* Midpoints of this partition.  */
53  int lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
54  int hi_minimal;	/* Likewise for high half.  */
55};
56
57static int diag PARAMS((int, int, int, int, int, struct partition *));
58static struct change *add_change PARAMS((int, int, int, int, struct change *));
59static struct change *build_reverse_script PARAMS((struct file_data const[]));
60static struct change *build_script PARAMS((struct file_data const[]));
61static void briefly_report PARAMS((int, struct file_data const[]));
62static void compareseq PARAMS((int, int, int, int, int));
63static void discard_confusing_lines PARAMS((struct file_data[]));
64static void shift_boundaries PARAMS((struct file_data[]));
65
66/* Find the midpoint of the shortest edit script for a specified
67   portion of the two files.
68
69   Scan from the beginnings of the files, and simultaneously from the ends,
70   doing a breadth-first search through the space of edit-sequence.
71   When the two searches meet, we have found the midpoint of the shortest
72   edit sequence.
73
74   If MINIMAL is nonzero, find the minimal edit script regardless
75   of expense.  Otherwise, if the search is too expensive, use
76   heuristics to stop the search and report a suboptimal answer.
77
78   Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal number
79   XMID - YMID equals the number of inserted lines minus the number
80   of deleted lines (counting only lines before the midpoint).
81   Return the approximate edit cost; this is the total number of
82   lines inserted or deleted (counting only lines before the midpoint),
83   unless a heuristic is used to terminate the search prematurely.
84
85   Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the
86   left half of the partition is known; similarly for PART->RIGHT_MINIMAL.
87
88   This function assumes that the first lines of the specified portions
89   of the two files do not match, and likewise that the last lines do not
90   match.  The caller must trim matching lines from the beginning and end
91   of the portions it is going to specify.
92
93   If we return the "wrong" partitions,
94   the worst this can do is cause suboptimal diff output.
95   It cannot cause incorrect diff output.  */
96
97static int
98diag (xoff, xlim, yoff, ylim, minimal, part)
99     int xoff, xlim, yoff, ylim, minimal;
100     struct partition *part;
101{
102  int *const fd = fdiag;	/* Give the compiler a chance. */
103  int *const bd = bdiag;	/* Additional help for the compiler. */
104  int const *const xv = xvec;	/* Still more help for the compiler. */
105  int const *const yv = yvec;	/* And more and more . . . */
106  int const dmin = xoff - ylim;	/* Minimum valid diagonal. */
107  int const dmax = xlim - yoff;	/* Maximum valid diagonal. */
108  int const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
109  int const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
110  int fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
111  int bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
112  int c;			/* Cost. */
113  int odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
114				   diagonal with respect to the northwest. */
115
116  fd[fmid] = xoff;
117  bd[bmid] = xlim;
118
119  for (c = 1;; ++c)
120    {
121      int d;			/* Active diagonal. */
122      int big_snake = 0;
123
124      /* Extend the top-down search by an edit step in each diagonal. */
125      fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
126      fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
127      for (d = fmax; d >= fmin; d -= 2)
128	{
129	  int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
130
131	  if (tlo >= thi)
132	    x = tlo + 1;
133	  else
134	    x = thi;
135	  oldx = x;
136	  y = x - d;
137	  while (x < xlim && y < ylim && xv[x] == yv[y])
138	    ++x, ++y;
139	  if (x - oldx > SNAKE_LIMIT)
140	    big_snake = 1;
141	  fd[d] = x;
142	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
143	    {
144	      part->xmid = x;
145	      part->ymid = y;
146	      part->lo_minimal = part->hi_minimal = 1;
147	      return 2 * c - 1;
148	    }
149	}
150
151      /* Similarly extend the bottom-up search.  */
152      bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin;
153      bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax;
154      for (d = bmax; d >= bmin; d -= 2)
155	{
156	  int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
157
158	  if (tlo < thi)
159	    x = tlo;
160	  else
161	    x = thi - 1;
162	  oldx = x;
163	  y = x - d;
164	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
165	    --x, --y;
166	  if (oldx - x > SNAKE_LIMIT)
167	    big_snake = 1;
168	  bd[d] = x;
169	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
170	    {
171	      part->xmid = x;
172	      part->ymid = y;
173	      part->lo_minimal = part->hi_minimal = 1;
174	      return 2 * c;
175	    }
176	}
177
178      if (minimal)
179	continue;
180
181      /* Heuristic: check occasionally for a diagonal that has made
182	 lots of progress compared with the edit distance.
183	 If we have any such, find the one that has made the most
184	 progress and return it as if it had succeeded.
185
186	 With this heuristic, for files with a constant small density
187	 of changes, the algorithm is linear in the file size.  */
188
189      if (c > 200 && big_snake && heuristic)
190	{
191	  int best;
192
193	  best = 0;
194	  for (d = fmax; d >= fmin; d -= 2)
195	    {
196	      int dd = d - fmid;
197	      int x = fd[d];
198	      int y = x - d;
199	      int v = (x - xoff) * 2 - dd;
200	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
201		{
202		  if (v > best
203		      && xoff + SNAKE_LIMIT <= x && x < xlim
204		      && yoff + SNAKE_LIMIT <= y && y < ylim)
205		    {
206		      /* We have a good enough best diagonal;
207			 now insist that it end with a significant snake.  */
208		      int k;
209
210		      for (k = 1; xv[x - k] == yv[y - k]; k++)
211			if (k == SNAKE_LIMIT)
212			  {
213			    best = v;
214			    part->xmid = x;
215			    part->ymid = y;
216			    break;
217			  }
218		    }
219		}
220	    }
221	  if (best > 0)
222	    {
223	      part->lo_minimal = 1;
224	      part->hi_minimal = 0;
225	      return 2 * c - 1;
226	    }
227
228	  best = 0;
229	  for (d = bmax; d >= bmin; d -= 2)
230	    {
231	      int dd = d - bmid;
232	      int x = bd[d];
233	      int y = x - d;
234	      int v = (xlim - x) * 2 + dd;
235	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
236		{
237		  if (v > best
238		      && xoff < x && x <= xlim - SNAKE_LIMIT
239		      && yoff < y && y <= ylim - SNAKE_LIMIT)
240		    {
241		      /* We have a good enough best diagonal;
242			 now insist that it end with a significant snake.  */
243		      int k;
244
245		      for (k = 0; xv[x + k] == yv[y + k]; k++)
246			if (k == SNAKE_LIMIT - 1)
247			  {
248			    best = v;
249			    part->xmid = x;
250			    part->ymid = y;
251			    break;
252			  }
253		    }
254		}
255	    }
256	  if (best > 0)
257	    {
258	      part->lo_minimal = 0;
259	      part->hi_minimal = 1;
260	      return 2 * c - 1;
261	    }
262	}
263
264      /* Heuristic: if we've gone well beyond the call of duty,
265	 give up and report halfway between our best results so far.  */
266      if (c >= too_expensive)
267	{
268	  int fxybest, fxbest;
269	  int bxybest, bxbest;
270
271	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
272
273	  /* Find forward diagonal that maximizes X + Y.  */
274	  fxybest = -1;
275	  for (d = fmax; d >= fmin; d -= 2)
276	    {
277	      int x = min (fd[d], xlim);
278	      int y = x - d;
279	      if (ylim < y)
280		x = ylim + d, y = ylim;
281	      if (fxybest < x + y)
282		{
283		  fxybest = x + y;
284		  fxbest = x;
285		}
286	    }
287
288	  /* Find backward diagonal that minimizes X + Y.  */
289	  bxybest = INT_MAX;
290	  for (d = bmax; d >= bmin; d -= 2)
291	    {
292	      int x = max (xoff, bd[d]);
293	      int y = x - d;
294	      if (y < yoff)
295		x = yoff + d, y = yoff;
296	      if (x + y < bxybest)
297		{
298		  bxybest = x + y;
299		  bxbest = x;
300		}
301	    }
302
303	  /* Use the better of the two diagonals.  */
304	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
305	    {
306	      part->xmid = fxbest;
307	      part->ymid = fxybest - fxbest;
308	      part->lo_minimal = 1;
309	      part->hi_minimal = 0;
310	    }
311	  else
312	    {
313	      part->xmid = bxbest;
314	      part->ymid = bxybest - bxbest;
315	      part->lo_minimal = 0;
316	      part->hi_minimal = 1;
317	    }
318	  return 2 * c - 1;
319	}
320    }
321}
322
323/* Compare in detail contiguous subsequences of the two files
324   which are known, as a whole, to match each other.
325
326   The results are recorded in the vectors files[N].changed_flag, by
327   storing a 1 in the element for each line that is an insertion or deletion.
328
329   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
330
331   Note that XLIM, YLIM are exclusive bounds.
332   All line numbers are origin-0 and discarded lines are not counted.
333
334   If MINIMAL is nonzero, find a minimal difference no matter how
335   expensive it is.  */
336
337static void
338compareseq (xoff, xlim, yoff, ylim, minimal)
339     int xoff, xlim, yoff, ylim, minimal;
340{
341  int * const xv = xvec; /* Help the compiler.  */
342  int * const yv = yvec;
343
344  /* Slide down the bottom initial diagonal. */
345  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
346    ++xoff, ++yoff;
347  /* Slide up the top initial diagonal. */
348  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
349    --xlim, --ylim;
350
351  /* Handle simple cases. */
352  if (xoff == xlim)
353    while (yoff < ylim)
354      files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
355  else if (yoff == ylim)
356    while (xoff < xlim)
357      files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
358  else
359    {
360      int c;
361      struct partition part;
362
363      /* Find a point of correspondence in the middle of the files.  */
364
365      c = diag (xoff, xlim, yoff, ylim, minimal, &part);
366
367      if (c == 1)
368	{
369	  /* This should be impossible, because it implies that
370	     one of the two subsequences is empty,
371	     and that case was handled above without calling `diag'.
372	     Let's verify that this is true.  */
373	  abort ();
374#if 0
375	  /* The two subsequences differ by a single insert or delete;
376	     record it and we are done.  */
377	  if (part.xmid - part.ymid < xoff - yoff)
378	    files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
379	  else
380	    files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
381#endif
382	}
383      else
384	{
385	  /* Use the partitions to split this problem into subproblems.  */
386	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
387	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
388	}
389    }
390}
391
392/* Discard lines from one file that have no matches in the other file.
393
394   A line which is discarded will not be considered by the actual
395   comparison algorithm; it will be as if that line were not in the file.
396   The file's `realindexes' table maps virtual line numbers
397   (which don't count the discarded lines) into real line numbers;
398   this is how the actual comparison algorithm produces results
399   that are comprehensible when the discarded lines are counted.
400
401   When we discard a line, we also mark it as a deletion or insertion
402   so that it will be printed in the output.  */
403
404static void
405discard_confusing_lines (filevec)
406     struct file_data filevec[];
407{
408  unsigned int f, i;
409  char *discarded[2];
410  int *equiv_count[2];
411  int *p;
412
413  /* Allocate our results.  */
414  p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
415		       * (2 * sizeof (int)));
416  for (f = 0; f < 2; f++)
417    {
418      filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
419      filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
420    }
421
422  /* Set up equiv_count[F][I] as the number of lines in file F
423     that fall in equivalence class I.  */
424
425  p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int)));
426  equiv_count[0] = p;
427  equiv_count[1] = p + filevec[0].equiv_max;
428  bzero (p, filevec[0].equiv_max * (2 * sizeof (int)));
429
430  for (i = 0; i < filevec[0].buffered_lines; ++i)
431    ++equiv_count[0][filevec[0].equivs[i]];
432  for (i = 0; i < filevec[1].buffered_lines; ++i)
433    ++equiv_count[1][filevec[1].equivs[i]];
434
435  /* Set up tables of which lines are going to be discarded.  */
436
437  discarded[0] = xmalloc (sizeof (char)
438			  * (filevec[0].buffered_lines
439			     + filevec[1].buffered_lines));
440  discarded[1] = discarded[0] + filevec[0].buffered_lines;
441  bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines
442					+ filevec[1].buffered_lines));
443
444  /* Mark to be discarded each line that matches no line of the other file.
445     If a line matches many lines, mark it as provisionally discardable.  */
446
447  for (f = 0; f < 2; f++)
448    {
449      unsigned int end = filevec[f].buffered_lines;
450      char *discards = discarded[f];
451      int *counts = equiv_count[1 - f];
452      int *equivs = filevec[f].equivs;
453      unsigned int many = 5;
454      unsigned int tem = end / 64;
455
456      /* Multiply MANY by approximate square root of number of lines.
457	 That is the threshold for provisionally discardable lines.  */
458      while ((tem = tem >> 2) > 0)
459	many *= 2;
460
461      for (i = 0; i < end; i++)
462	{
463	  int nmatch;
464	  if (equivs[i] == 0)
465	    continue;
466	  nmatch = counts[equivs[i]];
467	  if (nmatch == 0)
468	    discards[i] = 1;
469	  else if (nmatch > many)
470	    discards[i] = 2;
471	}
472    }
473
474  /* Don't really discard the provisional lines except when they occur
475     in a run of discardables, with nonprovisionals at the beginning
476     and end.  */
477
478  for (f = 0; f < 2; f++)
479    {
480      unsigned int end = filevec[f].buffered_lines;
481      register char *discards = discarded[f];
482
483      for (i = 0; i < end; i++)
484	{
485	  /* Cancel provisional discards not in middle of run of discards.  */
486	  if (discards[i] == 2)
487	    discards[i] = 0;
488	  else if (discards[i] != 0)
489	    {
490	      /* We have found a nonprovisional discard.  */
491	      register int j;
492	      unsigned int length;
493	      unsigned int provisional = 0;
494
495	      /* Find end of this run of discardable lines.
496		 Count how many are provisionally discardable.  */
497	      for (j = i; j < end; j++)
498		{
499		  if (discards[j] == 0)
500		    break;
501		  if (discards[j] == 2)
502		    ++provisional;
503		}
504
505	      /* Cancel provisional discards at end, and shrink the run.  */
506	      while (j > i && discards[j - 1] == 2)
507		discards[--j] = 0, --provisional;
508
509	      /* Now we have the length of a run of discardable lines
510		 whose first and last are not provisional.  */
511	      length = j - i;
512
513	      /* If 1/4 of the lines in the run are provisional,
514		 cancel discarding of all provisional lines in the run.  */
515	      if (provisional * 4 > length)
516		{
517		  while (j > i)
518		    if (discards[--j] == 2)
519		      discards[j] = 0;
520		}
521	      else
522		{
523		  register unsigned int consec;
524		  unsigned int minimum = 1;
525		  unsigned int tem = length / 4;
526
527		  /* MINIMUM is approximate square root of LENGTH/4.
528		     A subrun of two or more provisionals can stand
529		     when LENGTH is at least 16.
530		     A subrun of 4 or more can stand when LENGTH >= 64.  */
531		  while ((tem = tem >> 2) > 0)
532		    minimum *= 2;
533		  minimum++;
534
535		  /* Cancel any subrun of MINIMUM or more provisionals
536		     within the larger run.  */
537		  for (j = 0, consec = 0; j < length; j++)
538		    if (discards[i + j] != 2)
539		      consec = 0;
540		    else if (minimum == ++consec)
541		      /* Back up to start of subrun, to cancel it all.  */
542		      j -= consec;
543		    else if (minimum < consec)
544		      discards[i + j] = 0;
545
546		  /* Scan from beginning of run
547		     until we find 3 or more nonprovisionals in a row
548		     or until the first nonprovisional at least 8 lines in.
549		     Until that point, cancel any provisionals.  */
550		  for (j = 0, consec = 0; j < length; j++)
551		    {
552		      if (j >= 8 && discards[i + j] == 1)
553			break;
554		      if (discards[i + j] == 2)
555			consec = 0, discards[i + j] = 0;
556		      else if (discards[i + j] == 0)
557			consec = 0;
558		      else
559			consec++;
560		      if (consec == 3)
561			break;
562		    }
563
564		  /* I advances to the last line of the run.  */
565		  i += length - 1;
566
567		  /* Same thing, from end.  */
568		  for (j = 0, consec = 0; j < length; j++)
569		    {
570		      if (j >= 8 && discards[i - j] == 1)
571			break;
572		      if (discards[i - j] == 2)
573			consec = 0, discards[i - j] = 0;
574		      else if (discards[i - j] == 0)
575			consec = 0;
576		      else
577			consec++;
578		      if (consec == 3)
579			break;
580		    }
581		}
582	    }
583	}
584    }
585
586  /* Actually discard the lines. */
587  for (f = 0; f < 2; f++)
588    {
589      char *discards = discarded[f];
590      unsigned int end = filevec[f].buffered_lines;
591      unsigned int j = 0;
592      for (i = 0; i < end; ++i)
593	if (no_discards || discards[i] == 0)
594	  {
595	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
596	    filevec[f].realindexes[j++] = i;
597	  }
598	else
599	  filevec[f].changed_flag[i] = 1;
600      filevec[f].nondiscarded_lines = j;
601    }
602
603  free (discarded[0]);
604  free (equiv_count[0]);
605}
606
607/* Adjust inserts/deletes of identical lines to join changes
608   as much as possible.
609
610   We do something when a run of changed lines include a
611   line at one end and have an excluded, identical line at the other.
612   We are free to choose which identical line is included.
613   `compareseq' usually chooses the one at the beginning,
614   but usually it is cleaner to consider the following identical line
615   to be the "change".  */
616
617int inhibit;
618
619static void
620shift_boundaries (filevec)
621     struct file_data filevec[];
622{
623  int f;
624
625  if (inhibit)
626    return;
627
628  for (f = 0; f < 2; f++)
629    {
630      char *changed = filevec[f].changed_flag;
631      char const *other_changed = filevec[1-f].changed_flag;
632      int const *equivs = filevec[f].equivs;
633      int i = 0;
634      int j = 0;
635      int i_end = filevec[f].buffered_lines;
636
637      while (1)
638	{
639	  int runlength, start, corresponding;
640
641	  /* Scan forwards to find beginning of another run of changes.
642	     Also keep track of the corresponding point in the other file.  */
643
644	  while (i < i_end && changed[i] == 0)
645	    {
646	      while (other_changed[j++])
647		continue;
648	      i++;
649	    }
650
651	  if (i == i_end)
652	    break;
653
654	  start = i;
655
656	  /* Find the end of this run of changes.  */
657
658	  while (changed[++i])
659	    continue;
660	  while (other_changed[j])
661	    j++;
662
663	  do
664	    {
665	      /* Record the length of this run of changes, so that
666		 we can later determine whether the run has grown.  */
667	      runlength = i - start;
668
669	      /* Move the changed region back, so long as the
670		 previous unchanged line matches the last changed one.
671		 This merges with previous changed regions.  */
672
673	      while (start && equivs[start - 1] == equivs[i - 1])
674		{
675		  changed[--start] = 1;
676		  changed[--i] = 0;
677		  while (changed[start - 1])
678		    start--;
679		  while (other_changed[--j])
680		    continue;
681		}
682
683	      /* Set CORRESPONDING to the end of the changed run, at the last
684		 point where it corresponds to a changed run in the other file.
685		 CORRESPONDING == I_END means no such point has been found.  */
686	      corresponding = other_changed[j - 1] ? i : i_end;
687
688	      /* Move the changed region forward, so long as the
689		 first changed line matches the following unchanged one.
690		 This merges with following changed regions.
691		 Do this second, so that if there are no merges,
692		 the changed region is moved forward as far as possible.  */
693
694	      while (i != i_end && equivs[start] == equivs[i])
695		{
696		  changed[start++] = 0;
697		  changed[i++] = 1;
698		  while (changed[i])
699		    i++;
700		  while (other_changed[++j])
701		    corresponding = i;
702		}
703	    }
704	  while (runlength != i - start);
705
706	  /* If possible, move the fully-merged run of changes
707	     back to a corresponding run in the other file.  */
708
709	  while (corresponding < i)
710	    {
711	      changed[--start] = 1;
712	      changed[--i] = 0;
713	      while (other_changed[--j])
714		continue;
715	    }
716	}
717    }
718}
719
720/* Cons an additional entry onto the front of an edit script OLD.
721   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
722   DELETED is the number of lines deleted here from file 0.
723   INSERTED is the number of lines inserted here in file 1.
724
725   If DELETED is 0 then LINE0 is the number of the line before
726   which the insertion was done; vice versa for INSERTED and LINE1.  */
727
728static struct change *
729add_change (line0, line1, deleted, inserted, old)
730     int line0, line1, deleted, inserted;
731     struct change *old;
732{
733  struct change *new = (struct change *) xmalloc (sizeof (struct change));
734
735  new->line0 = line0;
736  new->line1 = line1;
737  new->inserted = inserted;
738  new->deleted = deleted;
739  new->link = old;
740  return new;
741}
742
743/* Scan the tables of which lines are inserted and deleted,
744   producing an edit script in reverse order.  */
745
746static struct change *
747build_reverse_script (filevec)
748     struct file_data const filevec[];
749{
750  struct change *script = 0;
751  char *changed0 = filevec[0].changed_flag;
752  char *changed1 = filevec[1].changed_flag;
753  int len0 = filevec[0].buffered_lines;
754  int len1 = filevec[1].buffered_lines;
755
756  /* Note that changedN[len0] does exist, and contains 0.  */
757
758  int i0 = 0, i1 = 0;
759
760  while (i0 < len0 || i1 < len1)
761    {
762      if (changed0[i0] || changed1[i1])
763	{
764	  int line0 = i0, line1 = i1;
765
766	  /* Find # lines changed here in each file.  */
767	  while (changed0[i0]) ++i0;
768	  while (changed1[i1]) ++i1;
769
770	  /* Record this change.  */
771	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
772	}
773
774      /* We have reached lines in the two files that match each other.  */
775      i0++, i1++;
776    }
777
778  return script;
779}
780
781/* Scan the tables of which lines are inserted and deleted,
782   producing an edit script in forward order.  */
783
784static struct change *
785build_script (filevec)
786     struct file_data const filevec[];
787{
788  struct change *script = 0;
789  char *changed0 = filevec[0].changed_flag;
790  char *changed1 = filevec[1].changed_flag;
791  int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
792
793  /* Note that changedN[-1] does exist, and contains 0.  */
794
795  while (i0 >= 0 || i1 >= 0)
796    {
797      if (changed0[i0 - 1] || changed1[i1 - 1])
798	{
799	  int line0 = i0, line1 = i1;
800
801	  /* Find # lines changed here in each file.  */
802	  while (changed0[i0 - 1]) --i0;
803	  while (changed1[i1 - 1]) --i1;
804
805	  /* Record this change.  */
806	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
807	}
808
809      /* We have reached lines in the two files that match each other.  */
810      i0--, i1--;
811    }
812
813  return script;
814}
815
816/* If CHANGES, briefly report that two files differed.  */
817static void
818briefly_report (changes, filevec)
819     int changes;
820     struct file_data const filevec[];
821{
822  if (changes)
823    message (no_details_flag ? "Files %s and %s differ\n"
824	     : "Binary files %s and %s differ\n",
825	     filevec[0].name, filevec[1].name);
826}
827
828/* Report the differences of two files.  DEPTH is the current directory
829   depth. */
830int
831diff_2_files (filevec, depth)
832     struct file_data filevec[];
833     int depth;
834{
835  int diags;
836  int i;
837  struct change *e, *p;
838  struct change *script;
839  int changes;
840
841
842  /* If we have detected that either file is binary,
843     compare the two files as binary.  This can happen
844     only when the first chunk is read.
845     Also, --brief without any --ignore-* options means
846     we can speed things up by treating the files as binary.  */
847
848  if (read_files (filevec, no_details_flag & ~ignore_some_changes))
849    {
850      /* Files with different lengths must be different.  */
851      if (filevec[0].stat.st_size != filevec[1].stat.st_size
852	  && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode))
853	  && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode)))
854	changes = 1;
855
856      /* Standard input equals itself.  */
857      else if (filevec[0].desc == filevec[1].desc)
858	changes = 0;
859
860      else
861	/* Scan both files, a buffer at a time, looking for a difference.  */
862	{
863	  /* Allocate same-sized buffers for both files.  */
864	  size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat),
865					   STAT_BLOCKSIZE (filevec[1].stat));
866	  for (i = 0; i < 2; i++)
867	    filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size);
868
869	  for (;;  filevec[0].buffered_chars = filevec[1].buffered_chars = 0)
870	    {
871	      /* Read a buffer's worth from both files.  */
872	      for (i = 0; i < 2; i++)
873		if (0 <= filevec[i].desc)
874		  while (filevec[i].buffered_chars != buffer_size)
875		    {
876		      int r = read (filevec[i].desc,
877				    filevec[i].buffer
878				    + filevec[i].buffered_chars,
879				    buffer_size - filevec[i].buffered_chars);
880		      if (r == 0)
881			break;
882		      if (r < 0)
883			pfatal_with_name (filevec[i].name);
884		      filevec[i].buffered_chars += r;
885		    }
886
887	      /* If the buffers differ, the files differ.  */
888	      if (filevec[0].buffered_chars != filevec[1].buffered_chars
889		  || (filevec[0].buffered_chars != 0
890		      && memcmp (filevec[0].buffer,
891				 filevec[1].buffer,
892				 filevec[0].buffered_chars) != 0))
893		{
894		  changes = 1;
895		  break;
896		}
897
898	      /* If we reach end of file, the files are the same.  */
899	      if (filevec[0].buffered_chars != buffer_size)
900		{
901		  changes = 0;
902		  break;
903		}
904	    }
905	}
906
907      briefly_report (changes, filevec);
908    }
909  else
910    {
911      /* Allocate vectors for the results of comparison:
912	 a flag for each line of each file, saying whether that line
913	 is an insertion or deletion.
914	 Allocate an extra element, always zero, at each end of each vector.  */
915
916      size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
917      filevec[0].changed_flag = xmalloc (s);
918      bzero (filevec[0].changed_flag, s);
919      filevec[0].changed_flag++;
920      filevec[1].changed_flag = filevec[0].changed_flag
921				+ filevec[0].buffered_lines + 2;
922
923      /* Some lines are obviously insertions or deletions
924	 because they don't match anything.  Detect them now, and
925	 avoid even thinking about them in the main comparison algorithm.  */
926
927      discard_confusing_lines (filevec);
928
929      /* Now do the main comparison algorithm, considering just the
930	 undiscarded lines.  */
931
932      xvec = filevec[0].undiscarded;
933      yvec = filevec[1].undiscarded;
934      diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
935      fdiag = (int *) xmalloc (diags * (2 * sizeof (int)));
936      bdiag = fdiag + diags;
937      fdiag += filevec[1].nondiscarded_lines + 1;
938      bdiag += filevec[1].nondiscarded_lines + 1;
939
940      /* Set TOO_EXPENSIVE to be approximate square root of input size,
941	 bounded below by 256.  */
942      too_expensive = 1;
943      for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
944	   i != 0; i >>= 2)
945	too_expensive <<= 1;
946      too_expensive = max (256, too_expensive);
947
948      files[0] = filevec[0];
949      files[1] = filevec[1];
950
951      compareseq (0, filevec[0].nondiscarded_lines,
952		  0, filevec[1].nondiscarded_lines, no_discards);
953
954      free (fdiag - (filevec[1].nondiscarded_lines + 1));
955
956      /* Modify the results slightly to make them prettier
957	 in cases where that can validly be done.  */
958
959      shift_boundaries (filevec);
960
961      /* Get the results of comparison in the form of a chain
962	 of `struct change's -- an edit script.  */
963
964      if (output_style == OUTPUT_ED)
965	script = build_reverse_script (filevec);
966      else
967	script = build_script (filevec);
968
969      /* Set CHANGES if we had any diffs.
970	 If some changes are ignored, we must scan the script to decide.  */
971      if (ignore_blank_lines_flag || ignore_regexp_list)
972	{
973	  struct change *next = script;
974	  changes = 0;
975
976	  while (next && changes == 0)
977	    {
978	      struct change *this, *end;
979	      int first0, last0, first1, last1, deletes, inserts;
980
981	      /* Find a set of changes that belong together.  */
982	      this = next;
983	      end = find_change (next);
984
985	      /* Disconnect them from the rest of the changes, making them
986		 a hunk, and remember the rest for next iteration.  */
987	      next = end->link;
988	      end->link = 0;
989
990	      /* Determine whether this hunk is really a difference.  */
991	      analyze_hunk (this, &first0, &last0, &first1, &last1,
992			    &deletes, &inserts);
993
994	      /* Reconnect the script so it will all be freed properly.  */
995	      end->link = next;
996
997	      if (deletes || inserts)
998		changes = 1;
999	    }
1000	}
1001      else
1002	changes = (script != 0);
1003
1004      if (no_details_flag)
1005	briefly_report (changes, filevec);
1006      else
1007	{
1008	  if (changes || ! no_diff_means_no_output)
1009	    {
1010	      /* Record info for starting up output,
1011		 to be used if and when we have some output to print.  */
1012	      setup_output (files[0].name, files[1].name, depth);
1013
1014	      switch (output_style)
1015		{
1016		case OUTPUT_CONTEXT:
1017		  print_context_script (script, 0);
1018		  break;
1019
1020		case OUTPUT_UNIFIED:
1021		  print_context_script (script, 1);
1022		  break;
1023
1024		case OUTPUT_ED:
1025		  print_ed_script (script);
1026		  break;
1027
1028		case OUTPUT_FORWARD_ED:
1029		  pr_forward_ed_script (script);
1030		  break;
1031
1032		case OUTPUT_RCS:
1033		  print_rcs_script (script);
1034		  break;
1035
1036		case OUTPUT_NORMAL:
1037		  print_normal_script (script);
1038		  break;
1039
1040		case OUTPUT_IFDEF:
1041		  print_ifdef_script (script);
1042		  break;
1043
1044		case OUTPUT_SDIFF:
1045		  print_sdiff_script (script);
1046		}
1047
1048	      finish_output ();
1049	    }
1050	}
1051
1052      free (filevec[0].undiscarded);
1053
1054      free (filevec[0].changed_flag - 1);
1055
1056      for (i = 1; i >= 0; --i)
1057	free (filevec[i].equivs);
1058
1059      for (i = 0; i < 2; ++i)
1060	free (filevec[i].linbuf + filevec[i].linbuf_base);
1061
1062      for (e = script; e; e = p)
1063	{
1064	  p = e->link;
1065	  free (e);
1066	}
1067
1068      if (! ROBUST_OUTPUT_STYLE (output_style))
1069	for (i = 0; i < 2; ++i)
1070	  if (filevec[i].missing_newline)
1071	    {
1072	      diff_error ("No newline at end of file %s", filevec[i].name, "");
1073	      changes = 2;
1074	    }
1075    }
1076
1077  if (filevec[0].buffer != filevec[1].buffer)
1078    free (filevec[0].buffer);
1079  free (filevec[1].buffer);
1080
1081  return changes;
1082}
1083