io.c revision 170754
1/* File I/O for GNU DIFF.
2
3   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4   2004 Free Software Foundation, Inc.
5
6   This file is part of GNU DIFF.
7
8   GNU DIFF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU DIFF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation,
21   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22
23#include "diff.h"
24#include <cmpbuf.h>
25#include <file-type.h>
26#include <setmode.h>
27#include <xalloc.h>
28
29/* Rotate an unsigned value to the left.  */
30#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31
32/* Given a hash value and a new character, return a new hash value.  */
33#define HASH(h, c) ((c) + ROL (h, 7))
34
35/* The type of a hash value.  */
36typedef size_t hash_value;
37verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38
39/* Lines are put into equivalence classes of lines that match in lines_differ.
40   Each equivalence class is represented by one of these structures,
41   but only while the classes are being computed.
42   Afterward, each class is represented by a number.  */
43struct equivclass
44{
45  lin next;		/* Next item in this bucket.  */
46  hash_value hash;	/* Hash of lines in this class.  */
47  char const *line;	/* A line that fits this class.  */
48  size_t length;	/* That line's length, not counting its newline.  */
49};
50
51/* Hash-table: array of buckets, each being a chain of equivalence classes.
52   buckets[-1] is reserved for incomplete lines.  */
53static lin *buckets;
54
55/* Number of buckets in the hash table array, not counting buckets[-1].  */
56static size_t nbuckets;
57
58/* Array in which the equivalence classes are allocated.
59   The bucket-chains go through the elements in this array.
60   The number of an equivalence class is its index in this array.  */
61static struct equivclass *equivs;
62
63/* Index of first free element in the array `equivs'.  */
64static lin equivs_index;
65
66/* Number of elements allocated in the array `equivs'.  */
67static lin equivs_alloc;
68
69/* Read a block of data into a file buffer, checking for EOF and error.  */
70
71void
72file_block_read (struct file_data *current, size_t size)
73{
74  if (size && ! current->eof)
75    {
76      size_t s = block_read (current->desc,
77			     FILE_BUFFER (current) + current->buffered, size);
78      if (s == SIZE_MAX)
79	pfatal_with_name (current->name);
80      current->buffered += s;
81      current->eof = s < size;
82    }
83}
84
85/* Check for binary files and compare them for exact identity.  */
86
87/* Return 1 if BUF contains a non text character.
88   SIZE is the number of characters in BUF.  */
89
90#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91
92/* Get ready to read the current file.
93   Return nonzero if SKIP_TEST is zero,
94   and if it appears to be a binary file.  */
95
96static bool
97sip (struct file_data *current, bool skip_test)
98{
99  /* If we have a nonexistent file at this stage, treat it as empty.  */
100  if (current->desc < 0)
101    {
102      /* Leave room for a sentinel.  */
103      current->bufsize = sizeof (word);
104      current->buffer = xmalloc (current->bufsize);
105    }
106  else
107    {
108      current->bufsize = buffer_lcm (sizeof (word),
109				     STAT_BLOCKSIZE (current->stat),
110				     PTRDIFF_MAX - 2 * sizeof (word));
111      current->buffer = xmalloc (current->bufsize);
112
113      if (! skip_test)
114	{
115	  /* Check first part of file to see if it's a binary file.  */
116
117	  bool was_binary = set_binary_mode (current->desc, true);
118	  off_t buffered;
119	  file_block_read (current, current->bufsize);
120	  buffered = current->buffered;
121
122	  if (! was_binary)
123	    {
124	      /* Revert to text mode and seek back to the beginning to
125		 reread the file.  Use relative seek, since file
126		 descriptors like stdin might not start at offset
127		 zero.  */
128
129	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130		pfatal_with_name (current->name);
131	      set_binary_mode (current->desc, false);
132	      current->buffered = 0;
133	      current->eof = false;
134	    }
135
136	  return binary_file_p (current->buffer, buffered);
137	}
138    }
139
140  current->buffered = 0;
141  current->eof = false;
142  return false;
143}
144
145/* Slurp the rest of the current file completely into memory.  */
146
147static void
148slurp (struct file_data *current)
149{
150  size_t cc;
151
152  if (current->desc < 0)
153    {
154      /* The file is nonexistent.  */
155      return;
156    }
157
158  if (S_ISREG (current->stat.st_mode))
159    {
160      /* It's a regular file; slurp in the rest all at once.  */
161
162      /* Get the size out of the stat block.
163	 Allocate just enough room for appended newline plus word sentinel,
164	 plus word-alignment since we want the buffer word-aligned.  */
165      size_t file_size = current->stat.st_size;
166      cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167      if (file_size != current->stat.st_size || cc < file_size
168	  || PTRDIFF_MAX <= cc)
169	xalloc_die ();
170
171      if (current->bufsize < cc)
172	{
173	  current->bufsize = cc;
174	  current->buffer = xrealloc (current->buffer, cc);
175	}
176
177      /* Try to read at least 1 more byte than the size indicates, to
178	 detect whether the file is growing.  This is a nicety for
179	 users who run 'diff' on files while they are changing.  */
180
181      if (current->buffered <= file_size)
182	{
183	  file_block_read (current, file_size + 1 - current->buffered);
184	  if (current->buffered <= file_size)
185	    return;
186	}
187    }
188
189  /* It's not a regular file, or it's a growing regular file; read it,
190     growing the buffer as needed.  */
191
192  file_block_read (current, current->bufsize - current->buffered);
193
194  if (current->buffered)
195    {
196      while (current->buffered == current->bufsize)
197	{
198	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199	    xalloc_die ();
200	  current->bufsize *= 2;
201	  current->buffer = xrealloc (current->buffer, current->bufsize);
202	  file_block_read (current, current->bufsize - current->buffered);
203	}
204
205      /* Allocate just enough room for appended newline plus word
206	 sentinel, plus word-alignment.  */
207      cc = current->buffered + 2 * sizeof (word);
208      current->bufsize = cc - cc % sizeof (word);
209      current->buffer = xrealloc (current->buffer, current->bufsize);
210    }
211}
212
213/* Split the file into lines, simultaneously computing the equivalence
214   class for each line.  */
215
216static void
217find_and_hash_each_line (struct file_data *current)
218{
219  hash_value h;
220  char const *p = current->prefix_end;
221  unsigned char c;
222  lin i, *bucket;
223  size_t length;
224
225  /* Cache often-used quantities in local variables to help the compiler.  */
226  char const **linbuf = current->linbuf;
227  lin alloc_lines = current->alloc_lines;
228  lin line = 0;
229  lin linbuf_base = current->linbuf_base;
230  lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231  struct equivclass *eqs = equivs;
232  lin eqs_index = equivs_index;
233  lin eqs_alloc = equivs_alloc;
234  char const *suffix_begin = current->suffix_begin;
235  char const *bufend = FILE_BUFFER (current) + current->buffered;
236  bool diff_length_compare_anyway =
237    ignore_white_space != IGNORE_NO_WHITE_SPACE;
238  bool same_length_diff_contents_compare_anyway =
239    diff_length_compare_anyway | ignore_case;
240
241  while (p < suffix_begin)
242    {
243      char const *ip = p;
244
245      h = 0;
246
247      /* Hash this line until we find a newline.  */
248      if (ignore_case)
249	switch (ignore_white_space)
250	  {
251	  case IGNORE_ALL_SPACE:
252	    while ((c = *p++) != '\n')
253	      if (! isspace (c))
254		h = HASH (h, tolower (c));
255	    break;
256
257	  case IGNORE_SPACE_CHANGE:
258	    while ((c = *p++) != '\n')
259	      {
260		if (isspace (c))
261		  {
262		    do
263		      if ((c = *p++) == '\n')
264			goto hashing_done;
265		    while (isspace (c));
266
267		    h = HASH (h, ' ');
268		  }
269
270		/* C is now the first non-space.  */
271		h = HASH (h, tolower (c));
272	      }
273	    break;
274
275	  case IGNORE_TAB_EXPANSION:
276	    {
277	      size_t column = 0;
278	      while ((c = *p++) != '\n')
279		{
280		  size_t repetitions = 1;
281
282		  switch (c)
283		    {
284		    case '\b':
285		      column -= 0 < column;
286		      break;
287
288		    case '\t':
289		      c = ' ';
290		      repetitions = tabsize - column % tabsize;
291		      column = (column + repetitions < column
292				? 0
293				: column + repetitions);
294		      break;
295
296		    case '\r':
297		      column = 0;
298		      break;
299
300		    default:
301		      c = tolower (c);
302		      column++;
303		      break;
304		    }
305
306		  do
307		    h = HASH (h, c);
308		  while (--repetitions != 0);
309		}
310	    }
311	    break;
312
313	  default:
314	    while ((c = *p++) != '\n')
315	      h = HASH (h, tolower (c));
316	    break;
317	  }
318      else
319	switch (ignore_white_space)
320	  {
321	  case IGNORE_ALL_SPACE:
322	    while ((c = *p++) != '\n')
323	      if (! isspace (c))
324		h = HASH (h, c);
325	    break;
326
327	  case IGNORE_SPACE_CHANGE:
328	    while ((c = *p++) != '\n')
329	      {
330		if (isspace (c))
331		  {
332		    do
333		      if ((c = *p++) == '\n')
334			goto hashing_done;
335		    while (isspace (c));
336
337		    h = HASH (h, ' ');
338		  }
339
340		/* C is now the first non-space.  */
341		h = HASH (h, c);
342	      }
343	    break;
344
345	  case IGNORE_TAB_EXPANSION:
346	    {
347	      size_t column = 0;
348	      while ((c = *p++) != '\n')
349		{
350		  size_t repetitions = 1;
351
352		  switch (c)
353		    {
354		    case '\b':
355		      column -= 0 < column;
356		      break;
357
358		    case '\t':
359		      c = ' ';
360		      repetitions = tabsize - column % tabsize;
361		      column = (column + repetitions < column
362				? 0
363				: column + repetitions);
364		      break;
365
366		    case '\r':
367		      column = 0;
368		      break;
369
370		    default:
371		      column++;
372		      break;
373		    }
374
375		  do
376		    h = HASH (h, c);
377		  while (--repetitions != 0);
378		}
379	    }
380	    break;
381
382	  default:
383	    while ((c = *p++) != '\n')
384	      h = HASH (h, c);
385	    break;
386	  }
387
388   hashing_done:;
389
390      bucket = &buckets[h % nbuckets];
391      length = p - ip - 1;
392
393      if (p == bufend
394	  && current->missing_newline
395	  && ROBUST_OUTPUT_STYLE (output_style))
396	{
397	  /* This line is incomplete.  If this is significant,
398	     put the line into buckets[-1].  */
399	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
400	    bucket = &buckets[-1];
401
402	  /* Omit the inserted newline when computing linbuf later.  */
403	  p--;
404	  bufend = suffix_begin = p;
405	}
406
407      for (i = *bucket;  ;  i = eqs[i].next)
408	if (!i)
409	  {
410	    /* Create a new equivalence class in this bucket.  */
411	    i = eqs_index++;
412	    if (i == eqs_alloc)
413	      {
414		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415		  xalloc_die ();
416		eqs_alloc *= 2;
417		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418	      }
419	    eqs[i].next = *bucket;
420	    eqs[i].hash = h;
421	    eqs[i].line = ip;
422	    eqs[i].length = length;
423	    *bucket = i;
424	    break;
425	  }
426	else if (eqs[i].hash == h)
427	  {
428	    char const *eqline = eqs[i].line;
429
430	    /* Reuse existing class if lines_differ reports the lines
431               equal.  */
432	    if (eqs[i].length == length)
433	      {
434		/* Reuse existing equivalence class if the lines are identical.
435		   This detects the common case of exact identity
436		   faster than lines_differ would.  */
437		if (memcmp (eqline, ip, length) == 0)
438		  break;
439		if (!same_length_diff_contents_compare_anyway)
440		  continue;
441	      }
442	    else if (!diff_length_compare_anyway)
443	      continue;
444
445	    if (! lines_differ (eqline, ip))
446	      break;
447	  }
448
449      /* Maybe increase the size of the line table.  */
450      if (line == alloc_lines)
451	{
452	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
453	  if (PTRDIFF_MAX / 3 <= alloc_lines
454	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456	    xalloc_die ();
457	  alloc_lines = 2 * alloc_lines - linbuf_base;
458	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459	  linbuf += linbuf_base;
460	  linbuf = xrealloc (linbuf,
461			     (alloc_lines - linbuf_base) * sizeof *linbuf);
462	  linbuf -= linbuf_base;
463	}
464      linbuf[line] = ip;
465      cureqs[line] = i;
466      ++line;
467    }
468
469  current->buffered_lines = line;
470
471  for (i = 0;  ;  i++)
472    {
473      /* Record the line start for lines in the suffix that we care about.
474	 Record one more line start than lines,
475	 so that we can compute the length of any buffered line.  */
476      if (line == alloc_lines)
477	{
478	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
479	  if (PTRDIFF_MAX / 3 <= alloc_lines
480	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482	    xalloc_die ();
483	  alloc_lines = 2 * alloc_lines - linbuf_base;
484	  linbuf += linbuf_base;
485	  linbuf = xrealloc (linbuf,
486			     (alloc_lines - linbuf_base) * sizeof *linbuf);
487	  linbuf -= linbuf_base;
488	}
489      linbuf[line] = p;
490
491      if (p == bufend)
492	break;
493
494      if (context <= i && no_diff_means_no_output)
495	break;
496
497      line++;
498
499      while (*p++ != '\n')
500	continue;
501    }
502
503  /* Done with cache in local variables.  */
504  current->linbuf = linbuf;
505  current->valid_lines = line;
506  current->alloc_lines = alloc_lines;
507  current->equivs = cureqs;
508  equivs = eqs;
509  equivs_alloc = eqs_alloc;
510  equivs_index = eqs_index;
511}
512
513/* Prepare the text.  Make sure the text end is initialized.
514   Make sure text ends in a newline,
515   but remember that we had to add one.
516   Strip trailing CRs, if that was requested.  */
517
518static void
519prepare_text (struct file_data *current)
520{
521  size_t buffered = current->buffered;
522  char *p = FILE_BUFFER (current);
523  char *dst;
524
525  if (buffered == 0 || p[buffered - 1] == '\n')
526    current->missing_newline = false;
527  else
528    {
529      p[buffered++] = '\n';
530      current->missing_newline = true;
531    }
532
533  if (!p)
534    return;
535
536  /* Don't use uninitialized storage when planting or using sentinels.  */
537  memset (p + buffered, 0, sizeof (word));
538
539  if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
540    {
541      char const *src = dst;
542      char const *srclim = p + buffered;
543
544      do
545	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
546      while (src < srclim);
547
548      buffered -= src - dst;
549    }
550
551  current->buffered = buffered;
552}
553
554/* We have found N lines in a buffer of size S; guess the
555   proportionate number of lines that will be found in a buffer of
556   size T.  However, do not guess a number of lines so large that the
557   resulting line table might cause overflow in size calculations.  */
558static lin
559guess_lines (lin n, size_t s, size_t t)
560{
561  size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
562  lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
563  return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
564}
565
566/* Given a vector of two file_data objects, find the identical
567   prefixes and suffixes of each object.  */
568
569static void
570find_identical_ends (struct file_data filevec[])
571{
572  word *w0, *w1;
573  char *p0, *p1, *buffer0, *buffer1;
574  char const *end0, *beg0;
575  char const **linbuf0, **linbuf1;
576  lin i, lines;
577  size_t n0, n1;
578  lin alloc_lines0, alloc_lines1;
579  lin buffered_prefix, prefix_count, prefix_mask;
580  lin middle_guess, suffix_guess;
581
582  slurp (&filevec[0]);
583  prepare_text (&filevec[0]);
584  if (filevec[0].desc != filevec[1].desc)
585    {
586      slurp (&filevec[1]);
587      prepare_text (&filevec[1]);
588    }
589  else
590    {
591      filevec[1].buffer = filevec[0].buffer;
592      filevec[1].bufsize = filevec[0].bufsize;
593      filevec[1].buffered = filevec[0].buffered;
594      filevec[1].missing_newline = filevec[0].missing_newline;
595    }
596
597  /* Find identical prefix.  */
598
599  w0 = filevec[0].buffer;
600  w1 = filevec[1].buffer;
601  p0 = buffer0 = (char *) w0;
602  p1 = buffer1 = (char *) w1;
603  n0 = filevec[0].buffered;
604  n1 = filevec[1].buffered;
605
606  if (p0 == p1)
607    /* The buffers are the same; sentinels won't work.  */
608    p0 = p1 += n1;
609  else
610    {
611      /* Insert end sentinels, in this case characters that are guaranteed
612	 to make the equality test false, and thus terminate the loop.  */
613
614      if (n0 < n1)
615	p0[n0] = ~p1[n0];
616      else
617	p1[n1] = ~p0[n1];
618
619      /* Loop until first mismatch, or to the sentinel characters.  */
620
621      /* Compare a word at a time for speed.  */
622      while (*w0 == *w1)
623	w0++, w1++;
624
625      /* Do the last few bytes of comparison a byte at a time.  */
626      p0 = (char *) w0;
627      p1 = (char *) w1;
628      while (*p0 == *p1)
629	p0++, p1++;
630
631      /* Don't mistakenly count missing newline as part of prefix.  */
632      if (ROBUST_OUTPUT_STYLE (output_style)
633	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
634	      !=
635	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
636	p0--, p1--;
637    }
638
639  /* Now P0 and P1 point at the first nonmatching characters.  */
640
641  /* Skip back to last line-beginning in the prefix,
642     and then discard up to HORIZON_LINES lines from the prefix.  */
643  i = horizon_lines;
644  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
645    p0--, p1--;
646
647  /* Record the prefix.  */
648  filevec[0].prefix_end = p0;
649  filevec[1].prefix_end = p1;
650
651  /* Find identical suffix.  */
652
653  /* P0 and P1 point beyond the last chars not yet compared.  */
654  p0 = buffer0 + n0;
655  p1 = buffer1 + n1;
656
657  if (! ROBUST_OUTPUT_STYLE (output_style)
658      || filevec[0].missing_newline == filevec[1].missing_newline)
659    {
660      end0 = p0;	/* Addr of last char in file 0.  */
661
662      /* Get value of P0 at which we should stop scanning backward:
663	 this is when either P0 or P1 points just past the last char
664	 of the identical prefix.  */
665      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
666
667      /* Scan back until chars don't match or we reach that point.  */
668      for (; p0 != beg0; p0--, p1--)
669	if (*p0 != *p1)
670	  {
671	    /* Point at the first char of the matching suffix.  */
672	    beg0 = p0;
673	    break;
674	  }
675
676      /* Are we at a line-beginning in both files?  If not, add the rest of
677	 this line to the main body.  Discard up to HORIZON_LINES lines from
678	 the identical suffix.  Also, discard one extra line,
679	 because shift_boundaries may need it.  */
680      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
681			    &&
682			    (buffer1 == p1 || p1[-1] == '\n'));
683      while (i-- && p0 != end0)
684	while (*p0++ != '\n')
685	  continue;
686
687      p1 += p0 - beg0;
688    }
689
690  /* Record the suffix.  */
691  filevec[0].suffix_begin = p0;
692  filevec[1].suffix_begin = p1;
693
694  /* Calculate number of lines of prefix to save.
695
696     prefix_count == 0 means save the whole prefix;
697     we need this for options like -D that output the whole file,
698     or for enormous contexts (to avoid worrying about arithmetic overflow).
699     We also need it for options like -F that output some preceding line;
700     at least we will need to find the last few lines,
701     but since we don't know how many, it's easiest to find them all.
702
703     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
704     of the line buffer; they'll be moved to the proper location later.
705     Handle 1 more line than the context says (because we count 1 too many),
706     rounded up to the next power of 2 to speed index computation.  */
707
708  if (no_diff_means_no_output && ! function_regexp.fastmap
709      && context < LIN_MAX / 4 && context < n0)
710    {
711      middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
712      suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
713      for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
714	continue;
715      alloc_lines0 = (prefix_count + middle_guess
716		      + MIN (context, suffix_guess));
717    }
718  else
719    {
720      prefix_count = 0;
721      alloc_lines0 = guess_lines (0, 0, n0);
722    }
723
724  prefix_mask = prefix_count - 1;
725  lines = 0;
726  linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
727  p0 = buffer0;
728
729  /* If the prefix is needed, find the prefix lines.  */
730  if (! (no_diff_means_no_output
731	 && filevec[0].prefix_end == p0
732	 && filevec[1].prefix_end == p1))
733    {
734      end0 = filevec[0].prefix_end;
735      while (p0 != end0)
736	{
737	  lin l = lines++ & prefix_mask;
738	  if (l == alloc_lines0)
739	    {
740	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
741		xalloc_die ();
742	      alloc_lines0 *= 2;
743	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
744	    }
745	  linbuf0[l] = p0;
746	  while (*p0++ != '\n')
747	    continue;
748	}
749    }
750  buffered_prefix = prefix_count && context < lines ? context : lines;
751
752  /* Allocate line buffer 1.  */
753
754  middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
755  suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
756  alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
757  if (alloc_lines1 < buffered_prefix
758      || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
759    xalloc_die ();
760  linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
761
762  if (buffered_prefix != lines)
763    {
764      /* Rotate prefix lines to proper location.  */
765      for (i = 0;  i < buffered_prefix;  i++)
766	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
767      for (i = 0;  i < buffered_prefix;  i++)
768	linbuf0[i] = linbuf1[i];
769    }
770
771  /* Initialize line buffer 1 from line buffer 0.  */
772  for (i = 0; i < buffered_prefix; i++)
773    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
774
775  /* Record the line buffer, adjusted so that
776     linbuf[0] points at the first differing line.  */
777  filevec[0].linbuf = linbuf0 + buffered_prefix;
778  filevec[1].linbuf = linbuf1 + buffered_prefix;
779  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
780  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
781  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
782  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
783}
784
785/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786   than 2**k.  This table is derived from Chris K. Caldwell's list
787   <http://www.utm.edu/research/primes/lists/2small/>.  */
788
789static unsigned char const prime_offset[] =
790{
791  0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792  15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793  11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
794  55, 93, 1, 57, 25
795};
796
797/* Verify that this host's size_t is not too wide for the above table.  */
798
799verify (enough_prime_offsets,
800	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
801
802/* Given a vector of two file_data objects, read the file associated
803   with each one, and build the table of equivalence classes.
804   Return nonzero if either file appears to be a binary file.
805   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
806
807bool
808read_files (struct file_data filevec[], bool pretend_binary)
809{
810  int i;
811  bool skip_test = text | pretend_binary;
812  bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
813
814  if (filevec[0].desc != filevec[1].desc)
815    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
816  else
817    {
818      filevec[1].buffer = filevec[0].buffer;
819      filevec[1].bufsize = filevec[0].bufsize;
820      filevec[1].buffered = filevec[0].buffered;
821    }
822  if (appears_binary)
823    {
824      set_binary_mode (filevec[0].desc, true);
825      set_binary_mode (filevec[1].desc, true);
826      return true;
827    }
828
829  find_identical_ends (filevec);
830
831  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
832  if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
833    xalloc_die ();
834  equivs = xmalloc (equivs_alloc * sizeof *equivs);
835  /* Equivalence class 0 is permanently safe for lines that were not
836     hashed.  Real equivalence classes start at 1.  */
837  equivs_index = 1;
838
839  /* Allocate (one plus) a prime number of hash buckets.  Use a prime
840     number between 1/3 and 2/3 of the value of equiv_allocs,
841     approximately.  */
842  for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
843    continue;
844  nbuckets = ((size_t) 1 << i) - prime_offset[i];
845  if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
846    xalloc_die ();
847  buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
848  buckets++;
849
850  for (i = 0; i < 2; i++)
851    find_and_hash_each_line (&filevec[i]);
852
853  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
854
855  free (equivs);
856  free (buckets - 1);
857
858  return false;
859}
860