1/* File I/O for GNU DIFF.
2
3   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
4   Free Software Foundation, Inc.
5
6   This file is part of GNU DIFF.
7
8   GNU DIFF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU DIFF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation,
21   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22
23#include "diff.h"
24#include <cmpbuf.h>
25#include <regex.h>
26#include <setmode.h>
27#include <xalloc.h>
28
29/* Rotate an unsigned value to the left.  */
30#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31
32/* Given a hash value and a new character, return a new hash value.  */
33#define HASH(h, c) ((c) + ROL (h, 7))
34
35/* The type of a hash value.  */
36typedef size_t hash_value;
37verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38
39/* Lines are put into equivalence classes of lines that match in lines_differ.
40   Each equivalence class is represented by one of these structures,
41   but only while the classes are being computed.
42   Afterward, each class is represented by a number.  */
43struct equivclass
44{
45  lin next;		/* Next item in this bucket.  */
46  hash_value hash;	/* Hash of lines in this class.  */
47  char const *line;	/* A line that fits this class.  */
48  size_t length;	/* That line's length, not counting its newline.  */
49};
50
51/* Hash-table: array of buckets, each being a chain of equivalence classes.
52   buckets[-1] is reserved for incomplete lines.  */
53static lin *buckets;
54
55/* Number of buckets in the hash table array, not counting buckets[-1].  */
56static size_t nbuckets;
57
58/* Array in which the equivalence classes are allocated.
59   The bucket-chains go through the elements in this array.
60   The number of an equivalence class is its index in this array.  */
61static struct equivclass *equivs;
62
63/* Index of first free element in the array `equivs'.  */
64static lin equivs_index;
65
66/* Number of elements allocated in the array `equivs'.  */
67static lin equivs_alloc;
68
69/* Read a block of data into a file buffer, checking for EOF and error.  */
70
71void
72file_block_read (struct file_data *current, size_t size)
73{
74  if (size && ! current->eof)
75    {
76      size_t s = block_read (current->desc,
77			     FILE_BUFFER (current) + current->buffered, size);
78      if (s == SIZE_MAX)
79	pfatal_with_name (current->name);
80      current->buffered += s;
81      current->eof = s < size;
82    }
83}
84
85/* Check for binary files and compare them for exact identity.  */
86
87/* Return 1 if BUF contains a non text character.
88   SIZE is the number of characters in BUF.  */
89
90#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91
92/* Get ready to read the current file.
93   Return nonzero if SKIP_TEST is zero,
94   and if it appears to be a binary file.  */
95
96static bool
97sip (struct file_data *current, bool skip_test)
98{
99  /* If we have a nonexistent file at this stage, treat it as empty.  */
100  if (current->desc < 0)
101    {
102      /* Leave room for a sentinel.  */
103      current->bufsize = sizeof (word);
104      current->buffer = xmalloc (current->bufsize);
105    }
106  else
107    {
108      current->bufsize = buffer_lcm (sizeof (word),
109				     STAT_BLOCKSIZE (current->stat),
110				     PTRDIFF_MAX - 2 * sizeof (word));
111      current->buffer = xmalloc (current->bufsize);
112
113      if (! skip_test)
114	{
115	  /* Check first part of file to see if it's a binary file.  */
116
117	  bool was_binary = set_binary_mode (current->desc, 1);
118	  off_t buffered;
119	  file_block_read (current, current->bufsize);
120	  buffered = current->buffered;
121
122	  if (! was_binary)
123	    {
124	      /* Revert to text mode and seek back to the beginning to
125		 reread the file.  Use relative seek, since file
126		 descriptors like stdin might not start at offset
127		 zero.  */
128
129	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130		pfatal_with_name (current->name);
131	      set_binary_mode (current->desc, 0);
132	      current->buffered = 0;
133	      current->eof = 0;
134	    }
135
136	  return binary_file_p (current->buffer, buffered);
137	}
138    }
139
140  current->buffered = 0;
141  current->eof = 0;
142  return 0;
143}
144
145/* Slurp the rest of the current file completely into memory.  */
146
147static void
148slurp (struct file_data *current)
149{
150  size_t cc;
151
152  if (current->desc < 0)
153    {
154      /* The file is nonexistent.  */
155      return;
156    }
157
158  if (S_ISREG (current->stat.st_mode))
159    {
160      /* It's a regular file; slurp in the rest all at once.  */
161
162      /* Get the size out of the stat block.
163	 Allocate just enough room for appended newline plus word sentinel,
164	 plus word-alignment since we want the buffer word-aligned.  */
165      size_t file_size = current->stat.st_size;
166      cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167      if (file_size != current->stat.st_size || cc < file_size
168	  || PTRDIFF_MAX <= cc)
169	xalloc_die ();
170
171      if (current->bufsize < cc)
172	{
173	  current->bufsize = cc;
174	  current->buffer = xrealloc (current->buffer, cc);
175	}
176
177      /* Try to read at least 1 more byte than the size indicates, to
178	 detect whether the file is growing.  This is a nicety for
179	 users who run 'diff' on files while they are changing.  */
180
181      if (current->buffered <= file_size)
182	{
183	  file_block_read (current, file_size + 1 - current->buffered);
184	  if (current->buffered <= file_size)
185	    return;
186	}
187    }
188
189  /* It's not a regular file, or it's a growing regular file; read it,
190     growing the buffer as needed.  */
191
192  file_block_read (current, current->bufsize - current->buffered);
193
194  if (current->buffered)
195    {
196      while (current->buffered == current->bufsize)
197	{
198	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199	    xalloc_die ();
200	  current->bufsize *= 2;
201	  current->buffer = xrealloc (current->buffer, current->bufsize);
202	  file_block_read (current, current->bufsize - current->buffered);
203	}
204
205      /* Allocate just enough room for appended newline plus word
206	 sentinel, plus word-alignment.  */
207      cc = current->buffered + 2 * sizeof (word);
208      current->bufsize = cc - cc % sizeof (word);
209      current->buffer = xrealloc (current->buffer, current->bufsize);
210    }
211}
212
213/* Split the file into lines, simultaneously computing the equivalence
214   class for each line.  */
215
216static void
217find_and_hash_each_line (struct file_data *current)
218{
219  hash_value h;
220  unsigned char const *p = (unsigned char const *) current->prefix_end;
221  unsigned char c;
222  lin i, *bucket;
223  size_t length;
224
225  /* Cache often-used quantities in local variables to help the compiler.  */
226  char const **linbuf = current->linbuf;
227  lin alloc_lines = current->alloc_lines;
228  lin line = 0;
229  lin linbuf_base = current->linbuf_base;
230  lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231  struct equivclass *eqs = equivs;
232  lin eqs_index = equivs_index;
233  lin eqs_alloc = equivs_alloc;
234  char const *suffix_begin = current->suffix_begin;
235  char const *bufend = FILE_BUFFER (current) + current->buffered;
236  bool diff_length_compare_anyway =
237    ignore_white_space != IGNORE_NO_WHITE_SPACE;
238  bool same_length_diff_contents_compare_anyway =
239    diff_length_compare_anyway | ignore_case;
240
241  while ((char const *) p < suffix_begin)
242    {
243      char const *ip = (char const *) p;
244
245      h = 0;
246
247      /* Hash this line until we find a newline.  */
248      if (ignore_case)
249	switch (ignore_white_space)
250	  {
251	  case IGNORE_ALL_SPACE:
252	    while ((c = *p++) != '\n')
253	      if (! ISSPACE (c))
254		h = HASH (h, TOLOWER (c));
255	    break;
256
257	  case IGNORE_SPACE_CHANGE:
258	    while ((c = *p++) != '\n')
259	      {
260		if (ISSPACE (c))
261		  {
262		    do
263		      if ((c = *p++) == '\n')
264			goto hashing_done;
265		    while (ISSPACE (c));
266
267		    h = HASH (h, ' ');
268		  }
269
270		/* C is now the first non-space.  */
271		h = HASH (h, TOLOWER (c));
272	      }
273	    break;
274
275	  case IGNORE_TAB_EXPANSION:
276	    {
277	      size_t column = 0;
278	      while ((c = *p++) != '\n')
279		{
280		  int repetitions = 1;
281
282		  switch (c)
283		    {
284		    case '\b':
285		      column -= 0 < column;
286		      break;
287
288		    case '\t':
289		      c = ' ';
290		      repetitions = TAB_WIDTH - column % TAB_WIDTH;
291		      column += repetitions;
292		      break;
293
294		    case '\r':
295		      column = 0;
296		      break;
297
298		    default:
299		      c = TOLOWER (c);
300		      column++;
301		      break;
302		    }
303
304		  do
305		    h = HASH (h, c);
306		  while (--repetitions != 0);
307		}
308	    }
309	    break;
310
311	  default:
312	    while ((c = *p++) != '\n')
313	      h = HASH (h, TOLOWER (c));
314	    break;
315	  }
316      else
317	switch (ignore_white_space)
318	  {
319	  case IGNORE_ALL_SPACE:
320	    while ((c = *p++) != '\n')
321	      if (! ISSPACE (c))
322		h = HASH (h, c);
323	    break;
324
325	  case IGNORE_SPACE_CHANGE:
326	    while ((c = *p++) != '\n')
327	      {
328		if (ISSPACE (c))
329		  {
330		    do
331		      if ((c = *p++) == '\n')
332			goto hashing_done;
333		    while (ISSPACE (c));
334
335		    h = HASH (h, ' ');
336		  }
337
338		/* C is now the first non-space.  */
339		h = HASH (h, c);
340	      }
341	    break;
342
343	  case IGNORE_TAB_EXPANSION:
344	    {
345	      size_t column = 0;
346	      while ((c = *p++) != '\n')
347		{
348		  int repetitions = 1;
349
350		  switch (c)
351		    {
352		    case '\b':
353		      column -= 0 < column;
354		      break;
355
356		    case '\t':
357		      c = ' ';
358		      repetitions = TAB_WIDTH - column % TAB_WIDTH;
359		      column += repetitions;
360		      break;
361
362		    case '\r':
363		      column = 0;
364		      break;
365
366		    default:
367		      column++;
368		      break;
369		    }
370
371		  do
372		    h = HASH (h, c);
373		  while (--repetitions != 0);
374		}
375	    }
376	    break;
377
378	  default:
379	    while ((c = *p++) != '\n')
380	      h = HASH (h, c);
381	    break;
382	  }
383
384   hashing_done:;
385
386      bucket = &buckets[h % nbuckets];
387      length = (char const *) p - ip - 1;
388
389      if ((char const *) p == bufend
390	  && current->missing_newline
391	  && ROBUST_OUTPUT_STYLE (output_style))
392	{
393	  /* This line is incomplete.  If this is significant,
394	     put the line into buckets[-1].  */
395	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
396	    bucket = &buckets[-1];
397
398	  /* Omit the inserted newline when computing linbuf later.  */
399	  p--;
400	  bufend = suffix_begin = (char const *) p;
401	}
402
403      for (i = *bucket;  ;  i = eqs[i].next)
404	if (!i)
405	  {
406	    /* Create a new equivalence class in this bucket.  */
407	    i = eqs_index++;
408	    if (i == eqs_alloc)
409	      {
410		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
411		  xalloc_die ();
412		eqs_alloc *= 2;
413		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
414	      }
415	    eqs[i].next = *bucket;
416	    eqs[i].hash = h;
417	    eqs[i].line = ip;
418	    eqs[i].length = length;
419	    *bucket = i;
420	    break;
421	  }
422	else if (eqs[i].hash == h)
423	  {
424	    char const *eqline = eqs[i].line;
425
426	    /* Reuse existing class if lines_differ reports the lines
427               equal.  */
428	    if (eqs[i].length == length)
429	      {
430		/* Reuse existing equivalence class if the lines are identical.
431		   This detects the common case of exact identity
432		   faster than lines_differ would.  */
433		if (memcmp (eqline, ip, length) == 0)
434		  break;
435		if (!same_length_diff_contents_compare_anyway)
436		  continue;
437	      }
438	    else if (!diff_length_compare_anyway)
439	      continue;
440
441	    if (! lines_differ (eqline, ip))
442	      break;
443	  }
444
445      /* Maybe increase the size of the line table.  */
446      if (line == alloc_lines)
447	{
448	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
449	  if (PTRDIFF_MAX / 3 <= alloc_lines
450	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
451	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
452	    xalloc_die ();
453	  alloc_lines = 2 * alloc_lines - linbuf_base;
454	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
455	  linbuf += linbuf_base;
456	  linbuf = xrealloc (linbuf,
457			     (alloc_lines - linbuf_base) * sizeof *linbuf);
458	  linbuf -= linbuf_base;
459	}
460      linbuf[line] = ip;
461      cureqs[line] = i;
462      ++line;
463    }
464
465  current->buffered_lines = line;
466
467  for (i = 0;  ;  i++)
468    {
469      /* Record the line start for lines in the suffix that we care about.
470	 Record one more line start than lines,
471	 so that we can compute the length of any buffered line.  */
472      if (line == alloc_lines)
473	{
474	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
475	  if (PTRDIFF_MAX / 3 <= alloc_lines
476	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
477	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
478	    xalloc_die ();
479	  alloc_lines = 2 * alloc_lines - linbuf_base;
480	  linbuf += linbuf_base;
481	  linbuf = xrealloc (linbuf,
482			     (alloc_lines - linbuf_base) * sizeof *linbuf);
483	  linbuf -= linbuf_base;
484	}
485      linbuf[line] = (char const *) p;
486
487      if ((char const *) p == bufend)
488	break;
489
490      if (context <= i && no_diff_means_no_output)
491	break;
492
493      line++;
494
495      while (*p++ != '\n')
496	continue;
497    }
498
499  /* Done with cache in local variables.  */
500  current->linbuf = linbuf;
501  current->valid_lines = line;
502  current->alloc_lines = alloc_lines;
503  current->equivs = cureqs;
504  equivs = eqs;
505  equivs_alloc = eqs_alloc;
506  equivs_index = eqs_index;
507}
508
509/* Prepare the text.  Make sure the text end is initialized.
510   Make sure text ends in a newline,
511   but remember that we had to add one.
512   Strip trailing CRs, if that was requested.  */
513
514static void
515prepare_text (struct file_data *current)
516{
517  size_t buffered = current->buffered;
518  char *p = FILE_BUFFER (current);
519  char *dst;
520
521  if (buffered == 0 || p[buffered - 1] == '\n')
522    current->missing_newline = 0;
523  else
524    {
525      p[buffered++] = '\n';
526      current->missing_newline = 1;
527    }
528
529  if (!p)
530    return;
531
532  /* Don't use uninitialized storage when planting or using sentinels.  */
533  memset (p + buffered, 0, sizeof (word));
534
535  if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
536    {
537      char const *src = dst;
538      char const *srclim = p + buffered;
539
540      do
541	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
542      while (src < srclim);
543
544      buffered -= src - dst;
545    }
546
547  current->buffered = buffered;
548}
549
550/* We have found N lines in a buffer of size S; guess the
551   proportionate number of lines that will be found in a buffer of
552   size T.  However, do not guess a number of lines so large that the
553   resulting line table might cause overflow in size calculations.  */
554static lin
555guess_lines (lin n, size_t s, size_t t)
556{
557  size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
558  lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
559  return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
560}
561
562/* Given a vector of two file_data objects, find the identical
563   prefixes and suffixes of each object.  */
564
565static void
566find_identical_ends (struct file_data filevec[])
567{
568  word *w0, *w1;
569  char *p0, *p1, *buffer0, *buffer1;
570  char const *end0, *beg0;
571  char const **linbuf0, **linbuf1;
572  lin i, lines;
573  size_t n0, n1;
574  lin alloc_lines0, alloc_lines1;
575  lin buffered_prefix, prefix_count, prefix_mask;
576  lin middle_guess, suffix_guess;
577
578  slurp (&filevec[0]);
579  prepare_text (&filevec[0]);
580  if (filevec[0].desc != filevec[1].desc)
581    {
582      slurp (&filevec[1]);
583      prepare_text (&filevec[1]);
584    }
585  else
586    {
587      filevec[1].buffer = filevec[0].buffer;
588      filevec[1].bufsize = filevec[0].bufsize;
589      filevec[1].buffered = filevec[0].buffered;
590      filevec[1].missing_newline = filevec[0].missing_newline;
591    }
592
593  /* Find identical prefix.  */
594
595  w0 = filevec[0].buffer;
596  w1 = filevec[1].buffer;
597  p0 = buffer0 = (char *) w0;
598  p1 = buffer1 = (char *) w1;
599  n0 = filevec[0].buffered;
600  n1 = filevec[1].buffered;
601
602  if (p0 == p1)
603    /* The buffers are the same; sentinels won't work.  */
604    p0 = p1 += n1;
605  else
606    {
607      /* Insert end sentinels, in this case characters that are guaranteed
608	 to make the equality test false, and thus terminate the loop.  */
609
610      if (n0 < n1)
611	p0[n0] = ~p1[n0];
612      else
613	p1[n1] = ~p0[n1];
614
615      /* Loop until first mismatch, or to the sentinel characters.  */
616
617      /* Compare a word at a time for speed.  */
618      while (*w0 == *w1)
619	w0++, w1++;
620
621      /* Do the last few bytes of comparison a byte at a time.  */
622      p0 = (char *) w0;
623      p1 = (char *) w1;
624      while (*p0 == *p1)
625	p0++, p1++;
626
627      /* Don't mistakenly count missing newline as part of prefix.  */
628      if (ROBUST_OUTPUT_STYLE (output_style)
629	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
630	      !=
631	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
632	p0--, p1--;
633    }
634
635  /* Now P0 and P1 point at the first nonmatching characters.  */
636
637  /* Skip back to last line-beginning in the prefix,
638     and then discard up to HORIZON_LINES lines from the prefix.  */
639  i = horizon_lines;
640  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
641    p0--, p1--;
642
643  /* Record the prefix.  */
644  filevec[0].prefix_end = p0;
645  filevec[1].prefix_end = p1;
646
647  /* Find identical suffix.  */
648
649  /* P0 and P1 point beyond the last chars not yet compared.  */
650  p0 = buffer0 + n0;
651  p1 = buffer1 + n1;
652
653  if (! ROBUST_OUTPUT_STYLE (output_style)
654      || filevec[0].missing_newline == filevec[1].missing_newline)
655    {
656      end0 = p0;	/* Addr of last char in file 0.  */
657
658      /* Get value of P0 at which we should stop scanning backward:
659	 this is when either P0 or P1 points just past the last char
660	 of the identical prefix.  */
661      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
662
663      /* Scan back until chars don't match or we reach that point.  */
664      for (; p0 != beg0; p0--, p1--)
665	if (*p0 != *p1)
666	  {
667	    /* Point at the first char of the matching suffix.  */
668	    beg0 = p0;
669	    break;
670	  }
671
672      /* Are we at a line-beginning in both files?  If not, add the rest of
673	 this line to the main body.  Discard up to HORIZON_LINES lines from
674	 the identical suffix.  Also, discard one extra line,
675	 because shift_boundaries may need it.  */
676      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
677			    &&
678			    (buffer1 == p1 || p1[-1] == '\n'));
679      while (i-- && p0 != end0)
680	while (*p0++ != '\n')
681	  continue;
682
683      p1 += p0 - beg0;
684    }
685
686  /* Record the suffix.  */
687  filevec[0].suffix_begin = p0;
688  filevec[1].suffix_begin = p1;
689
690  /* Calculate number of lines of prefix to save.
691
692     prefix_count == 0 means save the whole prefix;
693     we need this for options like -D that output the whole file,
694     or for enormous contexts (to avoid worrying about arithmetic overflow).
695     We also need it for options like -F that output some preceding line;
696     at least we will need to find the last few lines,
697     but since we don't know how many, it's easiest to find them all.
698
699     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
700     of the line buffer; they'll be moved to the proper location later.
701     Handle 1 more line than the context says (because we count 1 too many),
702     rounded up to the next power of 2 to speed index computation.  */
703
704  if (no_diff_means_no_output && ! function_regexp.fastmap
705      && context < LIN_MAX / 4 && context < n0)
706    {
707      middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
708      suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
709      for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
710	continue;
711      alloc_lines0 = (prefix_count + middle_guess
712		      + MIN (context, suffix_guess));
713    }
714  else
715    {
716      prefix_count = 0;
717      alloc_lines0 = guess_lines (0, 0, n0);
718    }
719
720  prefix_mask = prefix_count - 1;
721  lines = 0;
722  linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
723  p0 = buffer0;
724
725  /* If the prefix is needed, find the prefix lines.  */
726  if (! (no_diff_means_no_output
727	 && filevec[0].prefix_end == p0
728	 && filevec[1].prefix_end == p1))
729    {
730      end0 = filevec[0].prefix_end;
731      while (p0 != end0)
732	{
733	  lin l = lines++ & prefix_mask;
734	  if (l == alloc_lines0)
735	    {
736	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
737		xalloc_die ();
738	      alloc_lines0 *= 2;
739	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
740	    }
741	  linbuf0[l] = p0;
742	  while (*p0++ != '\n')
743	    continue;
744	}
745    }
746  buffered_prefix = prefix_count && context < lines ? context : lines;
747
748  /* Allocate line buffer 1.  */
749
750  middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
751  suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
752  alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
753  if (alloc_lines1 < buffered_prefix
754      || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
755    xalloc_die ();
756  linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
757
758  if (buffered_prefix != lines)
759    {
760      /* Rotate prefix lines to proper location.  */
761      for (i = 0;  i < buffered_prefix;  i++)
762	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
763      for (i = 0;  i < buffered_prefix;  i++)
764	linbuf0[i] = linbuf1[i];
765    }
766
767  /* Initialize line buffer 1 from line buffer 0.  */
768  for (i = 0; i < buffered_prefix; i++)
769    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
770
771  /* Record the line buffer, adjusted so that
772     linbuf[0] points at the first differing line.  */
773  filevec[0].linbuf = linbuf0 + buffered_prefix;
774  filevec[1].linbuf = linbuf1 + buffered_prefix;
775  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
776  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
777  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
778  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
779}
780
781/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
782   than 2**k.  This table is derived from Chris K. Caldwell's list
783   <http://www.utm.edu/research/primes/lists/2small/>.  */
784
785static unsigned char const prime_offset[] =
786{
787  0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
788  15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
789  11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
790  55, 93, 1, 57, 25
791};
792
793/* Verify that this host's size_t is not too wide for the above table.  */
794
795verify (enough_prime_offsets,
796	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
797
798/* Given a vector of two file_data objects, read the file associated
799   with each one, and build the table of equivalence classes.
800   Return nonzero if either file appears to be a binary file.
801   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
802
803bool
804read_files (struct file_data filevec[], bool pretend_binary)
805{
806  int i;
807  bool skip_test = text | pretend_binary;
808  bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
809
810  if (filevec[0].desc != filevec[1].desc)
811    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
812  else
813    {
814      filevec[1].buffer = filevec[0].buffer;
815      filevec[1].bufsize = filevec[0].bufsize;
816      filevec[1].buffered = filevec[0].buffered;
817    }
818  if (appears_binary)
819    {
820      set_binary_mode (filevec[0].desc, 1);
821      set_binary_mode (filevec[1].desc, 1);
822      return 1;
823    }
824
825  find_identical_ends (filevec);
826
827  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
828  if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
829    xalloc_die ();
830  equivs = xmalloc (equivs_alloc * sizeof *equivs);
831  /* Equivalence class 0 is permanently safe for lines that were not
832     hashed.  Real equivalence classes start at 1.  */
833  equivs_index = 1;
834
835  /* Allocate (one plus) a prime number of hash buckets.  Use a prime
836     number between 1/3 and 2/3 of the value of equiv_allocs,
837     approximately.  */
838  for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
839    continue;
840  nbuckets = ((size_t) 1 << i) - prime_offset[i];
841  if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
842    xalloc_die ();
843  buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
844  buckets++;
845
846  for (i = 0; i < 2; i++)
847    find_and_hash_each_line (&filevec[i]);
848
849  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
850
851  free (equivs);
852  free (buckets - 1);
853
854  return 0;
855}
856