1170754Sdelphij/* File I/O for GNU DIFF.
2170754Sdelphij
3170754Sdelphij   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4170754Sdelphij   2004 Free Software Foundation, Inc.
5170754Sdelphij
6170754Sdelphij   This file is part of GNU DIFF.
7170754Sdelphij
8170754Sdelphij   GNU DIFF is free software; you can redistribute it and/or modify
9170754Sdelphij   it under the terms of the GNU General Public License as published by
10170754Sdelphij   the Free Software Foundation; either version 2, or (at your option)
11170754Sdelphij   any later version.
12170754Sdelphij
13170754Sdelphij   GNU DIFF is distributed in the hope that it will be useful,
14170754Sdelphij   but WITHOUT ANY WARRANTY; without even the implied warranty of
15170754Sdelphij   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16170754Sdelphij   GNU General Public License for more details.
17170754Sdelphij
18170754Sdelphij   You should have received a copy of the GNU General Public License
19170754Sdelphij   along with this program; see the file COPYING.
20170754Sdelphij   If not, write to the Free Software Foundation,
21170754Sdelphij   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22170754Sdelphij
23170754Sdelphij#include "diff.h"
24170754Sdelphij#include <cmpbuf.h>
25170754Sdelphij#include <file-type.h>
26170754Sdelphij#include <setmode.h>
27170754Sdelphij#include <xalloc.h>
28170754Sdelphij
29170754Sdelphij/* Rotate an unsigned value to the left.  */
30170754Sdelphij#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31170754Sdelphij
32170754Sdelphij/* Given a hash value and a new character, return a new hash value.  */
33170754Sdelphij#define HASH(h, c) ((c) + ROL (h, 7))
34170754Sdelphij
35170754Sdelphij/* The type of a hash value.  */
36170754Sdelphijtypedef size_t hash_value;
37170754Sdelphijverify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38170754Sdelphij
39170754Sdelphij/* Lines are put into equivalence classes of lines that match in lines_differ.
40170754Sdelphij   Each equivalence class is represented by one of these structures,
41170754Sdelphij   but only while the classes are being computed.
42170754Sdelphij   Afterward, each class is represented by a number.  */
43170754Sdelphijstruct equivclass
44170754Sdelphij{
45170754Sdelphij  lin next;		/* Next item in this bucket.  */
46170754Sdelphij  hash_value hash;	/* Hash of lines in this class.  */
47170754Sdelphij  char const *line;	/* A line that fits this class.  */
48170754Sdelphij  size_t length;	/* That line's length, not counting its newline.  */
49170754Sdelphij};
50170754Sdelphij
51170754Sdelphij/* Hash-table: array of buckets, each being a chain of equivalence classes.
52170754Sdelphij   buckets[-1] is reserved for incomplete lines.  */
53170754Sdelphijstatic lin *buckets;
54170754Sdelphij
55170754Sdelphij/* Number of buckets in the hash table array, not counting buckets[-1].  */
56170754Sdelphijstatic size_t nbuckets;
57170754Sdelphij
58170754Sdelphij/* Array in which the equivalence classes are allocated.
59170754Sdelphij   The bucket-chains go through the elements in this array.
60170754Sdelphij   The number of an equivalence class is its index in this array.  */
61170754Sdelphijstatic struct equivclass *equivs;
62170754Sdelphij
63170754Sdelphij/* Index of first free element in the array `equivs'.  */
64170754Sdelphijstatic lin equivs_index;
65170754Sdelphij
66170754Sdelphij/* Number of elements allocated in the array `equivs'.  */
67170754Sdelphijstatic lin equivs_alloc;
68170754Sdelphij
69170754Sdelphij/* Read a block of data into a file buffer, checking for EOF and error.  */
70170754Sdelphij
71170754Sdelphijvoid
72170754Sdelphijfile_block_read (struct file_data *current, size_t size)
73170754Sdelphij{
74170754Sdelphij  if (size && ! current->eof)
75170754Sdelphij    {
76170754Sdelphij      size_t s = block_read (current->desc,
77170754Sdelphij			     FILE_BUFFER (current) + current->buffered, size);
78170754Sdelphij      if (s == SIZE_MAX)
79170754Sdelphij	pfatal_with_name (current->name);
80170754Sdelphij      current->buffered += s;
81170754Sdelphij      current->eof = s < size;
82170754Sdelphij    }
83170754Sdelphij}
84170754Sdelphij
85170754Sdelphij/* Check for binary files and compare them for exact identity.  */
86170754Sdelphij
87170754Sdelphij/* Return 1 if BUF contains a non text character.
88170754Sdelphij   SIZE is the number of characters in BUF.  */
89170754Sdelphij
90170754Sdelphij#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91170754Sdelphij
92170754Sdelphij/* Get ready to read the current file.
93170754Sdelphij   Return nonzero if SKIP_TEST is zero,
94170754Sdelphij   and if it appears to be a binary file.  */
95170754Sdelphij
96170754Sdelphijstatic bool
97170754Sdelphijsip (struct file_data *current, bool skip_test)
98170754Sdelphij{
99170754Sdelphij  /* If we have a nonexistent file at this stage, treat it as empty.  */
100170754Sdelphij  if (current->desc < 0)
101170754Sdelphij    {
102170754Sdelphij      /* Leave room for a sentinel.  */
103170754Sdelphij      current->bufsize = sizeof (word);
104170754Sdelphij      current->buffer = xmalloc (current->bufsize);
105170754Sdelphij    }
106170754Sdelphij  else
107170754Sdelphij    {
108170754Sdelphij      current->bufsize = buffer_lcm (sizeof (word),
109170754Sdelphij				     STAT_BLOCKSIZE (current->stat),
110170754Sdelphij				     PTRDIFF_MAX - 2 * sizeof (word));
111170754Sdelphij      current->buffer = xmalloc (current->bufsize);
112170754Sdelphij
113170754Sdelphij      if (! skip_test)
114170754Sdelphij	{
115170754Sdelphij	  /* Check first part of file to see if it's a binary file.  */
116170754Sdelphij
117170754Sdelphij	  bool was_binary = set_binary_mode (current->desc, true);
118170754Sdelphij	  off_t buffered;
119170754Sdelphij	  file_block_read (current, current->bufsize);
120170754Sdelphij	  buffered = current->buffered;
121170754Sdelphij
122170754Sdelphij	  if (! was_binary)
123170754Sdelphij	    {
124170754Sdelphij	      /* Revert to text mode and seek back to the beginning to
125170754Sdelphij		 reread the file.  Use relative seek, since file
126170754Sdelphij		 descriptors like stdin might not start at offset
127170754Sdelphij		 zero.  */
128170754Sdelphij
129170754Sdelphij	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130170754Sdelphij		pfatal_with_name (current->name);
131170754Sdelphij	      set_binary_mode (current->desc, false);
132170754Sdelphij	      current->buffered = 0;
133170754Sdelphij	      current->eof = false;
134170754Sdelphij	    }
135170754Sdelphij
136170754Sdelphij	  return binary_file_p (current->buffer, buffered);
137170754Sdelphij	}
138170754Sdelphij    }
139170754Sdelphij
140170754Sdelphij  current->buffered = 0;
141170754Sdelphij  current->eof = false;
142170754Sdelphij  return false;
143170754Sdelphij}
144170754Sdelphij
145170754Sdelphij/* Slurp the rest of the current file completely into memory.  */
146170754Sdelphij
147170754Sdelphijstatic void
148170754Sdelphijslurp (struct file_data *current)
149170754Sdelphij{
150170754Sdelphij  size_t cc;
151170754Sdelphij
152170754Sdelphij  if (current->desc < 0)
153170754Sdelphij    {
154170754Sdelphij      /* The file is nonexistent.  */
155170754Sdelphij      return;
156170754Sdelphij    }
157170754Sdelphij
158170754Sdelphij  if (S_ISREG (current->stat.st_mode))
159170754Sdelphij    {
160170754Sdelphij      /* It's a regular file; slurp in the rest all at once.  */
161170754Sdelphij
162170754Sdelphij      /* Get the size out of the stat block.
163170754Sdelphij	 Allocate just enough room for appended newline plus word sentinel,
164170754Sdelphij	 plus word-alignment since we want the buffer word-aligned.  */
165170754Sdelphij      size_t file_size = current->stat.st_size;
166170754Sdelphij      cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167170754Sdelphij      if (file_size != current->stat.st_size || cc < file_size
168170754Sdelphij	  || PTRDIFF_MAX <= cc)
169170754Sdelphij	xalloc_die ();
170170754Sdelphij
171170754Sdelphij      if (current->bufsize < cc)
172170754Sdelphij	{
173170754Sdelphij	  current->bufsize = cc;
174170754Sdelphij	  current->buffer = xrealloc (current->buffer, cc);
175170754Sdelphij	}
176170754Sdelphij
177170754Sdelphij      /* Try to read at least 1 more byte than the size indicates, to
178170754Sdelphij	 detect whether the file is growing.  This is a nicety for
179170754Sdelphij	 users who run 'diff' on files while they are changing.  */
180170754Sdelphij
181170754Sdelphij      if (current->buffered <= file_size)
182170754Sdelphij	{
183170754Sdelphij	  file_block_read (current, file_size + 1 - current->buffered);
184170754Sdelphij	  if (current->buffered <= file_size)
185170754Sdelphij	    return;
186170754Sdelphij	}
187170754Sdelphij    }
188170754Sdelphij
189170754Sdelphij  /* It's not a regular file, or it's a growing regular file; read it,
190170754Sdelphij     growing the buffer as needed.  */
191170754Sdelphij
192170754Sdelphij  file_block_read (current, current->bufsize - current->buffered);
193170754Sdelphij
194170754Sdelphij  if (current->buffered)
195170754Sdelphij    {
196170754Sdelphij      while (current->buffered == current->bufsize)
197170754Sdelphij	{
198170754Sdelphij	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199170754Sdelphij	    xalloc_die ();
200170754Sdelphij	  current->bufsize *= 2;
201170754Sdelphij	  current->buffer = xrealloc (current->buffer, current->bufsize);
202170754Sdelphij	  file_block_read (current, current->bufsize - current->buffered);
203170754Sdelphij	}
204170754Sdelphij
205170754Sdelphij      /* Allocate just enough room for appended newline plus word
206170754Sdelphij	 sentinel, plus word-alignment.  */
207170754Sdelphij      cc = current->buffered + 2 * sizeof (word);
208170754Sdelphij      current->bufsize = cc - cc % sizeof (word);
209170754Sdelphij      current->buffer = xrealloc (current->buffer, current->bufsize);
210170754Sdelphij    }
211170754Sdelphij}
212170754Sdelphij
213170754Sdelphij/* Split the file into lines, simultaneously computing the equivalence
214170754Sdelphij   class for each line.  */
215170754Sdelphij
216170754Sdelphijstatic void
217170754Sdelphijfind_and_hash_each_line (struct file_data *current)
218170754Sdelphij{
219170754Sdelphij  hash_value h;
220170754Sdelphij  char const *p = current->prefix_end;
221170754Sdelphij  unsigned char c;
222170754Sdelphij  lin i, *bucket;
223170754Sdelphij  size_t length;
224170754Sdelphij
225170754Sdelphij  /* Cache often-used quantities in local variables to help the compiler.  */
226170754Sdelphij  char const **linbuf = current->linbuf;
227170754Sdelphij  lin alloc_lines = current->alloc_lines;
228170754Sdelphij  lin line = 0;
229170754Sdelphij  lin linbuf_base = current->linbuf_base;
230170754Sdelphij  lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231170754Sdelphij  struct equivclass *eqs = equivs;
232170754Sdelphij  lin eqs_index = equivs_index;
233170754Sdelphij  lin eqs_alloc = equivs_alloc;
234170754Sdelphij  char const *suffix_begin = current->suffix_begin;
235170754Sdelphij  char const *bufend = FILE_BUFFER (current) + current->buffered;
236170754Sdelphij  bool diff_length_compare_anyway =
237170754Sdelphij    ignore_white_space != IGNORE_NO_WHITE_SPACE;
238170754Sdelphij  bool same_length_diff_contents_compare_anyway =
239170754Sdelphij    diff_length_compare_anyway | ignore_case;
240170754Sdelphij
241170754Sdelphij  while (p < suffix_begin)
242170754Sdelphij    {
243170754Sdelphij      char const *ip = p;
244170754Sdelphij
245170754Sdelphij      h = 0;
246170754Sdelphij
247170754Sdelphij      /* Hash this line until we find a newline.  */
248170754Sdelphij      if (ignore_case)
249170754Sdelphij	switch (ignore_white_space)
250170754Sdelphij	  {
251170754Sdelphij	  case IGNORE_ALL_SPACE:
252170754Sdelphij	    while ((c = *p++) != '\n')
253170754Sdelphij	      if (! isspace (c))
254170754Sdelphij		h = HASH (h, tolower (c));
255170754Sdelphij	    break;
256170754Sdelphij
257170754Sdelphij	  case IGNORE_SPACE_CHANGE:
258170754Sdelphij	    while ((c = *p++) != '\n')
259170754Sdelphij	      {
260170754Sdelphij		if (isspace (c))
261170754Sdelphij		  {
262170754Sdelphij		    do
263170754Sdelphij		      if ((c = *p++) == '\n')
264170754Sdelphij			goto hashing_done;
265170754Sdelphij		    while (isspace (c));
266170754Sdelphij
267170754Sdelphij		    h = HASH (h, ' ');
268170754Sdelphij		  }
269170754Sdelphij
270170754Sdelphij		/* C is now the first non-space.  */
271170754Sdelphij		h = HASH (h, tolower (c));
272170754Sdelphij	      }
273170754Sdelphij	    break;
274170754Sdelphij
275170754Sdelphij	  case IGNORE_TAB_EXPANSION:
276170754Sdelphij	    {
277170754Sdelphij	      size_t column = 0;
278170754Sdelphij	      while ((c = *p++) != '\n')
279170754Sdelphij		{
280170754Sdelphij		  size_t repetitions = 1;
281170754Sdelphij
282170754Sdelphij		  switch (c)
283170754Sdelphij		    {
284170754Sdelphij		    case '\b':
285170754Sdelphij		      column -= 0 < column;
286170754Sdelphij		      break;
287170754Sdelphij
288170754Sdelphij		    case '\t':
289170754Sdelphij		      c = ' ';
290170754Sdelphij		      repetitions = tabsize - column % tabsize;
291170754Sdelphij		      column = (column + repetitions < column
292170754Sdelphij				? 0
293170754Sdelphij				: column + repetitions);
294170754Sdelphij		      break;
295170754Sdelphij
296170754Sdelphij		    case '\r':
297170754Sdelphij		      column = 0;
298170754Sdelphij		      break;
299170754Sdelphij
300170754Sdelphij		    default:
301170754Sdelphij		      c = tolower (c);
302170754Sdelphij		      column++;
303170754Sdelphij		      break;
304170754Sdelphij		    }
305170754Sdelphij
306170754Sdelphij		  do
307170754Sdelphij		    h = HASH (h, c);
308170754Sdelphij		  while (--repetitions != 0);
309170754Sdelphij		}
310170754Sdelphij	    }
311170754Sdelphij	    break;
312170754Sdelphij
313170754Sdelphij	  default:
314170754Sdelphij	    while ((c = *p++) != '\n')
315170754Sdelphij	      h = HASH (h, tolower (c));
316170754Sdelphij	    break;
317170754Sdelphij	  }
318170754Sdelphij      else
319170754Sdelphij	switch (ignore_white_space)
320170754Sdelphij	  {
321170754Sdelphij	  case IGNORE_ALL_SPACE:
322170754Sdelphij	    while ((c = *p++) != '\n')
323170754Sdelphij	      if (! isspace (c))
324170754Sdelphij		h = HASH (h, c);
325170754Sdelphij	    break;
326170754Sdelphij
327170754Sdelphij	  case IGNORE_SPACE_CHANGE:
328170754Sdelphij	    while ((c = *p++) != '\n')
329170754Sdelphij	      {
330170754Sdelphij		if (isspace (c))
331170754Sdelphij		  {
332170754Sdelphij		    do
333170754Sdelphij		      if ((c = *p++) == '\n')
334170754Sdelphij			goto hashing_done;
335170754Sdelphij		    while (isspace (c));
336170754Sdelphij
337170754Sdelphij		    h = HASH (h, ' ');
338170754Sdelphij		  }
339170754Sdelphij
340170754Sdelphij		/* C is now the first non-space.  */
341170754Sdelphij		h = HASH (h, c);
342170754Sdelphij	      }
343170754Sdelphij	    break;
344170754Sdelphij
345170754Sdelphij	  case IGNORE_TAB_EXPANSION:
346170754Sdelphij	    {
347170754Sdelphij	      size_t column = 0;
348170754Sdelphij	      while ((c = *p++) != '\n')
349170754Sdelphij		{
350170754Sdelphij		  size_t repetitions = 1;
351170754Sdelphij
352170754Sdelphij		  switch (c)
353170754Sdelphij		    {
354170754Sdelphij		    case '\b':
355170754Sdelphij		      column -= 0 < column;
356170754Sdelphij		      break;
357170754Sdelphij
358170754Sdelphij		    case '\t':
359170754Sdelphij		      c = ' ';
360170754Sdelphij		      repetitions = tabsize - column % tabsize;
361170754Sdelphij		      column = (column + repetitions < column
362170754Sdelphij				? 0
363170754Sdelphij				: column + repetitions);
364170754Sdelphij		      break;
365170754Sdelphij
366170754Sdelphij		    case '\r':
367170754Sdelphij		      column = 0;
368170754Sdelphij		      break;
369170754Sdelphij
370170754Sdelphij		    default:
371170754Sdelphij		      column++;
372170754Sdelphij		      break;
373170754Sdelphij		    }
374170754Sdelphij
375170754Sdelphij		  do
376170754Sdelphij		    h = HASH (h, c);
377170754Sdelphij		  while (--repetitions != 0);
378170754Sdelphij		}
379170754Sdelphij	    }
380170754Sdelphij	    break;
381170754Sdelphij
382170754Sdelphij	  default:
383170754Sdelphij	    while ((c = *p++) != '\n')
384170754Sdelphij	      h = HASH (h, c);
385170754Sdelphij	    break;
386170754Sdelphij	  }
387170754Sdelphij
388170754Sdelphij   hashing_done:;
389170754Sdelphij
390170754Sdelphij      bucket = &buckets[h % nbuckets];
391170754Sdelphij      length = p - ip - 1;
392170754Sdelphij
393170754Sdelphij      if (p == bufend
394170754Sdelphij	  && current->missing_newline
395170754Sdelphij	  && ROBUST_OUTPUT_STYLE (output_style))
396170754Sdelphij	{
397170754Sdelphij	  /* This line is incomplete.  If this is significant,
398170754Sdelphij	     put the line into buckets[-1].  */
399170754Sdelphij	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
400170754Sdelphij	    bucket = &buckets[-1];
401170754Sdelphij
402170754Sdelphij	  /* Omit the inserted newline when computing linbuf later.  */
403170754Sdelphij	  p--;
404170754Sdelphij	  bufend = suffix_begin = p;
405170754Sdelphij	}
406170754Sdelphij
407170754Sdelphij      for (i = *bucket;  ;  i = eqs[i].next)
408170754Sdelphij	if (!i)
409170754Sdelphij	  {
410170754Sdelphij	    /* Create a new equivalence class in this bucket.  */
411170754Sdelphij	    i = eqs_index++;
412170754Sdelphij	    if (i == eqs_alloc)
413170754Sdelphij	      {
414170754Sdelphij		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415170754Sdelphij		  xalloc_die ();
416170754Sdelphij		eqs_alloc *= 2;
417170754Sdelphij		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418170754Sdelphij	      }
419170754Sdelphij	    eqs[i].next = *bucket;
420170754Sdelphij	    eqs[i].hash = h;
421170754Sdelphij	    eqs[i].line = ip;
422170754Sdelphij	    eqs[i].length = length;
423170754Sdelphij	    *bucket = i;
424170754Sdelphij	    break;
425170754Sdelphij	  }
426170754Sdelphij	else if (eqs[i].hash == h)
427170754Sdelphij	  {
428170754Sdelphij	    char const *eqline = eqs[i].line;
429170754Sdelphij
430170754Sdelphij	    /* Reuse existing class if lines_differ reports the lines
431170754Sdelphij               equal.  */
432170754Sdelphij	    if (eqs[i].length == length)
433170754Sdelphij	      {
434170754Sdelphij		/* Reuse existing equivalence class if the lines are identical.
435170754Sdelphij		   This detects the common case of exact identity
436170754Sdelphij		   faster than lines_differ would.  */
437170754Sdelphij		if (memcmp (eqline, ip, length) == 0)
438170754Sdelphij		  break;
439170754Sdelphij		if (!same_length_diff_contents_compare_anyway)
440170754Sdelphij		  continue;
441170754Sdelphij	      }
442170754Sdelphij	    else if (!diff_length_compare_anyway)
443170754Sdelphij	      continue;
444170754Sdelphij
445170754Sdelphij	    if (! lines_differ (eqline, ip))
446170754Sdelphij	      break;
447170754Sdelphij	  }
448170754Sdelphij
449170754Sdelphij      /* Maybe increase the size of the line table.  */
450170754Sdelphij      if (line == alloc_lines)
451170754Sdelphij	{
452170754Sdelphij	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
453170754Sdelphij	  if (PTRDIFF_MAX / 3 <= alloc_lines
454170754Sdelphij	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455170754Sdelphij	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456170754Sdelphij	    xalloc_die ();
457170754Sdelphij	  alloc_lines = 2 * alloc_lines - linbuf_base;
458170754Sdelphij	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459170754Sdelphij	  linbuf += linbuf_base;
460170754Sdelphij	  linbuf = xrealloc (linbuf,
461170754Sdelphij			     (alloc_lines - linbuf_base) * sizeof *linbuf);
462170754Sdelphij	  linbuf -= linbuf_base;
463170754Sdelphij	}
464170754Sdelphij      linbuf[line] = ip;
465170754Sdelphij      cureqs[line] = i;
466170754Sdelphij      ++line;
467170754Sdelphij    }
468170754Sdelphij
469170754Sdelphij  current->buffered_lines = line;
470170754Sdelphij
471170754Sdelphij  for (i = 0;  ;  i++)
472170754Sdelphij    {
473170754Sdelphij      /* Record the line start for lines in the suffix that we care about.
474170754Sdelphij	 Record one more line start than lines,
475170754Sdelphij	 so that we can compute the length of any buffered line.  */
476170754Sdelphij      if (line == alloc_lines)
477170754Sdelphij	{
478170754Sdelphij	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
479170754Sdelphij	  if (PTRDIFF_MAX / 3 <= alloc_lines
480170754Sdelphij	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481170754Sdelphij	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482170754Sdelphij	    xalloc_die ();
483170754Sdelphij	  alloc_lines = 2 * alloc_lines - linbuf_base;
484170754Sdelphij	  linbuf += linbuf_base;
485170754Sdelphij	  linbuf = xrealloc (linbuf,
486170754Sdelphij			     (alloc_lines - linbuf_base) * sizeof *linbuf);
487170754Sdelphij	  linbuf -= linbuf_base;
488170754Sdelphij	}
489170754Sdelphij      linbuf[line] = p;
490170754Sdelphij
491170754Sdelphij      if (p == bufend)
492170754Sdelphij	break;
493170754Sdelphij
494170754Sdelphij      if (context <= i && no_diff_means_no_output)
495170754Sdelphij	break;
496170754Sdelphij
497170754Sdelphij      line++;
498170754Sdelphij
499170754Sdelphij      while (*p++ != '\n')
500170754Sdelphij	continue;
501170754Sdelphij    }
502170754Sdelphij
503170754Sdelphij  /* Done with cache in local variables.  */
504170754Sdelphij  current->linbuf = linbuf;
505170754Sdelphij  current->valid_lines = line;
506170754Sdelphij  current->alloc_lines = alloc_lines;
507170754Sdelphij  current->equivs = cureqs;
508170754Sdelphij  equivs = eqs;
509170754Sdelphij  equivs_alloc = eqs_alloc;
510170754Sdelphij  equivs_index = eqs_index;
511170754Sdelphij}
512170754Sdelphij
513170754Sdelphij/* Prepare the text.  Make sure the text end is initialized.
514170754Sdelphij   Make sure text ends in a newline,
515170754Sdelphij   but remember that we had to add one.
516170754Sdelphij   Strip trailing CRs, if that was requested.  */
517170754Sdelphij
518170754Sdelphijstatic void
519170754Sdelphijprepare_text (struct file_data *current)
520170754Sdelphij{
521170754Sdelphij  size_t buffered = current->buffered;
522170754Sdelphij  char *p = FILE_BUFFER (current);
523170754Sdelphij  char *dst;
524170754Sdelphij
525170754Sdelphij  if (buffered == 0 || p[buffered - 1] == '\n')
526170754Sdelphij    current->missing_newline = false;
527170754Sdelphij  else
528170754Sdelphij    {
529170754Sdelphij      p[buffered++] = '\n';
530170754Sdelphij      current->missing_newline = true;
531170754Sdelphij    }
532170754Sdelphij
533170754Sdelphij  if (!p)
534170754Sdelphij    return;
535170754Sdelphij
536170754Sdelphij  /* Don't use uninitialized storage when planting or using sentinels.  */
537170754Sdelphij  memset (p + buffered, 0, sizeof (word));
538170754Sdelphij
539170754Sdelphij  if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
540170754Sdelphij    {
541170754Sdelphij      char const *src = dst;
542170754Sdelphij      char const *srclim = p + buffered;
543170754Sdelphij
544170754Sdelphij      do
545170754Sdelphij	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
546170754Sdelphij      while (src < srclim);
547170754Sdelphij
548170754Sdelphij      buffered -= src - dst;
549170754Sdelphij    }
550170754Sdelphij
551170754Sdelphij  current->buffered = buffered;
552170754Sdelphij}
553170754Sdelphij
554170754Sdelphij/* We have found N lines in a buffer of size S; guess the
555170754Sdelphij   proportionate number of lines that will be found in a buffer of
556170754Sdelphij   size T.  However, do not guess a number of lines so large that the
557170754Sdelphij   resulting line table might cause overflow in size calculations.  */
558170754Sdelphijstatic lin
559170754Sdelphijguess_lines (lin n, size_t s, size_t t)
560170754Sdelphij{
561170754Sdelphij  size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
562170754Sdelphij  lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
563170754Sdelphij  return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
564170754Sdelphij}
565170754Sdelphij
566170754Sdelphij/* Given a vector of two file_data objects, find the identical
567170754Sdelphij   prefixes and suffixes of each object.  */
568170754Sdelphij
569170754Sdelphijstatic void
570170754Sdelphijfind_identical_ends (struct file_data filevec[])
571170754Sdelphij{
572170754Sdelphij  word *w0, *w1;
573170754Sdelphij  char *p0, *p1, *buffer0, *buffer1;
574170754Sdelphij  char const *end0, *beg0;
575170754Sdelphij  char const **linbuf0, **linbuf1;
576170754Sdelphij  lin i, lines;
577170754Sdelphij  size_t n0, n1;
578170754Sdelphij  lin alloc_lines0, alloc_lines1;
579170754Sdelphij  lin buffered_prefix, prefix_count, prefix_mask;
580170754Sdelphij  lin middle_guess, suffix_guess;
581170754Sdelphij
582170754Sdelphij  slurp (&filevec[0]);
583170754Sdelphij  prepare_text (&filevec[0]);
584170754Sdelphij  if (filevec[0].desc != filevec[1].desc)
585170754Sdelphij    {
586170754Sdelphij      slurp (&filevec[1]);
587170754Sdelphij      prepare_text (&filevec[1]);
588170754Sdelphij    }
589170754Sdelphij  else
590170754Sdelphij    {
591170754Sdelphij      filevec[1].buffer = filevec[0].buffer;
592170754Sdelphij      filevec[1].bufsize = filevec[0].bufsize;
593170754Sdelphij      filevec[1].buffered = filevec[0].buffered;
594170754Sdelphij      filevec[1].missing_newline = filevec[0].missing_newline;
595170754Sdelphij    }
596170754Sdelphij
597170754Sdelphij  /* Find identical prefix.  */
598170754Sdelphij
599170754Sdelphij  w0 = filevec[0].buffer;
600170754Sdelphij  w1 = filevec[1].buffer;
601170754Sdelphij  p0 = buffer0 = (char *) w0;
602170754Sdelphij  p1 = buffer1 = (char *) w1;
603170754Sdelphij  n0 = filevec[0].buffered;
604170754Sdelphij  n1 = filevec[1].buffered;
605170754Sdelphij
606170754Sdelphij  if (p0 == p1)
607170754Sdelphij    /* The buffers are the same; sentinels won't work.  */
608170754Sdelphij    p0 = p1 += n1;
609170754Sdelphij  else
610170754Sdelphij    {
611170754Sdelphij      /* Insert end sentinels, in this case characters that are guaranteed
612170754Sdelphij	 to make the equality test false, and thus terminate the loop.  */
613170754Sdelphij
614170754Sdelphij      if (n0 < n1)
615170754Sdelphij	p0[n0] = ~p1[n0];
616170754Sdelphij      else
617170754Sdelphij	p1[n1] = ~p0[n1];
618170754Sdelphij
619170754Sdelphij      /* Loop until first mismatch, or to the sentinel characters.  */
620170754Sdelphij
621170754Sdelphij      /* Compare a word at a time for speed.  */
622170754Sdelphij      while (*w0 == *w1)
623170754Sdelphij	w0++, w1++;
624170754Sdelphij
625170754Sdelphij      /* Do the last few bytes of comparison a byte at a time.  */
626170754Sdelphij      p0 = (char *) w0;
627170754Sdelphij      p1 = (char *) w1;
628170754Sdelphij      while (*p0 == *p1)
629170754Sdelphij	p0++, p1++;
630170754Sdelphij
631170754Sdelphij      /* Don't mistakenly count missing newline as part of prefix.  */
632170754Sdelphij      if (ROBUST_OUTPUT_STYLE (output_style)
633170754Sdelphij	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
634170754Sdelphij	      !=
635170754Sdelphij	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
636170754Sdelphij	p0--, p1--;
637170754Sdelphij    }
638170754Sdelphij
639170754Sdelphij  /* Now P0 and P1 point at the first nonmatching characters.  */
640170754Sdelphij
641170754Sdelphij  /* Skip back to last line-beginning in the prefix,
642170754Sdelphij     and then discard up to HORIZON_LINES lines from the prefix.  */
643170754Sdelphij  i = horizon_lines;
644170754Sdelphij  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
645170754Sdelphij    p0--, p1--;
646170754Sdelphij
647170754Sdelphij  /* Record the prefix.  */
648170754Sdelphij  filevec[0].prefix_end = p0;
649170754Sdelphij  filevec[1].prefix_end = p1;
650170754Sdelphij
651170754Sdelphij  /* Find identical suffix.  */
652170754Sdelphij
653170754Sdelphij  /* P0 and P1 point beyond the last chars not yet compared.  */
654170754Sdelphij  p0 = buffer0 + n0;
655170754Sdelphij  p1 = buffer1 + n1;
656170754Sdelphij
657170754Sdelphij  if (! ROBUST_OUTPUT_STYLE (output_style)
658170754Sdelphij      || filevec[0].missing_newline == filevec[1].missing_newline)
659170754Sdelphij    {
660170754Sdelphij      end0 = p0;	/* Addr of last char in file 0.  */
661170754Sdelphij
662170754Sdelphij      /* Get value of P0 at which we should stop scanning backward:
663170754Sdelphij	 this is when either P0 or P1 points just past the last char
664170754Sdelphij	 of the identical prefix.  */
665170754Sdelphij      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
666170754Sdelphij
667170754Sdelphij      /* Scan back until chars don't match or we reach that point.  */
668170754Sdelphij      for (; p0 != beg0; p0--, p1--)
669170754Sdelphij	if (*p0 != *p1)
670170754Sdelphij	  {
671170754Sdelphij	    /* Point at the first char of the matching suffix.  */
672170754Sdelphij	    beg0 = p0;
673170754Sdelphij	    break;
674170754Sdelphij	  }
675170754Sdelphij
676170754Sdelphij      /* Are we at a line-beginning in both files?  If not, add the rest of
677170754Sdelphij	 this line to the main body.  Discard up to HORIZON_LINES lines from
678170754Sdelphij	 the identical suffix.  Also, discard one extra line,
679170754Sdelphij	 because shift_boundaries may need it.  */
680170754Sdelphij      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
681170754Sdelphij			    &&
682170754Sdelphij			    (buffer1 == p1 || p1[-1] == '\n'));
683170754Sdelphij      while (i-- && p0 != end0)
684170754Sdelphij	while (*p0++ != '\n')
685170754Sdelphij	  continue;
686170754Sdelphij
687170754Sdelphij      p1 += p0 - beg0;
688170754Sdelphij    }
689170754Sdelphij
690170754Sdelphij  /* Record the suffix.  */
691170754Sdelphij  filevec[0].suffix_begin = p0;
692170754Sdelphij  filevec[1].suffix_begin = p1;
693170754Sdelphij
694170754Sdelphij  /* Calculate number of lines of prefix to save.
695170754Sdelphij
696170754Sdelphij     prefix_count == 0 means save the whole prefix;
697170754Sdelphij     we need this for options like -D that output the whole file,
698170754Sdelphij     or for enormous contexts (to avoid worrying about arithmetic overflow).
699170754Sdelphij     We also need it for options like -F that output some preceding line;
700170754Sdelphij     at least we will need to find the last few lines,
701170754Sdelphij     but since we don't know how many, it's easiest to find them all.
702170754Sdelphij
703170754Sdelphij     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
704170754Sdelphij     of the line buffer; they'll be moved to the proper location later.
705170754Sdelphij     Handle 1 more line than the context says (because we count 1 too many),
706170754Sdelphij     rounded up to the next power of 2 to speed index computation.  */
707170754Sdelphij
708170754Sdelphij  if (no_diff_means_no_output && ! function_regexp.fastmap
709170754Sdelphij      && context < LIN_MAX / 4 && context < n0)
710170754Sdelphij    {
711170754Sdelphij      middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
712170754Sdelphij      suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
713170754Sdelphij      for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
714170754Sdelphij	continue;
715170754Sdelphij      alloc_lines0 = (prefix_count + middle_guess
716170754Sdelphij		      + MIN (context, suffix_guess));
717170754Sdelphij    }
718170754Sdelphij  else
719170754Sdelphij    {
720170754Sdelphij      prefix_count = 0;
721170754Sdelphij      alloc_lines0 = guess_lines (0, 0, n0);
722170754Sdelphij    }
723170754Sdelphij
724170754Sdelphij  prefix_mask = prefix_count - 1;
725170754Sdelphij  lines = 0;
726170754Sdelphij  linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
727170754Sdelphij  p0 = buffer0;
728170754Sdelphij
729170754Sdelphij  /* If the prefix is needed, find the prefix lines.  */
730170754Sdelphij  if (! (no_diff_means_no_output
731170754Sdelphij	 && filevec[0].prefix_end == p0
732170754Sdelphij	 && filevec[1].prefix_end == p1))
733170754Sdelphij    {
734170754Sdelphij      end0 = filevec[0].prefix_end;
735170754Sdelphij      while (p0 != end0)
736170754Sdelphij	{
737170754Sdelphij	  lin l = lines++ & prefix_mask;
738170754Sdelphij	  if (l == alloc_lines0)
739170754Sdelphij	    {
740170754Sdelphij	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
741170754Sdelphij		xalloc_die ();
742170754Sdelphij	      alloc_lines0 *= 2;
743170754Sdelphij	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
744170754Sdelphij	    }
745170754Sdelphij	  linbuf0[l] = p0;
746170754Sdelphij	  while (*p0++ != '\n')
747170754Sdelphij	    continue;
748170754Sdelphij	}
749170754Sdelphij    }
750170754Sdelphij  buffered_prefix = prefix_count && context < lines ? context : lines;
751170754Sdelphij
752170754Sdelphij  /* Allocate line buffer 1.  */
753170754Sdelphij
754170754Sdelphij  middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
755170754Sdelphij  suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
756170754Sdelphij  alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
757170754Sdelphij  if (alloc_lines1 < buffered_prefix
758170754Sdelphij      || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
759170754Sdelphij    xalloc_die ();
760170754Sdelphij  linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
761170754Sdelphij
762170754Sdelphij  if (buffered_prefix != lines)
763170754Sdelphij    {
764170754Sdelphij      /* Rotate prefix lines to proper location.  */
765170754Sdelphij      for (i = 0;  i < buffered_prefix;  i++)
766170754Sdelphij	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
767170754Sdelphij      for (i = 0;  i < buffered_prefix;  i++)
768170754Sdelphij	linbuf0[i] = linbuf1[i];
769170754Sdelphij    }
770170754Sdelphij
771170754Sdelphij  /* Initialize line buffer 1 from line buffer 0.  */
772170754Sdelphij  for (i = 0; i < buffered_prefix; i++)
773170754Sdelphij    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
774170754Sdelphij
775170754Sdelphij  /* Record the line buffer, adjusted so that
776170754Sdelphij     linbuf[0] points at the first differing line.  */
777170754Sdelphij  filevec[0].linbuf = linbuf0 + buffered_prefix;
778170754Sdelphij  filevec[1].linbuf = linbuf1 + buffered_prefix;
779170754Sdelphij  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
780170754Sdelphij  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
781170754Sdelphij  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
782170754Sdelphij  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
783170754Sdelphij}
784170754Sdelphij
785170754Sdelphij/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786170754Sdelphij   than 2**k.  This table is derived from Chris K. Caldwell's list
787170754Sdelphij   <http://www.utm.edu/research/primes/lists/2small/>.  */
788170754Sdelphij
789170754Sdelphijstatic unsigned char const prime_offset[] =
790170754Sdelphij{
791170754Sdelphij  0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792170754Sdelphij  15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793170754Sdelphij  11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
794170754Sdelphij  55, 93, 1, 57, 25
795170754Sdelphij};
796170754Sdelphij
797170754Sdelphij/* Verify that this host's size_t is not too wide for the above table.  */
798170754Sdelphij
799170754Sdelphijverify (enough_prime_offsets,
800170754Sdelphij	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
801170754Sdelphij
802170754Sdelphij/* Given a vector of two file_data objects, read the file associated
803170754Sdelphij   with each one, and build the table of equivalence classes.
804170754Sdelphij   Return nonzero if either file appears to be a binary file.
805170754Sdelphij   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
806170754Sdelphij
807170754Sdelphijbool
808170754Sdelphijread_files (struct file_data filevec[], bool pretend_binary)
809170754Sdelphij{
810170754Sdelphij  int i;
811170754Sdelphij  bool skip_test = text | pretend_binary;
812170754Sdelphij  bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
813170754Sdelphij
814170754Sdelphij  if (filevec[0].desc != filevec[1].desc)
815170754Sdelphij    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
816170754Sdelphij  else
817170754Sdelphij    {
818170754Sdelphij      filevec[1].buffer = filevec[0].buffer;
819170754Sdelphij      filevec[1].bufsize = filevec[0].bufsize;
820170754Sdelphij      filevec[1].buffered = filevec[0].buffered;
821170754Sdelphij    }
822170754Sdelphij  if (appears_binary)
823170754Sdelphij    {
824170754Sdelphij      set_binary_mode (filevec[0].desc, true);
825170754Sdelphij      set_binary_mode (filevec[1].desc, true);
826170754Sdelphij      return true;
827170754Sdelphij    }
828170754Sdelphij
829170754Sdelphij  find_identical_ends (filevec);
830170754Sdelphij
831170754Sdelphij  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
832170754Sdelphij  if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
833170754Sdelphij    xalloc_die ();
834170754Sdelphij  equivs = xmalloc (equivs_alloc * sizeof *equivs);
835170754Sdelphij  /* Equivalence class 0 is permanently safe for lines that were not
836170754Sdelphij     hashed.  Real equivalence classes start at 1.  */
837170754Sdelphij  equivs_index = 1;
838170754Sdelphij
839170754Sdelphij  /* Allocate (one plus) a prime number of hash buckets.  Use a prime
840170754Sdelphij     number between 1/3 and 2/3 of the value of equiv_allocs,
841170754Sdelphij     approximately.  */
842170754Sdelphij  for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
843170754Sdelphij    continue;
844170754Sdelphij  nbuckets = ((size_t) 1 << i) - prime_offset[i];
845170754Sdelphij  if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
846170754Sdelphij    xalloc_die ();
847170754Sdelphij  buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
848170754Sdelphij  buckets++;
849170754Sdelphij
850170754Sdelphij  for (i = 0; i < 2; i++)
851170754Sdelphij    find_and_hash_each_line (&filevec[i]);
852170754Sdelphij
853170754Sdelphij  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
854170754Sdelphij
855170754Sdelphij  free (equivs);
856170754Sdelphij  free (buckets - 1);
857170754Sdelphij
858170754Sdelphij  return false;
859170754Sdelphij}
860