1/* File I/O for GNU DIFF.
2   Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
3
4This file is part of GNU DIFF.
5
6GNU DIFF is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU DIFF is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14GNU General Public License for more details.
15
16*/
17
18#include "diff.h"
19
20/* Rotate a value n bits to the left. */
21#define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
22#define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
23
24/* Given a hash value and a new character, return a new hash value. */
25#define HASH(h, c) ((c) + ROL (h, 7))
26
27/* Guess remaining number of lines from number N of lines so far,
28   size S so far, and total size T.  */
29#define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
30
31/* Type used for fast prefix comparison in find_identical_ends.  */
32#ifndef word
33#define word int
34#endif
35
36/* Lines are put into equivalence classes (of lines that match in line_cmp).
37   Each equivalence class is represented by one of these structures,
38   but only while the classes are being computed.
39   Afterward, each class is represented by a number.  */
40struct equivclass
41{
42  int next;	/* Next item in this bucket. */
43  unsigned hash;	/* Hash of lines in this class.  */
44  char const *line;	/* A line that fits this class. */
45  size_t length;	/* That line's length, not counting its newline.  */
46};
47
48/* Hash-table: array of buckets, each being a chain of equivalence classes.
49   buckets[-1] is reserved for incomplete lines.  */
50static int *buckets;
51
52/* Number of buckets in the hash table array, not counting buckets[-1]. */
53static int nbuckets;
54
55/* Array in which the equivalence classes are allocated.
56   The bucket-chains go through the elements in this array.
57   The number of an equivalence class is its index in this array.  */
58static struct equivclass *equivs;
59
60/* Index of first free element in the array `equivs'.  */
61static int equivs_index;
62
63/* Number of elements allocated in the array `equivs'.  */
64static int equivs_alloc;
65
66static void find_and_hash_each_line PARAMS((struct file_data *));
67static void find_identical_ends PARAMS((struct file_data[]));
68static void prepare_text_end PARAMS((struct file_data *));
69
70/* Check for binary files and compare them for exact identity.  */
71
72/* Return 1 if BUF contains a non text character.
73   SIZE is the number of characters in BUF.  */
74
75#define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
76
77/* Get ready to read the current file.
78   Return nonzero if SKIP_TEST is zero,
79   and if it appears to be a binary file.  */
80
81int
82sip (current, skip_test)
83     struct file_data *current;
84     int skip_test;
85{
86  /* If we have a nonexistent file at this stage, treat it as empty.  */
87  if (current->desc < 0)
88    {
89      /* Leave room for a sentinel.  */
90      current->bufsize = sizeof (word);
91      current->buffer = xmalloc (current->bufsize);
92    }
93  else
94    {
95      current->bufsize = STAT_BLOCKSIZE (current->stat);
96      current->buffer = xmalloc (current->bufsize);
97
98      if (! skip_test)
99	{
100	  /* Check first part of file to see if it's a binary file.  */
101#if HAVE_SETMODE
102	  int oldmode = setmode (current->desc, O_BINARY);
103#endif
104	  ssize_t n = read (current->desc, current->buffer, current->bufsize);
105	  if (n == -1)
106	    pfatal_with_name (current->name);
107	  current->buffered_chars = n;
108#if HAVE_SETMODE
109	  if (oldmode != O_BINARY)
110	    {
111	      if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
112		pfatal_with_name (current->name);
113	      setmode (current->desc, oldmode);
114	      current->buffered_chars = 0;
115	    }
116#endif
117	  return binary_file_p (current->buffer, n);
118	}
119    }
120
121  current->buffered_chars = 0;
122  return 0;
123}
124
125/* Slurp the rest of the current file completely into memory.  */
126
127void
128slurp (current)
129     struct file_data *current;
130{
131  ssize_t cc;
132
133  if (current->desc < 0)
134    /* The file is nonexistent.  */
135    ;
136  else if (S_ISREG (current->stat.st_mode))
137    {
138      /* It's a regular file; slurp in the rest all at once.  */
139
140      /* Get the size out of the stat block.
141	 Allocate enough room for appended newline and sentinel.  */
142      cc = current->stat.st_size + 1 + sizeof (word);
143      if (current->bufsize < cc)
144	{
145	  current->bufsize = cc;
146	  current->buffer = xrealloc (current->buffer, cc);
147	}
148
149      if (current->buffered_chars < current->stat.st_size)
150	{
151	  cc = read (current->desc,
152		     current->buffer + current->buffered_chars,
153		     current->stat.st_size - current->buffered_chars);
154	  if (cc == -1)
155	    pfatal_with_name (current->name);
156	  current->buffered_chars += cc;
157	}
158    }
159  /* It's not a regular file; read it, growing the buffer as needed.  */
160  else if (always_text_flag || current->buffered_chars != 0)
161    {
162      for (;;)
163	{
164	  if (current->buffered_chars == current->bufsize)
165	    {
166	      current->bufsize = current->bufsize * 2;
167	      current->buffer = xrealloc (current->buffer, current->bufsize);
168	    }
169	  cc = read (current->desc,
170		     current->buffer + current->buffered_chars,
171		     current->bufsize - current->buffered_chars);
172	  if (cc == 0)
173	    break;
174	  if (cc == -1)
175	    pfatal_with_name (current->name);
176	  current->buffered_chars += cc;
177	}
178      /* Allocate just enough room for appended newline and sentinel.  */
179      current->bufsize = current->buffered_chars + 1 + sizeof (word);
180      current->buffer = xrealloc (current->buffer, current->bufsize);
181    }
182}
183
184/* Split the file into lines, simultaneously computing the equivalence class for
185   each line. */
186
187static void
188find_and_hash_each_line (current)
189     struct file_data *current;
190{
191  unsigned h;
192  unsigned char const *p = (unsigned char const *) current->prefix_end;
193  unsigned char c;
194  int i, *bucket;
195  size_t length;
196
197  /* Cache often-used quantities in local variables to help the compiler.  */
198  char const **linbuf = current->linbuf;
199  int alloc_lines = current->alloc_lines;
200  int line = 0;
201  int linbuf_base = current->linbuf_base;
202  int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
203  struct equivclass *eqs = equivs;
204  int eqs_index = equivs_index;
205  int eqs_alloc = equivs_alloc;
206  char const *suffix_begin = current->suffix_begin;
207  char const *bufend = current->buffer + current->buffered_chars;
208  int use_line_cmp = ignore_some_line_changes;
209
210  while ((char const *) p < suffix_begin)
211    {
212      char const *ip = (char const *) p;
213
214      /* Compute the equivalence class for this line.  */
215
216      h = 0;
217
218      /* Hash this line until we find a newline. */
219      if (ignore_case_flag)
220	{
221	  if (ignore_all_space_flag)
222	    while ((c = *p++) != '\n')
223	      {
224		if (! ISSPACE (c))
225		  h = HASH (h, ISUPPER (c) ? tolower (c) : c);
226	      }
227	  else if (ignore_space_change_flag)
228	    while ((c = *p++) != '\n')
229	      {
230		if (ISSPACE (c))
231		  {
232		    for (;;)
233		      {
234			c = *p++;
235			if (!ISSPACE (c))
236			  break;
237			if (c == '\n')
238			  goto hashing_done;
239		      }
240		    h = HASH (h, ' ');
241		  }
242		/* C is now the first non-space.  */
243		h = HASH (h, ISUPPER (c) ? tolower (c) : c);
244	      }
245	  else
246	    while ((c = *p++) != '\n')
247	      h = HASH (h, ISUPPER (c) ? tolower (c) : c);
248	}
249      else
250	{
251	  if (ignore_all_space_flag)
252	    while ((c = *p++) != '\n')
253	      {
254		if (! ISSPACE (c))
255		  h = HASH (h, c);
256	      }
257	  else if (ignore_space_change_flag)
258	    while ((c = *p++) != '\n')
259	      {
260		if (ISSPACE (c))
261		  {
262		    for (;;)
263		      {
264			c = *p++;
265			if (!ISSPACE (c))
266			  break;
267			if (c == '\n')
268			  goto hashing_done;
269		      }
270		    h = HASH (h, ' ');
271		  }
272		/* C is now the first non-space.  */
273		h = HASH (h, c);
274	      }
275	  else
276	    while ((c = *p++) != '\n')
277	      h = HASH (h, c);
278	}
279   hashing_done:;
280
281      bucket = &buckets[h % nbuckets];
282      length = (char const *) p - ip - 1;
283
284      if ((char const *) p == bufend
285	  && current->missing_newline
286	  && ROBUST_OUTPUT_STYLE (output_style))
287	{
288	  /* This line is incomplete.  If this is significant,
289	     put the line into bucket[-1].  */
290	  if (! (ignore_space_change_flag | ignore_all_space_flag))
291	    bucket = &buckets[-1];
292
293	  /* Omit the inserted newline when computing linbuf later.  */
294	  p--;
295	  bufend = suffix_begin = (char const *) p;
296	}
297
298      for (i = *bucket;  ;  i = eqs[i].next)
299	if (!i)
300	  {
301	    /* Create a new equivalence class in this bucket. */
302	    i = eqs_index++;
303	    if (i == eqs_alloc)
304	      eqs = (struct equivclass *)
305		      xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
306	    eqs[i].next = *bucket;
307	    eqs[i].hash = h;
308	    eqs[i].line = ip;
309	    eqs[i].length = length;
310	    *bucket = i;
311	    break;
312	  }
313	else if (eqs[i].hash == h)
314	  {
315	    char const *eqline = eqs[i].line;
316
317	    /* Reuse existing equivalence class if the lines are identical.
318	       This detects the common case of exact identity
319	       faster than complete comparison would.  */
320	    if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
321	      break;
322
323	    /* Reuse existing class if line_cmp reports the lines equal.  */
324	    if (use_line_cmp && line_cmp (eqline, ip) == 0)
325	      break;
326	  }
327
328      /* Maybe increase the size of the line table. */
329      if (line == alloc_lines)
330	{
331	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
332	  alloc_lines = 2 * alloc_lines - linbuf_base;
333	  cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
334	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
335					     (alloc_lines - linbuf_base)
336					     * sizeof (*linbuf))
337		   - linbuf_base;
338	}
339      linbuf[line] = ip;
340      cureqs[line] = i;
341      ++line;
342    }
343
344  current->buffered_lines = line;
345
346  for (i = 0;  ;  i++)
347    {
348      /* Record the line start for lines in the suffix that we care about.
349	 Record one more line start than lines,
350	 so that we can compute the length of any buffered line.  */
351      if (line == alloc_lines)
352	{
353	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
354	  alloc_lines = 2 * alloc_lines - linbuf_base;
355	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
356					     (alloc_lines - linbuf_base)
357					     * sizeof (*linbuf))
358		   - linbuf_base;
359	}
360      linbuf[line] = (char const *) p;
361
362      if ((char const *) p == bufend)
363	break;
364
365      if (context <= i && no_diff_means_no_output)
366	break;
367
368      line++;
369
370      while (*p++ != '\n')
371	;
372    }
373
374  /* Done with cache in local variables.  */
375  current->linbuf = linbuf;
376  current->valid_lines = line;
377  current->alloc_lines = alloc_lines;
378  current->equivs = cureqs;
379  equivs = eqs;
380  equivs_alloc = eqs_alloc;
381  equivs_index = eqs_index;
382}
383
384/* Prepare the end of the text.  Make sure it's initialized.
385   Make sure text ends in a newline,
386   but remember that we had to add one.  */
387
388static void
389prepare_text_end (current)
390     struct file_data *current;
391{
392  size_t buffered_chars = current->buffered_chars;
393  char *p = current->buffer;
394
395  if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
396    current->missing_newline = 0;
397  else
398    {
399      p[buffered_chars++] = '\n';
400      current->buffered_chars = buffered_chars;
401      current->missing_newline = 1;
402    }
403
404  /* Don't use uninitialized storage when planting or using sentinels.  */
405  if (p)
406    bzero (p + buffered_chars, sizeof (word));
407}
408
409/* Given a vector of two file_data objects, find the identical
410   prefixes and suffixes of each object. */
411
412static void
413find_identical_ends (filevec)
414     struct file_data filevec[];
415{
416  word *w0, *w1;
417  char *p0, *p1, *buffer0, *buffer1;
418  char const *end0, *beg0;
419  char const **linbuf0, **linbuf1;
420  int i, lines;
421  size_t n0, n1, tem;
422  int alloc_lines0, alloc_lines1;
423  int buffered_prefix, prefix_count, prefix_mask;
424
425  slurp (&filevec[0]);
426  if (filevec[0].desc != filevec[1].desc)
427    slurp (&filevec[1]);
428  else
429    {
430      filevec[1].buffer = filevec[0].buffer;
431      filevec[1].bufsize = filevec[0].bufsize;
432      filevec[1].buffered_chars = filevec[0].buffered_chars;
433    }
434  for (i = 0; i < 2; i++)
435    prepare_text_end (&filevec[i]);
436
437  /* Find identical prefix.  */
438
439  p0 = buffer0 = filevec[0].buffer;
440  p1 = buffer1 = filevec[1].buffer;
441
442  n0 = filevec[0].buffered_chars;
443  n1 = filevec[1].buffered_chars;
444
445  if (p0 == p1)
446    /* The buffers are the same; sentinels won't work.  */
447    p0 = p1 += n1;
448  else
449    {
450      /* Insert end sentinels, in this case characters that are guaranteed
451	 to make the equality test false, and thus terminate the loop.  */
452
453      if (n0 < n1)
454	p0[n0] = ~p1[n0];
455      else
456	p1[n1] = ~p0[n1];
457
458      /* Loop until first mismatch, or to the sentinel characters.  */
459
460      /* Compare a word at a time for speed.  */
461      w0 = (word *) p0;
462      w1 = (word *) p1;
463      while (*w0++ == *w1++)
464	;
465      --w0, --w1;
466
467      /* Do the last few bytes of comparison a byte at a time.  */
468      p0 = (char *) w0;
469      p1 = (char *) w1;
470      while (*p0++ == *p1++)
471	;
472      --p0, --p1;
473
474      /* Don't mistakenly count missing newline as part of prefix. */
475      if (ROBUST_OUTPUT_STYLE (output_style)
476	  && (buffer0 + n0 - filevec[0].missing_newline < p0)
477	     !=
478	     (buffer1 + n1 - filevec[1].missing_newline < p1))
479	--p0, --p1;
480    }
481
482  /* Now P0 and P1 point at the first nonmatching characters.  */
483
484  /* Skip back to last line-beginning in the prefix,
485     and then discard up to HORIZON_LINES lines from the prefix.  */
486  i = horizon_lines;
487  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
488    --p0, --p1;
489
490  /* Record the prefix.  */
491  filevec[0].prefix_end = p0;
492  filevec[1].prefix_end = p1;
493
494  /* Find identical suffix.  */
495
496  /* P0 and P1 point beyond the last chars not yet compared.  */
497  p0 = buffer0 + n0;
498  p1 = buffer1 + n1;
499
500  if (! ROBUST_OUTPUT_STYLE (output_style)
501      || filevec[0].missing_newline == filevec[1].missing_newline)
502    {
503      end0 = p0;	/* Addr of last char in file 0.  */
504
505      /* Get value of P0 at which we should stop scanning backward:
506	 this is when either P0 or P1 points just past the last char
507	 of the identical prefix.  */
508      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
509
510      /* Scan back until chars don't match or we reach that point.  */
511      for (; p0 != beg0; p0--, p1--)
512	if (*p0 != *p1)
513	  {
514	    /* Point at the first char of the matching suffix.  */
515	    beg0 = p0;
516	    break;
517	  }
518
519      /* Are we at a line-beginning in both files?  If not, add the rest of
520	 this line to the main body.  Discard up to HORIZON_LINES lines from
521	 the identical suffix.  Also, discard one extra line,
522	 because shift_boundaries may need it.  */
523      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
524			    &&
525			    (buffer1 == p1 || p1[-1] == '\n'));
526      while (i-- && p0 != end0)
527	while (*p0++ != '\n')
528	  ;
529
530      p1 += p0 - beg0;
531    }
532
533  /* Record the suffix.  */
534  filevec[0].suffix_begin = p0;
535  filevec[1].suffix_begin = p1;
536
537  /* Calculate number of lines of prefix to save.
538
539     prefix_count == 0 means save the whole prefix;
540     we need this with for options like -D that output the whole file.
541     We also need it for options like -F that output some preceding line;
542     at least we will need to find the last few lines,
543     but since we don't know how many, it's easiest to find them all.
544
545     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
546     of the line buffer; they'll be moved to the proper location later.
547     Handle 1 more line than the context says (because we count 1 too many),
548     rounded up to the next power of 2 to speed index computation.  */
549
550  if (no_diff_means_no_output && ! function_regexp_list)
551    {
552      for (prefix_count = 1;  prefix_count < context + 1;  prefix_count *= 2)
553	;
554      prefix_mask = prefix_count - 1;
555      alloc_lines0
556	= prefix_count
557	  + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
558	  + context;
559    }
560  else
561    {
562      prefix_count = 0;
563      prefix_mask = ~0;
564      alloc_lines0 = GUESS_LINES (0, 0, n0);
565    }
566
567  lines = 0;
568  linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
569
570  /* If the prefix is needed, find the prefix lines.  */
571  if (! (no_diff_means_no_output
572	 && filevec[0].prefix_end == p0
573	 && filevec[1].prefix_end == p1))
574    {
575      p0 = buffer0;
576      end0 = filevec[0].prefix_end;
577      while (p0 != end0)
578	{
579	  int l = lines++ & prefix_mask;
580	  if (l == alloc_lines0)
581	    linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
582							 * sizeof(*linbuf0));
583	  linbuf0[l] = p0;
584	  while (*p0++ != '\n')
585	    ;
586	}
587    }
588  buffered_prefix = prefix_count && context < lines ? context : lines;
589
590  /* Allocate line buffer 1.  */
591  tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
592
593  alloc_lines1
594    = (buffered_prefix
595       + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
596       + context);
597  linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
598
599  if (buffered_prefix != lines)
600    {
601      /* Rotate prefix lines to proper location.  */
602      for (i = 0;  i < buffered_prefix;  i++)
603	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
604      for (i = 0;  i < buffered_prefix;  i++)
605	linbuf0[i] = linbuf1[i];
606    }
607
608  /* Initialize line buffer 1 from line buffer 0.  */
609  for (i = 0; i < buffered_prefix; i++)
610    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
611
612  /* Record the line buffer, adjusted so that
613     linbuf*[0] points at the first differing line.  */
614  filevec[0].linbuf = linbuf0 + buffered_prefix;
615  filevec[1].linbuf = linbuf1 + buffered_prefix;
616  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
617  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
618  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
619  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
620}
621
622/* Largest primes less than some power of two, for nbuckets.  Values range
623   from useful to preposterous.  If one of these numbers isn't prime
624   after all, don't blame it on me, blame it on primes (6) . . . */
625static int const primes[] =
626{
627  509,
628  1021,
629  2039,
630  4093,
631  8191,
632  16381,
633  32749,
634#if 32767 < INT_MAX
635  65521,
636  131071,
637  262139,
638  524287,
639  1048573,
640  2097143,
641  4194301,
642  8388593,
643  16777213,
644  33554393,
645  67108859,			/* Preposterously large . . . */
646  134217689,
647  268435399,
648  536870909,
649  1073741789,
650  2147483647,
651#endif
652  0
653};
654
655/* Given a vector of two file_data objects, read the file associated
656   with each one, and build the table of equivalence classes.
657   Return 1 if either file appears to be a binary file.
658   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
659
660int
661read_files (filevec, pretend_binary)
662     struct file_data filevec[];
663     int pretend_binary;
664{
665  int i;
666  int skip_test = always_text_flag | pretend_binary;
667  int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
668
669  if (filevec[0].desc != filevec[1].desc)
670    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
671  else
672    {
673      filevec[1].buffer = filevec[0].buffer;
674      filevec[1].bufsize = filevec[0].bufsize;
675      filevec[1].buffered_chars = filevec[0].buffered_chars;
676    }
677  if (appears_binary)
678    {
679#if HAVE_SETMODE
680      setmode (filevec[0].desc, O_BINARY);
681      setmode (filevec[1].desc, O_BINARY);
682#endif
683      return 1;
684    }
685
686  find_identical_ends (filevec);
687
688  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
689  equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
690  /* Equivalence class 0 is permanently safe for lines that were not
691     hashed.  Real equivalence classes start at 1. */
692  equivs_index = 1;
693
694  for (i = 0;  primes[i] < equivs_alloc / 3;  i++)
695    if (! primes[i])
696      abort ();
697  nbuckets = primes[i];
698
699  buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
700  bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
701
702  for (i = 0; i < 2; i++)
703    find_and_hash_each_line (&filevec[i]);
704
705  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
706
707  free (equivs);
708  free (buckets - 1);
709
710  return 0;
711}
712