io.c revision 32785
1/* File I/O for GNU DIFF.
2   Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
3
4This file is part of GNU DIFF.
5
6GNU DIFF is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU DIFF is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with GNU DIFF; see the file COPYING.  If not, write to
18the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
19
20#include "diff.h"
21
22/* Rotate a value n bits to the left. */
23#define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
24#define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
25
26/* Given a hash value and a new character, return a new hash value. */
27#define HASH(h, c) ((c) + ROL (h, 7))
28
29/* Guess remaining number of lines from number N of lines so far,
30   size S so far, and total size T.  */
31#define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5)
32
33/* Type used for fast prefix comparison in find_identical_ends.  */
34#ifndef word
35#define word int
36#endif
37
38/* Lines are put into equivalence classes (of lines that match in line_cmp).
39   Each equivalence class is represented by one of these structures,
40   but only while the classes are being computed.
41   Afterward, each class is represented by a number.  */
42struct equivclass
43{
44  int next;	/* Next item in this bucket. */
45  unsigned hash;	/* Hash of lines in this class.  */
46  char const *line;	/* A line that fits this class. */
47  size_t length;	/* That line's length, not counting its newline.  */
48};
49
50/* Hash-table: array of buckets, each being a chain of equivalence classes.
51   buckets[-1] is reserved for incomplete lines.  */
52static int *buckets;
53
54/* Number of buckets in the hash table array, not counting buckets[-1]. */
55static int nbuckets;
56
57/* Array in which the equivalence classes are allocated.
58   The bucket-chains go through the elements in this array.
59   The number of an equivalence class is its index in this array.  */
60static struct equivclass *equivs;
61
62/* Index of first free element in the array `equivs'.  */
63static int equivs_index;
64
65/* Number of elements allocated in the array `equivs'.  */
66static int equivs_alloc;
67
68static void find_and_hash_each_line PARAMS((struct file_data *));
69static void find_identical_ends PARAMS((struct file_data[]));
70static void prepare_text_end PARAMS((struct file_data *));
71
72/* Check for binary files and compare them for exact identity.  */
73
74/* Return 1 if BUF contains a non text character.
75   SIZE is the number of characters in BUF.  */
76
77#define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0)
78
79/* Get ready to read the current file.
80   Return nonzero if SKIP_TEST is zero,
81   and if it appears to be a binary file.  */
82
83int
84sip (current, skip_test)
85     struct file_data *current;
86     int skip_test;
87{
88  /* If we have a nonexistent file at this stage, treat it as empty.  */
89  if (current->desc < 0)
90    {
91      /* Leave room for a sentinel.  */
92      current->bufsize = sizeof (word);
93      current->buffer = xmalloc (current->bufsize);
94    }
95  else
96    {
97      current->bufsize = STAT_BLOCKSIZE (current->stat);
98      current->buffer = xmalloc (current->bufsize);
99
100      if (! skip_test)
101	{
102	  /* Check first part of file to see if it's a binary file.  */
103#if HAVE_SETMODE
104	  int oldmode = setmode (current->desc, O_BINARY);
105#endif
106	  size_t n = read (current->desc, current->buffer, current->bufsize);
107	  if (n == -1)
108	    pfatal_with_name (current->name);
109	  current->buffered_chars = n;
110#if HAVE_SETMODE
111	  if (oldmode != O_BINARY)
112	    {
113	      if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1)
114		pfatal_with_name (current->name);
115	      setmode (current->desc, oldmode);
116	      current->buffered_chars = 0;
117	    }
118#endif
119	  return binary_file_p (current->buffer, n);
120	}
121    }
122
123  current->buffered_chars = 0;
124  return 0;
125}
126
127/* Slurp the rest of the current file completely into memory.  */
128
129void
130slurp (current)
131     struct file_data *current;
132{
133  size_t cc;
134
135  if (current->desc < 0)
136    /* The file is nonexistent.  */
137    ;
138  else if (S_ISREG (current->stat.st_mode))
139    {
140      /* It's a regular file; slurp in the rest all at once.  */
141
142      /* Get the size out of the stat block.
143	 Allocate enough room for appended newline and sentinel.  */
144      cc = current->stat.st_size + 1 + sizeof (word);
145      if (current->bufsize < cc)
146	{
147	  current->bufsize = cc;
148	  current->buffer = xrealloc (current->buffer, cc);
149	}
150
151      if (current->buffered_chars < current->stat.st_size)
152	{
153	  cc = read (current->desc,
154		     current->buffer + current->buffered_chars,
155		     current->stat.st_size - current->buffered_chars);
156	  if (cc == -1)
157	    pfatal_with_name (current->name);
158	  current->buffered_chars += cc;
159	}
160    }
161  /* It's not a regular file; read it, growing the buffer as needed.  */
162  else if (always_text_flag || current->buffered_chars != 0)
163    {
164      for (;;)
165	{
166	  if (current->buffered_chars == current->bufsize)
167	    {
168	      current->bufsize = current->bufsize * 2;
169	      current->buffer = xrealloc (current->buffer, current->bufsize);
170	    }
171	  cc = read (current->desc,
172		     current->buffer + current->buffered_chars,
173		     current->bufsize - current->buffered_chars);
174	  if (cc == 0)
175	    break;
176	  if (cc == -1)
177	    pfatal_with_name (current->name);
178	  current->buffered_chars += cc;
179	}
180      /* Allocate just enough room for appended newline and sentinel.  */
181      current->bufsize = current->buffered_chars + 1 + sizeof (word);
182      current->buffer = xrealloc (current->buffer, current->bufsize);
183    }
184}
185
186/* Split the file into lines, simultaneously computing the equivalence class for
187   each line. */
188
189static void
190find_and_hash_each_line (current)
191     struct file_data *current;
192{
193  unsigned h;
194  unsigned char const *p = (unsigned char const *) current->prefix_end;
195  unsigned char c;
196  int i, *bucket;
197  size_t length;
198
199  /* Cache often-used quantities in local variables to help the compiler.  */
200  char const **linbuf = current->linbuf;
201  int alloc_lines = current->alloc_lines;
202  int line = 0;
203  int linbuf_base = current->linbuf_base;
204  int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int));
205  struct equivclass *eqs = equivs;
206  int eqs_index = equivs_index;
207  int eqs_alloc = equivs_alloc;
208  char const *suffix_begin = current->suffix_begin;
209  char const *bufend = current->buffer + current->buffered_chars;
210  int use_line_cmp = ignore_some_line_changes;
211
212  while ((char const *) p < suffix_begin)
213    {
214      char const *ip = (char const *) p;
215
216      /* Compute the equivalence class for this line.  */
217
218      h = 0;
219
220      /* Hash this line until we find a newline. */
221      if (ignore_case_flag)
222	{
223	  if (ignore_all_space_flag)
224	    while ((c = *p++) != '\n')
225	      {
226		if (! ISSPACE (c))
227		  h = HASH (h, ISUPPER (c) ? tolower (c) : c);
228	      }
229	  else if (ignore_space_change_flag)
230	    while ((c = *p++) != '\n')
231	      {
232		if (ISSPACE (c))
233		  {
234		    for (;;)
235		      {
236			c = *p++;
237			if (!ISSPACE (c))
238			  break;
239			if (c == '\n')
240			  goto hashing_done;
241		      }
242		    h = HASH (h, ' ');
243		  }
244		/* C is now the first non-space.  */
245		h = HASH (h, ISUPPER (c) ? tolower (c) : c);
246	      }
247	  else
248	    while ((c = *p++) != '\n')
249	      h = HASH (h, ISUPPER (c) ? tolower (c) : c);
250	}
251      else
252	{
253	  if (ignore_all_space_flag)
254	    while ((c = *p++) != '\n')
255	      {
256		if (! ISSPACE (c))
257		  h = HASH (h, c);
258	      }
259	  else if (ignore_space_change_flag)
260	    while ((c = *p++) != '\n')
261	      {
262		if (ISSPACE (c))
263		  {
264		    for (;;)
265		      {
266			c = *p++;
267			if (!ISSPACE (c))
268			  break;
269			if (c == '\n')
270			  goto hashing_done;
271		      }
272		    h = HASH (h, ' ');
273		  }
274		/* C is now the first non-space.  */
275		h = HASH (h, c);
276	      }
277	  else
278	    while ((c = *p++) != '\n')
279	      h = HASH (h, c);
280	}
281   hashing_done:;
282
283      bucket = &buckets[h % nbuckets];
284      length = (char const *) p - ip - 1;
285
286      if ((char const *) p == bufend
287	  && current->missing_newline
288	  && ROBUST_OUTPUT_STYLE (output_style))
289	{
290	  /* This line is incomplete.  If this is significant,
291	     put the line into bucket[-1].  */
292	  if (! (ignore_space_change_flag | ignore_all_space_flag))
293	    bucket = &buckets[-1];
294
295	  /* Omit the inserted newline when computing linbuf later.  */
296	  p--;
297	  bufend = suffix_begin = (char const *) p;
298	}
299
300      for (i = *bucket;  ;  i = eqs[i].next)
301	if (!i)
302	  {
303	    /* Create a new equivalence class in this bucket. */
304	    i = eqs_index++;
305	    if (i == eqs_alloc)
306	      eqs = (struct equivclass *)
307		      xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs));
308	    eqs[i].next = *bucket;
309	    eqs[i].hash = h;
310	    eqs[i].line = ip;
311	    eqs[i].length = length;
312	    *bucket = i;
313	    break;
314	  }
315	else if (eqs[i].hash == h)
316	  {
317	    char const *eqline = eqs[i].line;
318
319	    /* Reuse existing equivalence class if the lines are identical.
320	       This detects the common case of exact identity
321	       faster than complete comparison would.  */
322	    if (eqs[i].length == length && memcmp (eqline, ip, length) == 0)
323	      break;
324
325	    /* Reuse existing class if line_cmp reports the lines equal.  */
326	    if (use_line_cmp && line_cmp (eqline, ip) == 0)
327	      break;
328	  }
329
330      /* Maybe increase the size of the line table. */
331      if (line == alloc_lines)
332	{
333	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
334	  alloc_lines = 2 * alloc_lines - linbuf_base;
335	  cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs));
336	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
337					     (alloc_lines - linbuf_base)
338					     * sizeof (*linbuf))
339		   - linbuf_base;
340	}
341      linbuf[line] = ip;
342      cureqs[line] = i;
343      ++line;
344    }
345
346  current->buffered_lines = line;
347
348  for (i = 0;  ;  i++)
349    {
350      /* Record the line start for lines in the suffix that we care about.
351	 Record one more line start than lines,
352	 so that we can compute the length of any buffered line.  */
353      if (line == alloc_lines)
354	{
355	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
356	  alloc_lines = 2 * alloc_lines - linbuf_base;
357	  linbuf = (char const **) xrealloc (linbuf + linbuf_base,
358					     (alloc_lines - linbuf_base)
359					     * sizeof (*linbuf))
360		   - linbuf_base;
361	}
362      linbuf[line] = (char const *) p;
363
364      if ((char const *) p == bufend)
365	break;
366
367      if (context <= i && no_diff_means_no_output)
368	break;
369
370      line++;
371
372      while (*p++ != '\n')
373	;
374    }
375
376  /* Done with cache in local variables.  */
377  current->linbuf = linbuf;
378  current->valid_lines = line;
379  current->alloc_lines = alloc_lines;
380  current->equivs = cureqs;
381  equivs = eqs;
382  equivs_alloc = eqs_alloc;
383  equivs_index = eqs_index;
384}
385
386/* Prepare the end of the text.  Make sure it's initialized.
387   Make sure text ends in a newline,
388   but remember that we had to add one.  */
389
390static void
391prepare_text_end (current)
392     struct file_data *current;
393{
394  size_t buffered_chars = current->buffered_chars;
395  char *p = current->buffer;
396
397  if (buffered_chars == 0 || p[buffered_chars - 1] == '\n')
398    current->missing_newline = 0;
399  else
400    {
401      p[buffered_chars++] = '\n';
402      current->buffered_chars = buffered_chars;
403      current->missing_newline = 1;
404    }
405
406  /* Don't use uninitialized storage when planting or using sentinels.  */
407  if (p)
408    bzero (p + buffered_chars, sizeof (word));
409}
410
411/* Given a vector of two file_data objects, find the identical
412   prefixes and suffixes of each object. */
413
414static void
415find_identical_ends (filevec)
416     struct file_data filevec[];
417{
418  word *w0, *w1;
419  char *p0, *p1, *buffer0, *buffer1;
420  char const *end0, *beg0;
421  char const **linbuf0, **linbuf1;
422  int i, lines;
423  size_t n0, n1, tem;
424  int alloc_lines0, alloc_lines1;
425  int buffered_prefix, prefix_count, prefix_mask;
426
427  slurp (&filevec[0]);
428  if (filevec[0].desc != filevec[1].desc)
429    slurp (&filevec[1]);
430  else
431    {
432      filevec[1].buffer = filevec[0].buffer;
433      filevec[1].bufsize = filevec[0].bufsize;
434      filevec[1].buffered_chars = filevec[0].buffered_chars;
435    }
436  for (i = 0; i < 2; i++)
437    prepare_text_end (&filevec[i]);
438
439  /* Find identical prefix.  */
440
441  p0 = buffer0 = filevec[0].buffer;
442  p1 = buffer1 = filevec[1].buffer;
443
444  n0 = filevec[0].buffered_chars;
445  n1 = filevec[1].buffered_chars;
446
447  if (p0 == p1)
448    /* The buffers are the same; sentinels won't work.  */
449    p0 = p1 += n1;
450  else
451    {
452      /* Insert end sentinels, in this case characters that are guaranteed
453	 to make the equality test false, and thus terminate the loop.  */
454
455      if (n0 < n1)
456	p0[n0] = ~p1[n0];
457      else
458	p1[n1] = ~p0[n1];
459
460      /* Loop until first mismatch, or to the sentinel characters.  */
461
462      /* Compare a word at a time for speed.  */
463      w0 = (word *) p0;
464      w1 = (word *) p1;
465      while (*w0++ == *w1++)
466	;
467      --w0, --w1;
468
469      /* Do the last few bytes of comparison a byte at a time.  */
470      p0 = (char *) w0;
471      p1 = (char *) w1;
472      while (*p0++ == *p1++)
473	;
474      --p0, --p1;
475
476      /* Don't mistakenly count missing newline as part of prefix. */
477      if (ROBUST_OUTPUT_STYLE (output_style)
478	  && (buffer0 + n0 - filevec[0].missing_newline < p0)
479	     !=
480	     (buffer1 + n1 - filevec[1].missing_newline < p1))
481	--p0, --p1;
482    }
483
484  /* Now P0 and P1 point at the first nonmatching characters.  */
485
486  /* Skip back to last line-beginning in the prefix,
487     and then discard up to HORIZON_LINES lines from the prefix.  */
488  i = horizon_lines;
489  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
490    --p0, --p1;
491
492  /* Record the prefix.  */
493  filevec[0].prefix_end = p0;
494  filevec[1].prefix_end = p1;
495
496  /* Find identical suffix.  */
497
498  /* P0 and P1 point beyond the last chars not yet compared.  */
499  p0 = buffer0 + n0;
500  p1 = buffer1 + n1;
501
502  if (! ROBUST_OUTPUT_STYLE (output_style)
503      || filevec[0].missing_newline == filevec[1].missing_newline)
504    {
505      end0 = p0;	/* Addr of last char in file 0.  */
506
507      /* Get value of P0 at which we should stop scanning backward:
508	 this is when either P0 or P1 points just past the last char
509	 of the identical prefix.  */
510      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
511
512      /* Scan back until chars don't match or we reach that point.  */
513      while (p0 != beg0)
514	if (*--p0 != *--p1)
515	  {
516	    /* Point at the first char of the matching suffix.  */
517	    ++p0, ++p1;
518	    beg0 = p0;
519	    break;
520	  }
521
522      /* Are we at a line-beginning in both files?  If not, add the rest of
523	 this line to the main body.  Discard up to HORIZON_LINES lines from
524	 the identical suffix.  Also, discard one extra line,
525	 because shift_boundaries may need it.  */
526      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
527			    &&
528			    (buffer1 == p1 || p1[-1] == '\n'));
529      while (i-- && p0 != end0)
530	while (*p0++ != '\n')
531	  ;
532
533      p1 += p0 - beg0;
534    }
535
536  /* Record the suffix.  */
537  filevec[0].suffix_begin = p0;
538  filevec[1].suffix_begin = p1;
539
540  /* Calculate number of lines of prefix to save.
541
542     prefix_count == 0 means save the whole prefix;
543     we need this with for options like -D that output the whole file.
544     We also need it for options like -F that output some preceding line;
545     at least we will need to find the last few lines,
546     but since we don't know how many, it's easiest to find them all.
547
548     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
549     of the line buffer; they'll be moved to the proper location later.
550     Handle 1 more line than the context says (because we count 1 too many),
551     rounded up to the next power of 2 to speed index computation.  */
552
553  if (no_diff_means_no_output && ! function_regexp_list)
554    {
555      for (prefix_count = 1;  prefix_count < context + 1;  prefix_count *= 2)
556	;
557      prefix_mask = prefix_count - 1;
558      alloc_lines0
559	= prefix_count
560	  + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end)
561	  + context;
562    }
563  else
564    {
565      prefix_count = 0;
566      prefix_mask = ~0;
567      alloc_lines0 = GUESS_LINES (0, 0, n0);
568    }
569
570  lines = 0;
571  linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0));
572
573  /* If the prefix is needed, find the prefix lines.  */
574  if (! (no_diff_means_no_output
575	 && filevec[0].prefix_end == p0
576	 && filevec[1].prefix_end == p1))
577    {
578      p0 = buffer0;
579      end0 = filevec[0].prefix_end;
580      while (p0 != end0)
581	{
582	  int l = lines++ & prefix_mask;
583	  if (l == alloc_lines0)
584	    linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2)
585							 * sizeof(*linbuf0));
586	  linbuf0[l] = p0;
587	  while (*p0++ != '\n')
588	    ;
589	}
590    }
591  buffered_prefix = prefix_count && context < lines ? context : lines;
592
593  /* Allocate line buffer 1.  */
594  tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1;
595
596  alloc_lines1
597    = (buffered_prefix
598       + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem)
599       + context);
600  linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1));
601
602  if (buffered_prefix != lines)
603    {
604      /* Rotate prefix lines to proper location.  */
605      for (i = 0;  i < buffered_prefix;  i++)
606	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
607      for (i = 0;  i < buffered_prefix;  i++)
608	linbuf0[i] = linbuf1[i];
609    }
610
611  /* Initialize line buffer 1 from line buffer 0.  */
612  for (i = 0; i < buffered_prefix; i++)
613    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
614
615  /* Record the line buffer, adjusted so that
616     linbuf*[0] points at the first differing line.  */
617  filevec[0].linbuf = linbuf0 + buffered_prefix;
618  filevec[1].linbuf = linbuf1 + buffered_prefix;
619  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
620  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
621  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
622  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
623}
624
625/* Largest primes less than some power of two, for nbuckets.  Values range
626   from useful to preposterous.  If one of these numbers isn't prime
627   after all, don't blame it on me, blame it on primes (6) . . . */
628static int const primes[] =
629{
630  509,
631  1021,
632  2039,
633  4093,
634  8191,
635  16381,
636  32749,
637#if 32767 < INT_MAX
638  65521,
639  131071,
640  262139,
641  524287,
642  1048573,
643  2097143,
644  4194301,
645  8388593,
646  16777213,
647  33554393,
648  67108859,			/* Preposterously large . . . */
649  134217689,
650  268435399,
651  536870909,
652  1073741789,
653  2147483647,
654#endif
655  0
656};
657
658/* Given a vector of two file_data objects, read the file associated
659   with each one, and build the table of equivalence classes.
660   Return 1 if either file appears to be a binary file.
661   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
662
663int
664read_files (filevec, pretend_binary)
665     struct file_data filevec[];
666     int pretend_binary;
667{
668  int i;
669  int skip_test = always_text_flag | pretend_binary;
670  int appears_binary = pretend_binary | sip (&filevec[0], skip_test);
671
672  if (filevec[0].desc != filevec[1].desc)
673    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
674  else
675    {
676      filevec[1].buffer = filevec[0].buffer;
677      filevec[1].bufsize = filevec[0].bufsize;
678      filevec[1].buffered_chars = filevec[0].buffered_chars;
679    }
680  if (appears_binary)
681    {
682#if HAVE_SETMODE
683      setmode (filevec[0].desc, O_BINARY);
684      setmode (filevec[1].desc, O_BINARY);
685#endif
686      return 1;
687    }
688
689  find_identical_ends (filevec);
690
691  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
692  equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass));
693  /* Equivalence class 0 is permanently safe for lines that were not
694     hashed.  Real equivalence classes start at 1. */
695  equivs_index = 1;
696
697  for (i = 0;  primes[i] < equivs_alloc / 3;  i++)
698    if (! primes[i])
699      abort ();
700  nbuckets = primes[i];
701
702  buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets));
703  bzero (buckets++, (nbuckets + 1) * sizeof (*buckets));
704
705  for (i = 0; i < 2; i++)
706    find_and_hash_each_line (&filevec[i]);
707
708  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
709
710  free (equivs);
711  free (buckets - 1);
712
713  return 0;
714}
715