diff/src/io.c

170754Sdelphij/* File I/O for GNU DIFF.
170754Sdelphij
170754Sdelphij   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
170754Sdelphij   2004 Free Software Foundation, Inc.
170754Sdelphij
170754Sdelphij   This file is part of GNU DIFF.
170754Sdelphij
170754Sdelphij   GNU DIFF is free software; you can redistribute it and/or modify
170754Sdelphij   it under the terms of the GNU General Public License as published by
170754Sdelphij   the Free Software Foundation; either version 2, or (at your option)
170754Sdelphij   any later version.
170754Sdelphij
170754Sdelphij   GNU DIFF is distributed in the hope that it will be useful,
170754Sdelphij   but WITHOUT ANY WARRANTY; without even the implied warranty of
170754Sdelphij   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
170754Sdelphij   GNU General Public License for more details.
170754Sdelphij
170754Sdelphij   You should have received a copy of the GNU General Public License
170754Sdelphij   along with this program; see the file COPYING.
170754Sdelphij   If not, write to the Free Software Foundation,
170754Sdelphij   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
170754Sdelphij
170754Sdelphij#include "diff.h"
170754Sdelphij#include <cmpbuf.h>
170754Sdelphij#include <file-type.h>
170754Sdelphij#include <setmode.h>
170754Sdelphij#include <xalloc.h>
170754Sdelphij
170754Sdelphij/* Rotate an unsigned value to the left.
   V is expanded three times and N twice, so neither argument may
   have side effects.  N must be greater than 0 and less than the
   width of V in bits; otherwise one of the two shift counts falls
   outside the range for which C defines the result.  */
170754Sdelphij#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
170754Sdelphij
170754Sdelphij/* Given a hash value and a new character, return a new hash value.
   The old hash is rotated left by 7 bits and the character is added.  */
170754Sdelphij#define HASH(h, c) ((c) + ROL (h, 7))
170754Sdelphij
170754Sdelphij/* The type of a hash value.  It must be unsigned; the compile-time
   check that follows enforces this.  */
170754Sdelphijtypedef size_t hash_value;
170754Sdelphijverify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
170754Sdelphij
170754Sdelphij/* Lines are put into equivalence classes of lines that match in lines_differ.
170754Sdelphij   Each equivalence class is represented by one of these structures,
170754Sdelphij   but only while the classes are being computed.
170754Sdelphij   Afterward, each class is represented by a number.
   The structures live in the `equivs' array and are chained through
   NEXT, which is an index into that array; find_and_hash_each_line
   treats a NEXT of 0 as the end of a chain.  */
170754Sdelphijstruct equivclass
170754Sdelphij{
170754Sdelphij  lin next;		/* Next item in this bucket.  */
170754Sdelphij  hash_value hash;	/* Hash of lines in this class.  */
170754Sdelphij  char const *line;	/* A line that fits this class.  */
170754Sdelphij  size_t length;	/* That line's length, not counting its newline.  */
170754Sdelphij};
170754Sdelphij
170754Sdelphij/* Hash-table: array of buckets, each being a chain of equivalence classes.
170754Sdelphij   buckets[-1] is reserved for incomplete lines.
   Each bucket holds the index in `equivs' of the first class in its
   chain; find_and_hash_each_line treats index 0 as an empty chain.  */
170754Sdelphijstatic lin *buckets;
170754Sdelphij
170754Sdelphij/* Number of buckets in the hash table array, not counting buckets[-1].
   A line's bucket is chosen as its hash value modulo this number.  */
170754Sdelphijstatic size_t nbuckets;
170754Sdelphij
170754Sdelphij/* Array in which the equivalence classes are allocated.
170754Sdelphij   The bucket-chains go through the elements in this array.
170754Sdelphij   The number of an equivalence class is its index in this array.
   find_and_hash_each_line may reallocate the array, so this pointer
   can change while a file is being hashed.  */
170754Sdelphijstatic struct equivclass *equivs;
170754Sdelphij
170754Sdelphij/* Index of first free element in the array `equivs'.  */
170754Sdelphijstatic lin equivs_index;
170754Sdelphij
170754Sdelphij/* Number of elements allocated in the array `equivs'.
   It is doubled whenever equivs_index catches up with it.  */
170754Sdelphijstatic lin equivs_alloc;
170754Sdelphij
170754Sdelphij/* Read a block of data into a file buffer, checking for EOF and error.  */
170754Sdelphij
170754Sdelphijvoid
170754Sdelphijfile_block_read (struct file_data *current, size_t size)
170754Sdelphij{
170754Sdelphij  if (size && ! current->eof)
170754Sdelphij    {
170754Sdelphij      size_t s = block_read (current->desc,
170754Sdelphij			     FILE_BUFFER (current) + current->buffered, size);
170754Sdelphij      if (s == SIZE_MAX)
170754Sdelphij	pfatal_with_name (current->name);
170754Sdelphij      current->buffered += s;
170754Sdelphij      current->eof = s < size;
170754Sdelphij    }
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* Check for binary files and compare them for exact identity.  */
170754Sdelphij
170754Sdelphij/* Return 1 if BUF contains a non text character.
170754Sdelphij   SIZE is the number of characters in BUF.
   The only byte treated as non-text is the NUL byte: the test is
   simply whether memchr finds a zero byte within the first SIZE
   bytes of BUF.  */
170754Sdelphij
170754Sdelphij#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
170754Sdelphij
170754Sdelphij/* Get ready to read the current file.
170754Sdelphij   Return nonzero if SKIP_TEST is zero,
170754Sdelphij   and if it appears to be a binary file.  */
170754Sdelphij
170754Sdelphijstatic bool
170754Sdelphijsip (struct file_data *current, bool skip_test)
170754Sdelphij{
170754Sdelphij  /* If we have a nonexistent file at this stage, treat it as empty.  */
170754Sdelphij  if (current->desc < 0)
170754Sdelphij    {
170754Sdelphij      /* Leave room for a sentinel.  */
170754Sdelphij      current->bufsize = sizeof (word);
170754Sdelphij      current->buffer = xmalloc (current->bufsize);
170754Sdelphij    }
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      current->bufsize = buffer_lcm (sizeof (word),
170754Sdelphij				     STAT_BLOCKSIZE (current->stat),
170754Sdelphij				     PTRDIFF_MAX - 2 * sizeof (word));
170754Sdelphij      current->buffer = xmalloc (current->bufsize);
170754Sdelphij
170754Sdelphij      if (! skip_test)
170754Sdelphij	{
170754Sdelphij	  /* Check first part of file to see if it's a binary file.  */
170754Sdelphij
170754Sdelphij	  bool was_binary = set_binary_mode (current->desc, true);
170754Sdelphij	  off_t buffered;
170754Sdelphij	  file_block_read (current, current->bufsize);
170754Sdelphij	  buffered = current->buffered;
170754Sdelphij
170754Sdelphij	  if (! was_binary)
170754Sdelphij	    {
170754Sdelphij	      /* Revert to text mode and seek back to the beginning to
170754Sdelphij		 reread the file.  Use relative seek, since file
170754Sdelphij		 descriptors like stdin might not start at offset
170754Sdelphij		 zero.  */
170754Sdelphij
170754Sdelphij	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
170754Sdelphij		pfatal_with_name (current->name);
170754Sdelphij	      set_binary_mode (current->desc, false);
170754Sdelphij	      current->buffered = 0;
170754Sdelphij	      current->eof = false;
170754Sdelphij	    }
170754Sdelphij
170754Sdelphij	  return binary_file_p (current->buffer, buffered);
170754Sdelphij	}
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  current->buffered = 0;
170754Sdelphij  current->eof = false;
170754Sdelphij  return false;
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* Slurp the rest of the current file completely into memory.  */
170754Sdelphij
170754Sdelphijstatic void
170754Sdelphijslurp (struct file_data *current)
170754Sdelphij{
170754Sdelphij  size_t cc;
170754Sdelphij
170754Sdelphij  if (current->desc < 0)
170754Sdelphij    {
170754Sdelphij      /* The file is nonexistent.  */
170754Sdelphij      return;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  if (S_ISREG (current->stat.st_mode))
170754Sdelphij    {
170754Sdelphij      /* It's a regular file; slurp in the rest all at once.  */
170754Sdelphij
170754Sdelphij      /* Get the size out of the stat block.
170754Sdelphij	 Allocate just enough room for appended newline plus word sentinel,
170754Sdelphij	 plus word-alignment since we want the buffer word-aligned.  */
170754Sdelphij      size_t file_size = current->stat.st_size;
170754Sdelphij      cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
170754Sdelphij      if (file_size != current->stat.st_size || cc < file_size
170754Sdelphij	  || PTRDIFF_MAX <= cc)
170754Sdelphij	xalloc_die ();
170754Sdelphij
170754Sdelphij      if (current->bufsize < cc)
170754Sdelphij	{
170754Sdelphij	  current->bufsize = cc;
170754Sdelphij	  current->buffer = xrealloc (current->buffer, cc);
170754Sdelphij	}
170754Sdelphij
170754Sdelphij      /* Try to read at least 1 more byte than the size indicates, to
170754Sdelphij	 detect whether the file is growing.  This is a nicety for
170754Sdelphij	 users who run 'diff' on files while they are changing.  */
170754Sdelphij
170754Sdelphij      if (current->buffered <= file_size)
170754Sdelphij	{
170754Sdelphij	  file_block_read (current, file_size + 1 - current->buffered);
170754Sdelphij	  if (current->buffered <= file_size)
170754Sdelphij	    return;
170754Sdelphij	}
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* It's not a regular file, or it's a growing regular file; read it,
170754Sdelphij     growing the buffer as needed.  */
170754Sdelphij
170754Sdelphij  file_block_read (current, current->bufsize - current->buffered);
170754Sdelphij
170754Sdelphij  if (current->buffered)
170754Sdelphij    {
170754Sdelphij      while (current->buffered == current->bufsize)
170754Sdelphij	{
170754Sdelphij	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
170754Sdelphij	    xalloc_die ();
170754Sdelphij	  current->bufsize *= 2;
170754Sdelphij	  current->buffer = xrealloc (current->buffer, current->bufsize);
170754Sdelphij	  file_block_read (current, current->bufsize - current->buffered);
170754Sdelphij	}
170754Sdelphij
170754Sdelphij      /* Allocate just enough room for appended newline plus word
170754Sdelphij	 sentinel, plus word-alignment.  */
170754Sdelphij      cc = current->buffered + 2 * sizeof (word);
170754Sdelphij      current->bufsize = cc - cc % sizeof (word);
170754Sdelphij      current->buffer = xrealloc (current->buffer, current->bufsize);
170754Sdelphij    }
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* Split the file into lines, simultaneously computing the equivalence
170754Sdelphij   class for each line.

   Only the lines between CURRENT->prefix_end and CURRENT->suffix_begin
   are hashed.  Each such line is hashed according to the ignore_case
   and ignore_white_space settings, then looked up in (or added to)
   the hash table of equivalence classes; its start is stored in the
   line table and its class number in a freshly allocated array.
   After that, the starts of lines in the suffix are recorded too,
   without classes.

   On return CURRENT->linbuf, buffered_lines, valid_lines, alloc_lines
   and equivs are set, and the file-scope variables equivs,
   equivs_alloc and equivs_index reflect any classes that were added.  */
170754Sdelphij
170754Sdelphijstatic void
170754Sdelphijfind_and_hash_each_line (struct file_data *current)
170754Sdelphij{
170754Sdelphij  hash_value h;
170754Sdelphij  char const *p = current->prefix_end;
170754Sdelphij  unsigned char c;
170754Sdelphij  lin i, *bucket;
170754Sdelphij  size_t length;
170754Sdelphij
170754Sdelphij  /* Cache often-used quantities in local variables to help the compiler.  */
170754Sdelphij  char const **linbuf = current->linbuf;
170754Sdelphij  lin alloc_lines = current->alloc_lines;
170754Sdelphij  lin line = 0;
170754Sdelphij  lin linbuf_base = current->linbuf_base;
170754Sdelphij  lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
170754Sdelphij  struct equivclass *eqs = equivs;
170754Sdelphij  lin eqs_index = equivs_index;
170754Sdelphij  lin eqs_alloc = equivs_alloc;
170754Sdelphij  char const *suffix_begin = current->suffix_begin;
170754Sdelphij  char const *bufend = FILE_BUFFER (current) + current->buffered;
  /* When white space is being ignored, lines of different length may
     still be equivalent, so a length mismatch cannot rule a class out.  */
170754Sdelphij  bool diff_length_compare_anyway =
170754Sdelphij    ignore_white_space != IGNORE_NO_WHITE_SPACE;
  /* Likewise, when case or white space is ignored, lines of equal
     length but different bytes may still be equivalent.  */
170754Sdelphij  bool same_length_diff_contents_compare_anyway =
170754Sdelphij    diff_length_compare_anyway | ignore_case;
170754Sdelphij
170754Sdelphij  while (p < suffix_begin)
170754Sdelphij    {
      /* IP remembers where this line starts; P advances past it.  */
170754Sdelphij      char const *ip = p;
170754Sdelphij
170754Sdelphij      h = 0;
170754Sdelphij
170754Sdelphij      /* Hash this line until we find a newline.  */
      /* The two switches below are the same except that the first
	 one folds each character with tolower before hashing it.  */
170754Sdelphij      if (ignore_case)
170754Sdelphij	switch (ignore_white_space)
170754Sdelphij	  {
170754Sdelphij	  case IGNORE_ALL_SPACE:
	    /* White space contributes nothing to the hash.  */
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      if (! isspace (c))
170754Sdelphij		h = HASH (h, tolower (c));
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  case IGNORE_SPACE_CHANGE:
	    /* Each run of white space hashes as a single space, except
	       that a run reaching the newline contributes nothing.  */
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      {
170754Sdelphij		if (isspace (c))
170754Sdelphij		  {
170754Sdelphij		    do
170754Sdelphij		      if ((c = *p++) == '\n')
170754Sdelphij			goto hashing_done;
170754Sdelphij		    while (isspace (c));
170754Sdelphij
170754Sdelphij		    h = HASH (h, ' ');
170754Sdelphij		  }
170754Sdelphij
170754Sdelphij		/* C is now the first non-space.  */
170754Sdelphij		h = HASH (h, tolower (c));
170754Sdelphij	      }
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  case IGNORE_TAB_EXPANSION:
170754Sdelphij	    {
	      /* COLUMN tracks the current output column so that a tab
		 can be hashed as the spaces it would expand to.  */
170754Sdelphij	      size_t column = 0;
170754Sdelphij	      while ((c = *p++) != '\n')
170754Sdelphij		{
170754Sdelphij		  size_t repetitions = 1;
170754Sdelphij
170754Sdelphij		  switch (c)
170754Sdelphij		    {
170754Sdelphij		    case '\b':
		      /* Backspace moves left, but not past column 0.  */
170754Sdelphij		      column -= 0 < column;
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    case '\t':
		      /* Hash as spaces up to the next tab stop; reset
			 COLUMN to 0 if the addition wrapped around.  */
170754Sdelphij		      c = ' ';
170754Sdelphij		      repetitions = tabsize - column % tabsize;
170754Sdelphij		      column = (column + repetitions < column
170754Sdelphij				? 0
170754Sdelphij				: column + repetitions);
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    case '\r':
170754Sdelphij		      column = 0;
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    default:
170754Sdelphij		      c = tolower (c);
170754Sdelphij		      column++;
170754Sdelphij		      break;
170754Sdelphij		    }
170754Sdelphij
170754Sdelphij		  do
170754Sdelphij		    h = HASH (h, c);
170754Sdelphij		  while (--repetitions != 0);
170754Sdelphij		}
170754Sdelphij	    }
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  default:
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      h = HASH (h, tolower (c));
170754Sdelphij	    break;
170754Sdelphij	  }
170754Sdelphij      else
170754Sdelphij	switch (ignore_white_space)
170754Sdelphij	  {
170754Sdelphij	  case IGNORE_ALL_SPACE:
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      if (! isspace (c))
170754Sdelphij		h = HASH (h, c);
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  case IGNORE_SPACE_CHANGE:
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      {
170754Sdelphij		if (isspace (c))
170754Sdelphij		  {
170754Sdelphij		    do
170754Sdelphij		      if ((c = *p++) == '\n')
170754Sdelphij			goto hashing_done;
170754Sdelphij		    while (isspace (c));
170754Sdelphij
170754Sdelphij		    h = HASH (h, ' ');
170754Sdelphij		  }
170754Sdelphij
170754Sdelphij		/* C is now the first non-space.  */
170754Sdelphij		h = HASH (h, c);
170754Sdelphij	      }
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  case IGNORE_TAB_EXPANSION:
170754Sdelphij	    {
170754Sdelphij	      size_t column = 0;
170754Sdelphij	      while ((c = *p++) != '\n')
170754Sdelphij		{
170754Sdelphij		  size_t repetitions = 1;
170754Sdelphij
170754Sdelphij		  switch (c)
170754Sdelphij		    {
170754Sdelphij		    case '\b':
170754Sdelphij		      column -= 0 < column;
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    case '\t':
170754Sdelphij		      c = ' ';
170754Sdelphij		      repetitions = tabsize - column % tabsize;
170754Sdelphij		      column = (column + repetitions < column
170754Sdelphij				? 0
170754Sdelphij				: column + repetitions);
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    case '\r':
170754Sdelphij		      column = 0;
170754Sdelphij		      break;
170754Sdelphij
170754Sdelphij		    default:
170754Sdelphij		      column++;
170754Sdelphij		      break;
170754Sdelphij		    }
170754Sdelphij
170754Sdelphij		  do
170754Sdelphij		    h = HASH (h, c);
170754Sdelphij		  while (--repetitions != 0);
170754Sdelphij		}
170754Sdelphij	    }
170754Sdelphij	    break;
170754Sdelphij
170754Sdelphij	  default:
170754Sdelphij	    while ((c = *p++) != '\n')
170754Sdelphij	      h = HASH (h, c);
170754Sdelphij	    break;
170754Sdelphij	  }
170754Sdelphij
170754Sdelphij   hashing_done:;
170754Sdelphij
      /* P now points just past the newline that ends this line, so
	 LENGTH is the line's length without that newline.  */
170754Sdelphij      bucket = &buckets[h % nbuckets];
170754Sdelphij      length = p - ip - 1;
170754Sdelphij
170754Sdelphij      if (p == bufend
170754Sdelphij	  && current->missing_newline
170754Sdelphij	  && ROBUST_OUTPUT_STYLE (output_style))
170754Sdelphij	{
170754Sdelphij	  /* This line is incomplete.  If this is significant,
170754Sdelphij	     put the line into buckets[-1].  */
170754Sdelphij	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
170754Sdelphij	    bucket = &buckets[-1];
170754Sdelphij
170754Sdelphij	  /* Omit the inserted newline when computing linbuf later.  */
170754Sdelphij	  p--;
170754Sdelphij	  bufend = suffix_begin = p;
170754Sdelphij	}
170754Sdelphij
      /* Walk the bucket's chain looking for a class this line belongs
	 to.  Index 0 marks the end of the chain, at which point a new
	 class is created at the head of the chain.  Either way, I ends
	 up as the line's class number.  */
170754Sdelphij      for (i = *bucket;  ;  i = eqs[i].next)
170754Sdelphij	if (!i)
170754Sdelphij	  {
170754Sdelphij	    /* Create a new equivalence class in this bucket.  */
170754Sdelphij	    i = eqs_index++;
170754Sdelphij	    if (i == eqs_alloc)
170754Sdelphij	      {
		/* Double the class array, dying if its byte size
		   would become too large.  */
170754Sdelphij		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
170754Sdelphij		  xalloc_die ();
170754Sdelphij		eqs_alloc *= 2;
170754Sdelphij		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
170754Sdelphij	      }
170754Sdelphij	    eqs[i].next = *bucket;
170754Sdelphij	    eqs[i].hash = h;
170754Sdelphij	    eqs[i].line = ip;
170754Sdelphij	    eqs[i].length = length;
170754Sdelphij	    *bucket = i;
170754Sdelphij	    break;
170754Sdelphij	  }
170754Sdelphij	else if (eqs[i].hash == h)
170754Sdelphij	  {
170754Sdelphij	    char const *eqline = eqs[i].line;
170754Sdelphij
170754Sdelphij	    /* Reuse existing class if lines_differ reports the lines
170754Sdelphij               equal.  */
170754Sdelphij	    if (eqs[i].length == length)
170754Sdelphij	      {
170754Sdelphij		/* Reuse existing equivalence class if the lines are identical.
170754Sdelphij		   This detects the common case of exact identity
170754Sdelphij		   faster than lines_differ would.  */
170754Sdelphij		if (memcmp (eqline, ip, length) == 0)
170754Sdelphij		  break;
170754Sdelphij		if (!same_length_diff_contents_compare_anyway)
170754Sdelphij		  continue;
170754Sdelphij	      }
170754Sdelphij	    else if (!diff_length_compare_anyway)
170754Sdelphij	      continue;
170754Sdelphij
170754Sdelphij	    if (! lines_differ (eqline, ip))
170754Sdelphij	      break;
170754Sdelphij	  }
170754Sdelphij
170754Sdelphij      /* Maybe increase the size of the line table.  */
170754Sdelphij      if (line == alloc_lines)
170754Sdelphij	{
170754Sdelphij	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
170754Sdelphij	  if (PTRDIFF_MAX / 3 <= alloc_lines
170754Sdelphij	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
170754Sdelphij	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
170754Sdelphij	    xalloc_die ();
170754Sdelphij	  alloc_lines = 2 * alloc_lines - linbuf_base;
170754Sdelphij	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
	  /* LINBUF is offset from the start of its allocation by
	     -LINBUF_BASE entries, so shift it to the allocated address
	     for xrealloc and back again afterward.  */
170754Sdelphij	  linbuf += linbuf_base;
170754Sdelphij	  linbuf = xrealloc (linbuf,
170754Sdelphij			     (alloc_lines - linbuf_base) * sizeof *linbuf);
170754Sdelphij	  linbuf -= linbuf_base;
170754Sdelphij	}
170754Sdelphij      linbuf[line] = ip;
170754Sdelphij      cureqs[line] = i;
170754Sdelphij      ++line;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  current->buffered_lines = line;
170754Sdelphij
170754Sdelphij  for (i = 0;  ;  i++)
170754Sdelphij    {
170754Sdelphij      /* Record the line start for lines in the suffix that we care about.
170754Sdelphij	 Record one more line start than lines,
170754Sdelphij	 so that we can compute the length of any buffered line.  */
      /* Only LINBUF is grown here; no class numbers are stored for
	 suffix lines, so CUREQS keeps its size.  */
170754Sdelphij      if (line == alloc_lines)
170754Sdelphij	{
170754Sdelphij	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
170754Sdelphij	  if (PTRDIFF_MAX / 3 <= alloc_lines
170754Sdelphij	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
170754Sdelphij	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
170754Sdelphij	    xalloc_die ();
170754Sdelphij	  alloc_lines = 2 * alloc_lines - linbuf_base;
170754Sdelphij	  linbuf += linbuf_base;
170754Sdelphij	  linbuf = xrealloc (linbuf,
170754Sdelphij			     (alloc_lines - linbuf_base) * sizeof *linbuf);
170754Sdelphij	  linbuf -= linbuf_base;
170754Sdelphij	}
170754Sdelphij      linbuf[line] = p;
170754Sdelphij
170754Sdelphij      if (p == bufend)
170754Sdelphij	break;
170754Sdelphij
      /* When identical files produce no output, at most CONTEXT
	 suffix lines are recorded.  */
170754Sdelphij      if (context <= i && no_diff_means_no_output)
170754Sdelphij	break;
170754Sdelphij
170754Sdelphij      line++;
170754Sdelphij
170754Sdelphij      while (*p++ != '\n')
170754Sdelphij	continue;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* Done with cache in local variables.  */
170754Sdelphij  current->linbuf = linbuf;
170754Sdelphij  current->valid_lines = line;
170754Sdelphij  current->alloc_lines = alloc_lines;
170754Sdelphij  current->equivs = cureqs;
170754Sdelphij  equivs = eqs;
170754Sdelphij  equivs_alloc = eqs_alloc;
170754Sdelphij  equivs_index = eqs_index;
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* Prepare the text.  Make sure the text end is initialized.
170754Sdelphij   Make sure text ends in a newline,
170754Sdelphij   but remember that we had to add one.
170754Sdelphij   Strip trailing CRs, if that was requested.  */
170754Sdelphij
170754Sdelphijstatic void
170754Sdelphijprepare_text (struct file_data *current)
170754Sdelphij{
170754Sdelphij  size_t buffered = current->buffered;
170754Sdelphij  char *p = FILE_BUFFER (current);
170754Sdelphij  char *dst;
170754Sdelphij
170754Sdelphij  if (buffered == 0 || p[buffered - 1] == '\n')
170754Sdelphij    current->missing_newline = false;
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      p[buffered++] = '\n';
170754Sdelphij      current->missing_newline = true;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  if (!p)
170754Sdelphij    return;
170754Sdelphij
170754Sdelphij  /* Don't use uninitialized storage when planting or using sentinels.  */
170754Sdelphij  memset (p + buffered, 0, sizeof (word));
170754Sdelphij
170754Sdelphij  if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
170754Sdelphij    {
170754Sdelphij      char const *src = dst;
170754Sdelphij      char const *srclim = p + buffered;
170754Sdelphij
170754Sdelphij      do
170754Sdelphij	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
170754Sdelphij      while (src < srclim);
170754Sdelphij
170754Sdelphij      buffered -= src - dst;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  current->buffered = buffered;
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* We have found N lines in a buffer of size S; guess the
170754Sdelphij   proportionate number of lines that will be found in a buffer of
170754Sdelphij   size T.  However, do not guess a number of lines so large that the
170754Sdelphij   resulting line table might cause overflow in size calculations.  */
170754Sdelphijstatic lin
170754Sdelphijguess_lines (lin n, size_t s, size_t t)
170754Sdelphij{
170754Sdelphij  size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
170754Sdelphij  lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
170754Sdelphij  return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
170754Sdelphij}
170754Sdelphij
170754Sdelphij/* Given a vector of two file_data objects, find the identical
170754Sdelphij   prefixes and suffixes of each object.  */
170754Sdelphij
170754Sdelphijstatic void
170754Sdelphijfind_identical_ends (struct file_data filevec[])
170754Sdelphij{
170754Sdelphij  word *w0, *w1;
170754Sdelphij  char *p0, *p1, *buffer0, *buffer1;
170754Sdelphij  char const *end0, *beg0;
170754Sdelphij  char const **linbuf0, **linbuf1;
170754Sdelphij  lin i, lines;
170754Sdelphij  size_t n0, n1;
170754Sdelphij  lin alloc_lines0, alloc_lines1;
170754Sdelphij  lin buffered_prefix, prefix_count, prefix_mask;
170754Sdelphij  lin middle_guess, suffix_guess;
170754Sdelphij
170754Sdelphij  slurp (&filevec[0]);
170754Sdelphij  prepare_text (&filevec[0]);
170754Sdelphij  if (filevec[0].desc != filevec[1].desc)
170754Sdelphij    {
170754Sdelphij      slurp (&filevec[1]);
170754Sdelphij      prepare_text (&filevec[1]);
170754Sdelphij    }
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      filevec[1].buffer = filevec[0].buffer;
170754Sdelphij      filevec[1].bufsize = filevec[0].bufsize;
170754Sdelphij      filevec[1].buffered = filevec[0].buffered;
170754Sdelphij      filevec[1].missing_newline = filevec[0].missing_newline;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* Find identical prefix.  */
170754Sdelphij
170754Sdelphij  w0 = filevec[0].buffer;
170754Sdelphij  w1 = filevec[1].buffer;
170754Sdelphij  p0 = buffer0 = (char *) w0;
170754Sdelphij  p1 = buffer1 = (char *) w1;
170754Sdelphij  n0 = filevec[0].buffered;
170754Sdelphij  n1 = filevec[1].buffered;
170754Sdelphij
170754Sdelphij  if (p0 == p1)
170754Sdelphij    /* The buffers are the same; sentinels won't work.  */
170754Sdelphij    p0 = p1 += n1;
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      /* Insert end sentinels, in this case characters that are guaranteed
170754Sdelphij	 to make the equality test false, and thus terminate the loop.  */
170754Sdelphij
170754Sdelphij      if (n0 < n1)
170754Sdelphij	p0[n0] = ~p1[n0];
170754Sdelphij      else
170754Sdelphij	p1[n1] = ~p0[n1];
170754Sdelphij
170754Sdelphij      /* Loop until first mismatch, or to the sentinel characters.  */
170754Sdelphij
170754Sdelphij      /* Compare a word at a time for speed.  */
170754Sdelphij      while (*w0 == *w1)
170754Sdelphij	w0++, w1++;
170754Sdelphij
170754Sdelphij      /* Do the last few bytes of comparison a byte at a time.  */
170754Sdelphij      p0 = (char *) w0;
170754Sdelphij      p1 = (char *) w1;
170754Sdelphij      while (*p0 == *p1)
170754Sdelphij	p0++, p1++;
170754Sdelphij
170754Sdelphij      /* Don't mistakenly count missing newline as part of prefix.  */
170754Sdelphij      if (ROBUST_OUTPUT_STYLE (output_style)
170754Sdelphij	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
170754Sdelphij	      !=
170754Sdelphij	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
170754Sdelphij	p0--, p1--;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* Now P0 and P1 point at the first nonmatching characters.  */
170754Sdelphij
170754Sdelphij  /* Skip back to last line-beginning in the prefix,
170754Sdelphij     and then discard up to HORIZON_LINES lines from the prefix.  */
170754Sdelphij  i = horizon_lines;
170754Sdelphij  while (p0 != buffer0 && (p0[-1] != '\n' || i--))
170754Sdelphij    p0--, p1--;
170754Sdelphij
170754Sdelphij  /* Record the prefix.  */
170754Sdelphij  filevec[0].prefix_end = p0;
170754Sdelphij  filevec[1].prefix_end = p1;
170754Sdelphij
170754Sdelphij  /* Find identical suffix.  */
170754Sdelphij
170754Sdelphij  /* P0 and P1 point beyond the last chars not yet compared.  */
170754Sdelphij  p0 = buffer0 + n0;
170754Sdelphij  p1 = buffer1 + n1;
170754Sdelphij
170754Sdelphij  if (! ROBUST_OUTPUT_STYLE (output_style)
170754Sdelphij      || filevec[0].missing_newline == filevec[1].missing_newline)
170754Sdelphij    {
170754Sdelphij      end0 = p0;	/* Addr of last char in file 0.  */
170754Sdelphij
170754Sdelphij      /* Get value of P0 at which we should stop scanning backward:
170754Sdelphij	 this is when either P0 or P1 points just past the last char
170754Sdelphij	 of the identical prefix.  */
170754Sdelphij      beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
170754Sdelphij
170754Sdelphij      /* Scan back until chars don't match or we reach that point.  */
170754Sdelphij      for (; p0 != beg0; p0--, p1--)
170754Sdelphij	if (*p0 != *p1)
170754Sdelphij	  {
170754Sdelphij	    /* Point at the first char of the matching suffix.  */
170754Sdelphij	    beg0 = p0;
170754Sdelphij	    break;
170754Sdelphij	  }
170754Sdelphij
170754Sdelphij      /* Are we at a line-beginning in both files?  If not, add the rest of
170754Sdelphij	 this line to the main body.  Discard up to HORIZON_LINES lines from
170754Sdelphij	 the identical suffix.  Also, discard one extra line,
170754Sdelphij	 because shift_boundaries may need it.  */
170754Sdelphij      i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
170754Sdelphij			    &&
170754Sdelphij			    (buffer1 == p1 || p1[-1] == '\n'));
170754Sdelphij      while (i-- && p0 != end0)
170754Sdelphij	while (*p0++ != '\n')
170754Sdelphij	  continue;
170754Sdelphij
170754Sdelphij      p1 += p0 - beg0;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* Record the suffix.  */
170754Sdelphij  filevec[0].suffix_begin = p0;
170754Sdelphij  filevec[1].suffix_begin = p1;
170754Sdelphij
170754Sdelphij  /* Calculate number of lines of prefix to save.
170754Sdelphij
170754Sdelphij     prefix_count == 0 means save the whole prefix;
170754Sdelphij     we need this for options like -D that output the whole file,
170754Sdelphij     or for enormous contexts (to avoid worrying about arithmetic overflow).
170754Sdelphij     We also need it for options like -F that output some preceding line;
170754Sdelphij     at least we will need to find the last few lines,
170754Sdelphij     but since we don't know how many, it's easiest to find them all.
170754Sdelphij
170754Sdelphij     Otherwise, prefix_count != 0.  Save just prefix_count lines at start
170754Sdelphij     of the line buffer; they'll be moved to the proper location later.
170754Sdelphij     Handle 1 more line than the context says (because we count 1 too many),
170754Sdelphij     rounded up to the next power of 2 to speed index computation.  */
170754Sdelphij
170754Sdelphij  if (no_diff_means_no_output && ! function_regexp.fastmap
170754Sdelphij      && context < LIN_MAX / 4 && context < n0)
170754Sdelphij    {
170754Sdelphij      middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
170754Sdelphij      suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
170754Sdelphij      for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
170754Sdelphij	continue;
170754Sdelphij      alloc_lines0 = (prefix_count + middle_guess
170754Sdelphij		      + MIN (context, suffix_guess));
170754Sdelphij    }
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      prefix_count = 0;
170754Sdelphij      alloc_lines0 = guess_lines (0, 0, n0);
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  prefix_mask = prefix_count - 1;
170754Sdelphij  lines = 0;
170754Sdelphij  linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
170754Sdelphij  p0 = buffer0;
170754Sdelphij
170754Sdelphij  /* If the prefix is needed, find the prefix lines.  */
170754Sdelphij  if (! (no_diff_means_no_output
170754Sdelphij	 && filevec[0].prefix_end == p0
170754Sdelphij	 && filevec[1].prefix_end == p1))
170754Sdelphij    {
170754Sdelphij      end0 = filevec[0].prefix_end;
170754Sdelphij      while (p0 != end0)
170754Sdelphij	{
170754Sdelphij	  lin l = lines++ & prefix_mask;
170754Sdelphij	  if (l == alloc_lines0)
170754Sdelphij	    {
170754Sdelphij	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
170754Sdelphij		xalloc_die ();
170754Sdelphij	      alloc_lines0 *= 2;
170754Sdelphij	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
170754Sdelphij	    }
170754Sdelphij	  linbuf0[l] = p0;
170754Sdelphij	  while (*p0++ != '\n')
170754Sdelphij	    continue;
170754Sdelphij	}
170754Sdelphij    }
170754Sdelphij  buffered_prefix = prefix_count && context < lines ? context : lines;
170754Sdelphij
170754Sdelphij  /* Allocate line buffer 1.  */
170754Sdelphij
170754Sdelphij  middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
170754Sdelphij  suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
170754Sdelphij  alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
170754Sdelphij  if (alloc_lines1 < buffered_prefix
170754Sdelphij      || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
170754Sdelphij    xalloc_die ();
170754Sdelphij  linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
170754Sdelphij
170754Sdelphij  if (buffered_prefix != lines)
170754Sdelphij    {
170754Sdelphij      /* Rotate prefix lines to proper location.  */
170754Sdelphij      for (i = 0;  i < buffered_prefix;  i++)
170754Sdelphij	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
170754Sdelphij      for (i = 0;  i < buffered_prefix;  i++)
170754Sdelphij	linbuf0[i] = linbuf1[i];
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  /* Initialize line buffer 1 from line buffer 0.  */
170754Sdelphij  for (i = 0; i < buffered_prefix; i++)
170754Sdelphij    linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
170754Sdelphij
170754Sdelphij  /* Record the line buffer, adjusted so that
170754Sdelphij     linbuf[0] points at the first differing line.  */
170754Sdelphij  filevec[0].linbuf = linbuf0 + buffered_prefix;
170754Sdelphij  filevec[1].linbuf = linbuf1 + buffered_prefix;
170754Sdelphij  filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
170754Sdelphij  filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
170754Sdelphij  filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
170754Sdelphij  filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
170754Sdelphij}
170754Sdelphij
/* Distance from each power of two down to the nearest smaller prime:
   for every k greater than 1, 2**k - prime_offset[k] is the largest
   prime less than 2**k.  Entries 0 and 1 are zero.  The values are
   derived from Chris K. Caldwell's list
   <http://www.utm.edu/research/primes/lists/2small/>.  */

static const unsigned char prime_offset[] =
{
  /* k =  0 ..  7 */   0,   0,   1,   1,   3,   1,   3,   1,
  /* k =  8 .. 15 */   5,   3,   3,   9,   3,   1,   3,  19,
  /* k = 16 .. 23 */  15,   1,   5,   1,   3,   9,   3,  15,
  /* k = 24 .. 31 */   3,  39,   5,  39,  57,   3,  35,   1,
  /* k = 32 .. 39 */   5,   9,  41,  31,   5,  25,  45,   7,
  /* k = 40 .. 47 */  87,  21,  11,  57,  17,  55,  21, 115,
  /* k = 48 .. 55 */  59,  81,  27, 129,  47, 111,  33,  55,
  /* k = 56 .. 63 */   5,  13,  27,  55,  93,   1,  57,  25
};
170754Sdelphij
170754Sdelphij/* Verify that this host's size_t is not too wide for the above table.  */
170754Sdelphij
170754Sdelphijverify (enough_prime_offsets,
170754Sdelphij	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
170754Sdelphij
170754Sdelphij/* Given a vector of two file_data objects, read the file associated
170754Sdelphij   with each one, and build the table of equivalence classes.
170754Sdelphij   Return nonzero if either file appears to be a binary file.
170754Sdelphij   If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
170754Sdelphij
170754Sdelphijbool
170754Sdelphijread_files (struct file_data filevec[], bool pretend_binary)
170754Sdelphij{
170754Sdelphij  int i;
170754Sdelphij  bool skip_test = text | pretend_binary;
170754Sdelphij  bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
170754Sdelphij
170754Sdelphij  if (filevec[0].desc != filevec[1].desc)
170754Sdelphij    appears_binary |= sip (&filevec[1], skip_test | appears_binary);
170754Sdelphij  else
170754Sdelphij    {
170754Sdelphij      filevec[1].buffer = filevec[0].buffer;
170754Sdelphij      filevec[1].bufsize = filevec[0].bufsize;
170754Sdelphij      filevec[1].buffered = filevec[0].buffered;
170754Sdelphij    }
170754Sdelphij  if (appears_binary)
170754Sdelphij    {
170754Sdelphij      set_binary_mode (filevec[0].desc, true);
170754Sdelphij      set_binary_mode (filevec[1].desc, true);
170754Sdelphij      return true;
170754Sdelphij    }
170754Sdelphij
170754Sdelphij  find_identical_ends (filevec);
170754Sdelphij
170754Sdelphij  equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
170754Sdelphij  if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
170754Sdelphij    xalloc_die ();
170754Sdelphij  equivs = xmalloc (equivs_alloc * sizeof *equivs);
170754Sdelphij  /* Equivalence class 0 is permanently safe for lines that were not
170754Sdelphij     hashed.  Real equivalence classes start at 1.  */
170754Sdelphij  equivs_index = 1;
170754Sdelphij
170754Sdelphij  /* Allocate (one plus) a prime number of hash buckets.  Use a prime
170754Sdelphij     number between 1/3 and 2/3 of the value of equiv_allocs,
170754Sdelphij     approximately.  */
170754Sdelphij  for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
170754Sdelphij    continue;
170754Sdelphij  nbuckets = ((size_t) 1 << i) - prime_offset[i];
170754Sdelphij  if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
170754Sdelphij    xalloc_die ();
170754Sdelphij  buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
170754Sdelphij  buckets++;
170754Sdelphij
170754Sdelphij  for (i = 0; i < 2; i++)
170754Sdelphij    find_and_hash_each_line (&filevec[i]);
170754Sdelphij
170754Sdelphij  filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
170754Sdelphij
170754Sdelphij  free (equivs);
170754Sdelphij  free (buckets - 1);
170754Sdelphij
170754Sdelphij  return false;
170754Sdelphij}