1/* cmp - compare two files byte by byte
2
3   Copyright (C) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1998, 2001,
4   2002, 2004 Free Software Foundation, Inc.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14   See the GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; see the file COPYING.
18   If not, write to the Free Software Foundation,
19   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
20
21#include "system.h"
22#include "paths.h"
23
24#include <stdio.h>
25
26#include <c-stack.h>
27#include <cmpbuf.h>
28#include <error.h>
29#include <exit.h>
30#include <exitfail.h>
31#include <file-type.h>
32#include <getopt.h>
33#include <hard-locale.h>
34#include <inttostr.h>
35#include <setmode.h>
36#include <unlocked-io.h>
37#include <version-etc.h>
38#include <xalloc.h>
39#include <xstrtol.h>
40
/* hard_locale_LC_MESSAGES is the result of hard_locale (LC_MESSAGES)
   when both LC_MESSAGES and native-language support are available,
   and the constant 0 otherwise.  cmp () consults it when choosing
   between the "char" and the "byte" wording of the first-difference
   message.  NOTE(review): hard_locale presumably reports whether the
   category is set to something other than the C/POSIX locale --
   confirm against hard-locale.h.  */
#if defined LC_MESSAGES && ENABLE_NLS
# define hard_locale_LC_MESSAGES hard_locale (LC_MESSAGES)
#else
# define hard_locale_LC_MESSAGES 0
#endif
46
/* Forward declarations of the file-local helpers that are defined
   after main ().  */
static int cmp (void);
static off_t file_position (int);
static size_t block_compare (word const *, word const *);
static size_t block_compare_and_count (word const *, word const *, off_t *);
static void sprintc (char *, unsigned char);
52
/* Name under which this program was invoked (argv[0]).  */
char *program_name;

/* Filenames of the compared files.  A name of "-" stands for
   standard input.  */
static char const *file[2];

/* File descriptors of the files.  */
static int file_desc[2];

/* Status of the files, as filled in by fstat in main ().  */
static struct stat stat_buf[2];

/* Read buffers for the files.  Both live in a single allocation made
   by main (), which also reserves room for the sentinels that cmp ()
   stores just past the data it reads.  */
static word *buffer[2];

/* Optimal block size for the files; the number of bytes requested
   per read of each file.  */
static size_t buf_size;

/* Initial prefix to ignore for each file, in bytes.  */
static off_t ignore_initial[2];

/* Number of bytes to compare.  UINTMAX_MAX means "no limit": cmp ()
   does not count down the remaining bytes in that case.  */
static uintmax_t bytes = UINTMAX_MAX;

/* Output format.  type_first_diff must stay the zero value:
   specify_comparison_type () treats zero as "not explicitly chosen".  */
static enum comparison_type
  {
    type_first_diff,	/* Print the first difference.  */
    type_all_diffs,	/* Print all differences.  */
    type_status		/* Exit status only.  */
  } comparison_type;

/* If true, print values of bytes quoted like cat -t does.  */
static bool opt_print_bytes;
87
/* Values for long options that do not have single-letter equivalents.
   They start above CHAR_MAX so they cannot collide with any option
   character.  */
enum
{
  HELP_OPTION = CHAR_MAX + 1
};

/* Long options passed to getopt_long in main ().  The last field of
   each entry is the value getopt_long returns, which main () handles
   in its option switch; entries whose second field is 1 take an
   argument.  The all-zero entry terminates the table.  */
static struct option const long_options[] =
{
  {"print-bytes", 0, 0, 'b'},
  {"print-chars", 0, 0, 'c'}, /* obsolescent as of diffutils 2.7.3 */
  {"ignore-initial", 1, 0, 'i'},
  {"verbose", 0, 0, 'l'},
  {"bytes", 1, 0, 'n'},
  {"silent", 0, 0, 's'},
  {"quiet", 0, 0, 's'},
  {"version", 0, 0, 'v'},
  {"help", 0, 0, HELP_OPTION},
  {0, 0, 0, 0}
};
107
108static void try_help (char const *, char const *) __attribute__((noreturn));
109static void
110try_help (char const *reason_msgid, char const *operand)
111{
112  if (reason_msgid)
113    error (0, 0, _(reason_msgid), operand);
114  error (EXIT_TROUBLE, 0,
115	 _("Try `%s --help' for more information."), program_name);
116  abort ();
117}
118
/* Suffix characters handed to xstrtoumax when parsing the operands of
   --ignore-initial and --bytes (and the SKIP1/SKIP2 operands).  */
static char const valid_suffixes[] = "kKMGTPEZY0";
120
121/* Update ignore_initial[F] according to the result of parsing an
122   *operand ARGPTR of --ignore-initial, updating *ARGPTR to point
123   *after the operand.  If DELIMITER is nonzero, the operand may be
124   *followed by DELIMITER; otherwise it must be null-terminated.  */
125static void
126specify_ignore_initial (int f, char **argptr, char delimiter)
127{
128  uintmax_t val;
129  off_t o;
130  char const *arg = *argptr;
131  strtol_error e = xstrtoumax (arg, argptr, 0, &val, valid_suffixes);
132  if (! (e == LONGINT_OK
133	 || (e == LONGINT_INVALID_SUFFIX_CHAR && **argptr == delimiter))
134      || (o = val) < 0 || o != val || val == UINTMAX_MAX)
135    try_help ("invalid --ignore-initial value `%s'", arg);
136  if (ignore_initial[f] < o)
137    ignore_initial[f] = o;
138}
139
140/* Specify the output format.  */
141static void
142specify_comparison_type (enum comparison_type t)
143{
144  if (comparison_type && comparison_type != t)
145    try_help ("options -l and -s are incompatible", 0);
146  comparison_type = t;
147}
148
149static void
150check_stdout (void)
151{
152  if (ferror (stdout))
153    error (EXIT_TROUBLE, 0, "%s", _("write failed"));
154  else if (fclose (stdout) != 0)
155    error (EXIT_TROUBLE, errno, "%s", _("standard output"));
156}
157
/* Message ids of the per-option help lines, in the order usage ()
   prints them.  Each entry is translated when printed; the null
   pointer marks the end of the table.  */
static char const * const option_help_msgid[] = {
  N_("-b  --print-bytes  Print differing bytes."),
  N_("-i SKIP  --ignore-initial=SKIP  Skip the first SKIP bytes of input."),
  N_("-i SKIP1:SKIP2  --ignore-initial=SKIP1:SKIP2"),
  N_("  Skip the first SKIP1 bytes of FILE1 and the first SKIP2 bytes of FILE2."),
  N_("-l  --verbose  Output byte numbers and values of all differing bytes."),
  N_("-n LIMIT  --bytes=LIMIT  Compare at most LIMIT bytes."),
  N_("-s  --quiet  --silent  Output nothing; yield exit status only."),
  N_("-v  --version  Output version info."),
  N_("--help  Output this help."),
  0
};
170
171static void
172usage (void)
173{
174  char const * const *p;
175
176  printf (_("Usage: %s [OPTION]... FILE1 [FILE2 [SKIP1 [SKIP2]]]\n"),
177	  program_name);
178  printf ("%s\n\n", _("Compare two files byte by byte."));
179  for (p = option_help_msgid;  *p;  p++)
180    printf ("  %s\n", _(*p));
181  printf ("\n%s\n%s\n\n%s\n%s\n\n%s\n",
182	  _("SKIP1 and SKIP2 are the number of bytes to skip in each file."),
183	  _("SKIP values may be followed by the following multiplicative suffixes:\n\
184kB 1000, K 1024, MB 1,000,000, M 1,048,576,\n\
185GB 1,000,000,000, G 1,073,741,824, and so on for T, P, E, Z, Y."),
186	  _("If a FILE is `-' or missing, read standard input."),
187	  _("Exit status is 0 if inputs are the same, 1 if different, 2 if trouble."),
188	  _("Report bugs to <bug-gnu-utils@gnu.org>."));
189}
190
191int
192main (int argc, char **argv)
193{
194  int c, f, exit_status;
195  size_t words_per_buffer;
196
197  exit_failure = EXIT_TROUBLE;
198  initialize_main (&argc, &argv);
199  program_name = argv[0];
200  setlocale (LC_ALL, "");
201  bindtextdomain (PACKAGE, LOCALEDIR);
202  textdomain (PACKAGE);
203  c_stack_action (0);
204
205  /* Parse command line options.  */
206
207  while ((c = getopt_long (argc, argv, "bci:ln:sv", long_options, 0))
208	 != -1)
209    switch (c)
210      {
211      case 'b':
212      case 'c': /* 'c' is obsolescent as of diffutils 2.7.3 */
213	opt_print_bytes = true;
214	break;
215
216      case 'i':
217	specify_ignore_initial (0, &optarg, ':');
218	if (*optarg++ == ':')
219	  specify_ignore_initial (1, &optarg, 0);
220	else if (ignore_initial[1] < ignore_initial[0])
221	  ignore_initial[1] = ignore_initial[0];
222	break;
223
224      case 'l':
225	specify_comparison_type (type_all_diffs);
226	break;
227
228      case 'n':
229	{
230	  uintmax_t n;
231	  if (xstrtoumax (optarg, 0, 0, &n, valid_suffixes) != LONGINT_OK)
232	    try_help ("invalid --bytes value `%s'", optarg);
233	  if (n < bytes)
234	    bytes = n;
235	}
236	break;
237
238      case 's':
239	specify_comparison_type (type_status);
240	break;
241
242      case 'v':
243	/* TRANSLATORS: Please translate the second "o" in "Torbjorn
244	   Granlund" to an o-with-umlaut (U+00F6, LATIN SMALL LETTER O
245	   WITH DIAERESIS) if possible.  */
246	version_etc (stdout, "cmp", PACKAGE_NAME, PACKAGE_VERSION,
247		     _("Torbjorn Granlund"), "David MacKenzie", (char *) 0);
248	check_stdout ();
249	return EXIT_SUCCESS;
250
251      case HELP_OPTION:
252	usage ();
253	check_stdout ();
254	return EXIT_SUCCESS;
255
256      default:
257	try_help (0, 0);
258      }
259
260  if (optind == argc)
261    try_help ("missing operand after `%s'", argv[argc - 1]);
262
263  file[0] = argv[optind++];
264  file[1] = optind < argc ? argv[optind++] : "-";
265
266  for (f = 0; f < 2 && optind < argc; f++)
267    {
268      char *arg = argv[optind++];
269      specify_ignore_initial (f, &arg, 0);
270    }
271
272  if (optind < argc)
273    try_help ("extra operand `%s'", argv[optind]);
274
275  for (f = 0; f < 2; f++)
276    {
277      /* If file[1] is "-", treat it first; this avoids a misdiagnostic if
278	 stdin is closed and opening file[0] yields file descriptor 0.  */
279      int f1 = f ^ (strcmp (file[1], "-") == 0);
280
281      /* Two files with the same name and offset are identical.
282	 But wait until we open the file once, for proper diagnostics.  */
283      if (f && ignore_initial[0] == ignore_initial[1]
284	  && file_name_cmp (file[0], file[1]) == 0)
285	return EXIT_SUCCESS;
286
287      file_desc[f1] = (strcmp (file[f1], "-") == 0
288		       ? STDIN_FILENO
289		       : open (file[f1], O_RDONLY, 0));
290      if (file_desc[f1] < 0 || fstat (file_desc[f1], stat_buf + f1) != 0)
291	{
292	  if (file_desc[f1] < 0 && comparison_type == type_status)
293	    exit (EXIT_TROUBLE);
294	  else
295	    error (EXIT_TROUBLE, errno, "%s", file[f1]);
296	}
297
298      set_binary_mode (file_desc[f1], true);
299    }
300
301  /* If the files are links to the same inode and have the same file position,
302     they are identical.  */
303
304  if (0 < same_file (&stat_buf[0], &stat_buf[1])
305      && same_file_attributes (&stat_buf[0], &stat_buf[1])
306      && file_position (0) == file_position (1))
307    return EXIT_SUCCESS;
308
309  /* If output is redirected to the null device, we may assume `-s'.  */
310
311  if (comparison_type != type_status)
312    {
313      struct stat outstat, nullstat;
314
315      if (fstat (STDOUT_FILENO, &outstat) == 0
316	  && stat (NULL_DEVICE, &nullstat) == 0
317	  && 0 < same_file (&outstat, &nullstat))
318	comparison_type = type_status;
319    }
320
321  /* If only a return code is needed,
322     and if both input descriptors are associated with plain files,
323     conclude that the files differ if they have different sizes
324     and if more bytes will be compared than are in the smaller file.  */
325
326  if (comparison_type == type_status
327      && S_ISREG (stat_buf[0].st_mode)
328      && S_ISREG (stat_buf[1].st_mode))
329    {
330      off_t s0 = stat_buf[0].st_size - file_position (0);
331      off_t s1 = stat_buf[1].st_size - file_position (1);
332      if (s0 < 0)
333	s0 = 0;
334      if (s1 < 0)
335	s1 = 0;
336      if (s0 != s1 && MIN (s0, s1) < bytes)
337	exit (EXIT_FAILURE);
338    }
339
340  /* Get the optimal block size of the files.  */
341
342  buf_size = buffer_lcm (STAT_BLOCKSIZE (stat_buf[0]),
343			 STAT_BLOCKSIZE (stat_buf[1]),
344			 PTRDIFF_MAX - sizeof (word));
345
346  /* Allocate word-aligned buffers, with space for sentinels at the end.  */
347
348  words_per_buffer = (buf_size + 2 * sizeof (word) - 1) / sizeof (word);
349  buffer[0] = xmalloc (2 * sizeof (word) * words_per_buffer);
350  buffer[1] = buffer[0] + words_per_buffer;
351
352  exit_status = cmp ();
353
354  for (f = 0; f < 2; f++)
355    if (close (file_desc[f]) != 0)
356      error (EXIT_TROUBLE, errno, "%s", file[f]);
357  if (exit_status != 0  &&  comparison_type != type_status)
358    check_stdout ();
359  exit (exit_status);
360  return exit_status;
361}
362
363/* Compare the two files already open on `file_desc[0]' and `file_desc[1]',
364   using `buffer[0]' and `buffer[1]'.
365   Return EXIT_SUCCESS if identical, EXIT_FAILURE if different,
366   >1 if error.  */
367
368static int
369cmp (void)
370{
371  off_t line_number = 1;	/* Line number (1...) of difference. */
372  off_t byte_number = 1;	/* Byte number (1...) of difference. */
373  uintmax_t remaining = bytes;	/* Remaining number of bytes to compare.  */
374  size_t read0, read1;		/* Number of bytes read from each file. */
375  size_t first_diff;		/* Offset (0...) in buffers of 1st diff. */
376  size_t smaller;		/* The lesser of `read0' and `read1'. */
377  word *buffer0 = buffer[0];
378  word *buffer1 = buffer[1];
379  char *buf0 = (char *) buffer0;
380  char *buf1 = (char *) buffer1;
381  int ret = EXIT_SUCCESS;
382  int f;
383  int offset_width;
384
385  if (comparison_type == type_all_diffs)
386    {
387      off_t byte_number_max = MIN (bytes, TYPE_MAXIMUM (off_t));
388
389      for (f = 0; f < 2; f++)
390	if (S_ISREG (stat_buf[f].st_mode))
391	  {
392	    off_t file_bytes = stat_buf[f].st_size - file_position (f);
393	    if (file_bytes < byte_number_max)
394	      byte_number_max = file_bytes;
395	  }
396
397      for (offset_width = 1; (byte_number_max /= 10) != 0; offset_width++)
398	continue;
399    }
400
401  for (f = 0; f < 2; f++)
402    {
403      off_t ig = ignore_initial[f];
404      if (ig && file_position (f) == -1)
405	{
406	  /* lseek failed; read and discard the ignored initial prefix.  */
407	  do
408	    {
409	      size_t bytes_to_read = MIN (ig, buf_size);
410	      size_t r = block_read (file_desc[f], buf0, bytes_to_read);
411	      if (r != bytes_to_read)
412		{
413		  if (r == SIZE_MAX)
414		    error (EXIT_TROUBLE, errno, "%s", file[f]);
415		  break;
416		}
417	      ig -= r;
418	    }
419	  while (ig);
420	}
421    }
422
423  do
424    {
425      size_t bytes_to_read = buf_size;
426
427      if (remaining != UINTMAX_MAX)
428	{
429	  if (remaining < bytes_to_read)
430	    bytes_to_read = remaining;
431	  remaining -= bytes_to_read;
432	}
433
434      read0 = block_read (file_desc[0], buf0, bytes_to_read);
435      if (read0 == SIZE_MAX)
436	error (EXIT_TROUBLE, errno, "%s", file[0]);
437      read1 = block_read (file_desc[1], buf1, bytes_to_read);
438      if (read1 == SIZE_MAX)
439	error (EXIT_TROUBLE, errno, "%s", file[1]);
440
441      /* Insert sentinels for the block compare.  */
442
443      buf0[read0] = ~buf1[read0];
444      buf1[read1] = ~buf0[read1];
445
446      /* If the line number should be written for differing files,
447	 compare the blocks and count the number of newlines
448	 simultaneously.  */
449      first_diff = (comparison_type == type_first_diff
450		    ? block_compare_and_count (buffer0, buffer1, &line_number)
451		    : block_compare (buffer0, buffer1));
452
453      byte_number += first_diff;
454      smaller = MIN (read0, read1);
455
456      if (first_diff < smaller)
457	{
458	  switch (comparison_type)
459	    {
460	    case type_first_diff:
461	      {
462		char byte_buf[INT_BUFSIZE_BOUND (off_t)];
463		char line_buf[INT_BUFSIZE_BOUND (off_t)];
464		char const *byte_num = offtostr (byte_number, byte_buf);
465		char const *line_num = offtostr (line_number, line_buf);
466		if (!opt_print_bytes)
467		  {
468		    /* See POSIX 1003.1-2001 for this format.  This
469		       message is used only in the POSIX locale, so it
470		       need not be translated.  */
471		    static char const char_message[] =
472		      "%s %s differ: char %s, line %s\n";
473
474		    /* The POSIX rationale recommends using the word
475		       "byte" outside the POSIX locale.  Some gettext
476		       implementations translate even in the POSIX
477		       locale if certain other environment variables
478		       are set, so use "byte" if a translation is
479		       available, or if outside the POSIX locale.  */
480		    static char const byte_msgid[] =
481		      N_("%s %s differ: byte %s, line %s\n");
482		    char const *byte_message = _(byte_msgid);
483		    bool use_byte_message = (byte_message != byte_msgid
484					     || hard_locale_LC_MESSAGES);
485
486		    printf (use_byte_message ? byte_message : char_message,
487			    file[0], file[1], byte_num, line_num);
488		  }
489		else
490		  {
491		    unsigned char c0 = buf0[first_diff];
492		    unsigned char c1 = buf1[first_diff];
493		    char s0[5];
494		    char s1[5];
495		    sprintc (s0, c0);
496		    sprintc (s1, c1);
497		    printf (_("%s %s differ: byte %s, line %s is %3o %s %3o %s\n"),
498			    file[0], file[1], byte_num, line_num,
499			    c0, s0, c1, s1);
500		}
501	      }
502	      /* Fall through.  */
503	    case type_status:
504	      return EXIT_FAILURE;
505
506	    case type_all_diffs:
507	      do
508		{
509		  unsigned char c0 = buf0[first_diff];
510		  unsigned char c1 = buf1[first_diff];
511		  if (c0 != c1)
512		    {
513		      char byte_buf[INT_BUFSIZE_BOUND (off_t)];
514		      char const *byte_num = offtostr (byte_number, byte_buf);
515		      if (!opt_print_bytes)
516			{
517			  /* See POSIX 1003.1-2001 for this format.  */
518			  printf ("%*s %3o %3o\n",
519				  offset_width, byte_num, c0, c1);
520			}
521		      else
522			{
523			  char s0[5];
524			  char s1[5];
525			  sprintc (s0, c0);
526			  sprintc (s1, c1);
527			  printf ("%*s %3o %-4s %3o %s\n",
528				  offset_width, byte_num, c0, s0, c1, s1);
529			}
530		    }
531		  byte_number++;
532		  first_diff++;
533		}
534	      while (first_diff < smaller);
535	      ret = EXIT_FAILURE;
536	      break;
537	    }
538	}
539
540      if (read0 != read1)
541	{
542	  if (comparison_type != type_status)
543	    {
544	      /* See POSIX 1003.1-2001 for this format.  */
545	      fprintf (stderr, _("cmp: EOF on %s\n"), file[read1 < read0]);
546	    }
547
548	  return EXIT_FAILURE;
549	}
550    }
551  while (read0 == buf_size);
552
553  return ret;
554}
555
556/* Compare two blocks of memory P0 and P1 until they differ,
557   and count the number of '\n' occurrences in the common
558   part of P0 and P1.
559   If the blocks are not guaranteed to be different, put sentinels at the ends
560   of the blocks before calling this function.
561
562   Return the offset of the first byte that differs.
563   Increment *COUNT by the count of '\n' occurrences.  */
564
565static size_t
566block_compare_and_count (word const *p0, word const *p1, off_t *count)
567{
568  word l;		/* One word from first buffer. */
569  word const *l0, *l1;	/* Pointers into each buffer. */
570  char const *c0, *c1;	/* Pointers for finding exact address. */
571  size_t cnt = 0;	/* Number of '\n' occurrences. */
572  word nnnn;		/* Newline, sizeof (word) times.  */
573  int i;
574
575  nnnn = 0;
576  for (i = 0; i < sizeof nnnn; i++)
577    nnnn = (nnnn << CHAR_BIT) | '\n';
578
579  /* Find the rough position of the first difference by reading words,
580     not bytes.  */
581
582  for (l0 = p0, l1 = p1;  (l = *l0) == *l1;  l0++, l1++)
583    {
584      l ^= nnnn;
585      for (i = 0; i < sizeof l; i++)
586	{
587	  unsigned char uc = l;
588	  cnt += ! uc;
589	  l >>= CHAR_BIT;
590	}
591    }
592
593  /* Find the exact differing position (endianness independent).  */
594
595  for (c0 = (char const *) l0, c1 = (char const *) l1;
596       *c0 == *c1;
597       c0++, c1++)
598    cnt += *c0 == '\n';
599
600  *count += cnt;
601  return c0 - (char const *) p0;
602}
603
604/* Compare two blocks of memory P0 and P1 until they differ.
605   If the blocks are not guaranteed to be different, put sentinels at the ends
606   of the blocks before calling this function.
607
608   Return the offset of the first byte that differs.  */
609
610static size_t
611block_compare (word const *p0, word const *p1)
612{
613  word const *l0, *l1;
614  char const *c0, *c1;
615
616  /* Find the rough position of the first difference by reading words,
617     not bytes.  */
618
619  for (l0 = p0, l1 = p1;  *l0 == *l1;  l0++, l1++)
620    continue;
621
622  /* Find the exact differing position (endianness independent).  */
623
624  for (c0 = (char const *) l0, c1 = (char const *) l1;
625       *c0 == *c1;
626       c0++, c1++)
627    continue;
628
629  return c0 - (char const *) p0;
630}
631
/* Store in BUF a null-terminated, visible rendering of the byte C,
   quoting unprintable bytes the way cat -t does: "M-" marks a byte
   of 128 or more, and "^X" marks a control character or DEL.
   BUF must have room for 5 bytes (e.g. "M-^?" plus the terminator).  */

static void
sprintc (char *buf, unsigned char c)
{
  char *out = buf;
  unsigned char shown = c;

  if (! isprint (c))
    {
      if (128 <= c)
	{
	  out[0] = 'M';
	  out[1] = '-';
	  out += 2;
	}

      /* Drop the high bit; what remains decides the rest.  */
      shown = c & 127;

      if (shown < 32)
	{
	  *out++ = '^';
	  shown += 64;
	}
      else if (shown == 127)
	{
	  *out++ = '^';
	  shown = '?';
	}
    }

  *out++ = shown;
  *out = '\0';
}
661
662/* Position file F to ignore_initial[F] bytes from its initial position,
663   and yield its new position.  Don't try more than once.  */
664
665static off_t
666file_position (int f)
667{
668  static bool positioned[2];
669  static off_t position[2];
670
671  if (! positioned[f])
672    {
673      positioned[f] = true;
674      position[f] = lseek (file_desc[f], ignore_initial[f], SEEK_CUR);
675    }
676  return position[f];
677}
678