1170754Sdelphij/* File I/O for GNU DIFF. 2170754Sdelphij 3170754Sdelphij Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002, 4170754Sdelphij 2004 Free Software Foundation, Inc. 5170754Sdelphij 6170754Sdelphij This file is part of GNU DIFF. 7170754Sdelphij 8170754Sdelphij GNU DIFF is free software; you can redistribute it and/or modify 9170754Sdelphij it under the terms of the GNU General Public License as published by 10170754Sdelphij the Free Software Foundation; either version 2, or (at your option) 11170754Sdelphij any later version. 12170754Sdelphij 13170754Sdelphij GNU DIFF is distributed in the hope that it will be useful, 14170754Sdelphij but WITHOUT ANY WARRANTY; without even the implied warranty of 15170754Sdelphij MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16170754Sdelphij GNU General Public License for more details. 17170754Sdelphij 18170754Sdelphij You should have received a copy of the GNU General Public License 19170754Sdelphij along with this program; see the file COPYING. 20170754Sdelphij If not, write to the Free Software Foundation, 21170754Sdelphij 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 22170754Sdelphij 23170754Sdelphij#include "diff.h" 24170754Sdelphij#include <cmpbuf.h> 25170754Sdelphij#include <file-type.h> 26170754Sdelphij#include <setmode.h> 27170754Sdelphij#include <xalloc.h> 28170754Sdelphij 29170754Sdelphij/* Rotate an unsigned value to the left. */ 30170754Sdelphij#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 31170754Sdelphij 32170754Sdelphij/* Given a hash value and a new character, return a new hash value. */ 33170754Sdelphij#define HASH(h, c) ((c) + ROL (h, 7)) 34170754Sdelphij 35170754Sdelphij/* The type of a hash value. */ 36170754Sdelphijtypedef size_t hash_value; 37170754Sdelphijverify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value)); 38170754Sdelphij 39170754Sdelphij/* Lines are put into equivalence classes of lines that match in lines_differ. 40170754Sdelphij Each equivalence class is represented by one of these structures, 41170754Sdelphij but only while the classes are being computed. 42170754Sdelphij Afterward, each class is represented by a number. */ 43170754Sdelphijstruct equivclass 44170754Sdelphij{ 45170754Sdelphij lin next; /* Next item in this bucket. */ 46170754Sdelphij hash_value hash; /* Hash of lines in this class. */ 47170754Sdelphij char const *line; /* A line that fits this class. */ 48170754Sdelphij size_t length; /* That line's length, not counting its newline. */ 49170754Sdelphij}; 50170754Sdelphij 51170754Sdelphij/* Hash-table: array of buckets, each being a chain of equivalence classes. 52170754Sdelphij buckets[-1] is reserved for incomplete lines. */ 53170754Sdelphijstatic lin *buckets; 54170754Sdelphij 55170754Sdelphij/* Number of buckets in the hash table array, not counting buckets[-1]. */ 56170754Sdelphijstatic size_t nbuckets; 57170754Sdelphij 58170754Sdelphij/* Array in which the equivalence classes are allocated. 59170754Sdelphij The bucket-chains go through the elements in this array. 60170754Sdelphij The number of an equivalence class is its index in this array. */ 61170754Sdelphijstatic struct equivclass *equivs; 62170754Sdelphij 63170754Sdelphij/* Index of first free element in the array `equivs'. */ 64170754Sdelphijstatic lin equivs_index; 65170754Sdelphij 66170754Sdelphij/* Number of elements allocated in the array `equivs'. */ 67170754Sdelphijstatic lin equivs_alloc; 68170754Sdelphij 69170754Sdelphij/* Read a block of data into a file buffer, checking for EOF and error. */ 70170754Sdelphij 71170754Sdelphijvoid 72170754Sdelphijfile_block_read (struct file_data *current, size_t size) 73170754Sdelphij{ 74170754Sdelphij if (size && ! current->eof) 75170754Sdelphij { 76170754Sdelphij size_t s = block_read (current->desc, 77170754Sdelphij FILE_BUFFER (current) + current->buffered, size); 78170754Sdelphij if (s == SIZE_MAX) 79170754Sdelphij pfatal_with_name (current->name); 80170754Sdelphij current->buffered += s; 81170754Sdelphij current->eof = s < size; 82170754Sdelphij } 83170754Sdelphij} 84170754Sdelphij 85170754Sdelphij/* Check for binary files and compare them for exact identity. */ 86170754Sdelphij 87170754Sdelphij/* Return 1 if BUF contains a non text character. 88170754Sdelphij SIZE is the number of characters in BUF. */ 89170754Sdelphij 90170754Sdelphij#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 91170754Sdelphij 92170754Sdelphij/* Get ready to read the current file. 93170754Sdelphij Return nonzero if SKIP_TEST is zero, 94170754Sdelphij and if it appears to be a binary file. */ 95170754Sdelphij 96170754Sdelphijstatic bool 97170754Sdelphijsip (struct file_data *current, bool skip_test) 98170754Sdelphij{ 99170754Sdelphij /* If we have a nonexistent file at this stage, treat it as empty. */ 100170754Sdelphij if (current->desc < 0) 101170754Sdelphij { 102170754Sdelphij /* Leave room for a sentinel. */ 103170754Sdelphij current->bufsize = sizeof (word); 104170754Sdelphij current->buffer = xmalloc (current->bufsize); 105170754Sdelphij } 106170754Sdelphij else 107170754Sdelphij { 108170754Sdelphij current->bufsize = buffer_lcm (sizeof (word), 109170754Sdelphij STAT_BLOCKSIZE (current->stat), 110170754Sdelphij PTRDIFF_MAX - 2 * sizeof (word)); 111170754Sdelphij current->buffer = xmalloc (current->bufsize); 112170754Sdelphij 113170754Sdelphij if (! skip_test) 114170754Sdelphij { 115170754Sdelphij /* Check first part of file to see if it's a binary file. */ 116170754Sdelphij 117170754Sdelphij bool was_binary = set_binary_mode (current->desc, true); 118170754Sdelphij off_t buffered; 119170754Sdelphij file_block_read (current, current->bufsize); 120170754Sdelphij buffered = current->buffered; 121170754Sdelphij 122170754Sdelphij if (! was_binary) 123170754Sdelphij { 124170754Sdelphij /* Revert to text mode and seek back to the beginning to 125170754Sdelphij reread the file. Use relative seek, since file 126170754Sdelphij descriptors like stdin might not start at offset 127170754Sdelphij zero. */ 128170754Sdelphij 129170754Sdelphij if (lseek (current->desc, - buffered, SEEK_CUR) == -1) 130170754Sdelphij pfatal_with_name (current->name); 131170754Sdelphij set_binary_mode (current->desc, false); 132170754Sdelphij current->buffered = 0; 133170754Sdelphij current->eof = false; 134170754Sdelphij } 135170754Sdelphij 136170754Sdelphij return binary_file_p (current->buffer, buffered); 137170754Sdelphij } 138170754Sdelphij } 139170754Sdelphij 140170754Sdelphij current->buffered = 0; 141170754Sdelphij current->eof = false; 142170754Sdelphij return false; 143170754Sdelphij} 144170754Sdelphij 145170754Sdelphij/* Slurp the rest of the current file completely into memory. */ 146170754Sdelphij 147170754Sdelphijstatic void 148170754Sdelphijslurp (struct file_data *current) 149170754Sdelphij{ 150170754Sdelphij size_t cc; 151170754Sdelphij 152170754Sdelphij if (current->desc < 0) 153170754Sdelphij { 154170754Sdelphij /* The file is nonexistent. */ 155170754Sdelphij return; 156170754Sdelphij } 157170754Sdelphij 158170754Sdelphij if (S_ISREG (current->stat.st_mode)) 159170754Sdelphij { 160170754Sdelphij /* It's a regular file; slurp in the rest all at once. */ 161170754Sdelphij 162170754Sdelphij /* Get the size out of the stat block. 163170754Sdelphij Allocate just enough room for appended newline plus word sentinel, 164170754Sdelphij plus word-alignment since we want the buffer word-aligned. */ 165170754Sdelphij size_t file_size = current->stat.st_size; 166170754Sdelphij cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 167170754Sdelphij if (file_size != current->stat.st_size || cc < file_size 168170754Sdelphij || PTRDIFF_MAX <= cc) 169170754Sdelphij xalloc_die (); 170170754Sdelphij 171170754Sdelphij if (current->bufsize < cc) 172170754Sdelphij { 173170754Sdelphij current->bufsize = cc; 174170754Sdelphij current->buffer = xrealloc (current->buffer, cc); 175170754Sdelphij } 176170754Sdelphij 177170754Sdelphij /* Try to read at least 1 more byte than the size indicates, to 178170754Sdelphij detect whether the file is growing. This is a nicety for 179170754Sdelphij users who run 'diff' on files while they are changing. */ 180170754Sdelphij 181170754Sdelphij if (current->buffered <= file_size) 182170754Sdelphij { 183170754Sdelphij file_block_read (current, file_size + 1 - current->buffered); 184170754Sdelphij if (current->buffered <= file_size) 185170754Sdelphij return; 186170754Sdelphij } 187170754Sdelphij } 188170754Sdelphij 189170754Sdelphij /* It's not a regular file, or it's a growing regular file; read it, 190170754Sdelphij growing the buffer as needed. */ 191170754Sdelphij 192170754Sdelphij file_block_read (current, current->bufsize - current->buffered); 193170754Sdelphij 194170754Sdelphij if (current->buffered) 195170754Sdelphij { 196170754Sdelphij while (current->buffered == current->bufsize) 197170754Sdelphij { 198170754Sdelphij if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 199170754Sdelphij xalloc_die (); 200170754Sdelphij current->bufsize *= 2; 201170754Sdelphij current->buffer = xrealloc (current->buffer, current->bufsize); 202170754Sdelphij file_block_read (current, current->bufsize - current->buffered); 203170754Sdelphij } 204170754Sdelphij 205170754Sdelphij /* Allocate just enough room for appended newline plus word 206170754Sdelphij sentinel, plus word-alignment. */ 207170754Sdelphij cc = current->buffered + 2 * sizeof (word); 208170754Sdelphij current->bufsize = cc - cc % sizeof (word); 209170754Sdelphij current->buffer = xrealloc (current->buffer, current->bufsize); 210170754Sdelphij } 211170754Sdelphij} 212170754Sdelphij 213170754Sdelphij/* Split the file into lines, simultaneously computing the equivalence 214170754Sdelphij class for each line. */ 215170754Sdelphij 216170754Sdelphijstatic void 217170754Sdelphijfind_and_hash_each_line (struct file_data *current) 218170754Sdelphij{ 219170754Sdelphij hash_value h; 220170754Sdelphij char const *p = current->prefix_end; 221170754Sdelphij unsigned char c; 222170754Sdelphij lin i, *bucket; 223170754Sdelphij size_t length; 224170754Sdelphij 225170754Sdelphij /* Cache often-used quantities in local variables to help the compiler. */ 226170754Sdelphij char const **linbuf = current->linbuf; 227170754Sdelphij lin alloc_lines = current->alloc_lines; 228170754Sdelphij lin line = 0; 229170754Sdelphij lin linbuf_base = current->linbuf_base; 230170754Sdelphij lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 231170754Sdelphij struct equivclass *eqs = equivs; 232170754Sdelphij lin eqs_index = equivs_index; 233170754Sdelphij lin eqs_alloc = equivs_alloc; 234170754Sdelphij char const *suffix_begin = current->suffix_begin; 235170754Sdelphij char const *bufend = FILE_BUFFER (current) + current->buffered; 236170754Sdelphij bool diff_length_compare_anyway = 237170754Sdelphij ignore_white_space != IGNORE_NO_WHITE_SPACE; 238170754Sdelphij bool same_length_diff_contents_compare_anyway = 239170754Sdelphij diff_length_compare_anyway | ignore_case; 240170754Sdelphij 241170754Sdelphij while (p < suffix_begin) 242170754Sdelphij { 243170754Sdelphij char const *ip = p; 244170754Sdelphij 245170754Sdelphij h = 0; 246170754Sdelphij 247170754Sdelphij /* Hash this line until we find a newline. */ 248170754Sdelphij if (ignore_case) 249170754Sdelphij switch (ignore_white_space) 250170754Sdelphij { 251170754Sdelphij case IGNORE_ALL_SPACE: 252170754Sdelphij while ((c = *p++) != '\n') 253170754Sdelphij if (! isspace (c)) 254170754Sdelphij h = HASH (h, tolower (c)); 255170754Sdelphij break; 256170754Sdelphij 257170754Sdelphij case IGNORE_SPACE_CHANGE: 258170754Sdelphij while ((c = *p++) != '\n') 259170754Sdelphij { 260170754Sdelphij if (isspace (c)) 261170754Sdelphij { 262170754Sdelphij do 263170754Sdelphij if ((c = *p++) == '\n') 264170754Sdelphij goto hashing_done; 265170754Sdelphij while (isspace (c)); 266170754Sdelphij 267170754Sdelphij h = HASH (h, ' '); 268170754Sdelphij } 269170754Sdelphij 270170754Sdelphij /* C is now the first non-space. */ 271170754Sdelphij h = HASH (h, tolower (c)); 272170754Sdelphij } 273170754Sdelphij break; 274170754Sdelphij 275170754Sdelphij case IGNORE_TAB_EXPANSION: 276170754Sdelphij { 277170754Sdelphij size_t column = 0; 278170754Sdelphij while ((c = *p++) != '\n') 279170754Sdelphij { 280170754Sdelphij size_t repetitions = 1; 281170754Sdelphij 282170754Sdelphij switch (c) 283170754Sdelphij { 284170754Sdelphij case '\b': 285170754Sdelphij column -= 0 < column; 286170754Sdelphij break; 287170754Sdelphij 288170754Sdelphij case '\t': 289170754Sdelphij c = ' '; 290170754Sdelphij repetitions = tabsize - column % tabsize; 291170754Sdelphij column = (column + repetitions < column 292170754Sdelphij ? 0 293170754Sdelphij : column + repetitions); 294170754Sdelphij break; 295170754Sdelphij 296170754Sdelphij case '\r': 297170754Sdelphij column = 0; 298170754Sdelphij break; 299170754Sdelphij 300170754Sdelphij default: 301170754Sdelphij c = tolower (c); 302170754Sdelphij column++; 303170754Sdelphij break; 304170754Sdelphij } 305170754Sdelphij 306170754Sdelphij do 307170754Sdelphij h = HASH (h, c); 308170754Sdelphij while (--repetitions != 0); 309170754Sdelphij } 310170754Sdelphij } 311170754Sdelphij break; 312170754Sdelphij 313170754Sdelphij default: 314170754Sdelphij while ((c = *p++) != '\n') 315170754Sdelphij h = HASH (h, tolower (c)); 316170754Sdelphij break; 317170754Sdelphij } 318170754Sdelphij else 319170754Sdelphij switch (ignore_white_space) 320170754Sdelphij { 321170754Sdelphij case IGNORE_ALL_SPACE: 322170754Sdelphij while ((c = *p++) != '\n') 323170754Sdelphij if (! isspace (c)) 324170754Sdelphij h = HASH (h, c); 325170754Sdelphij break; 326170754Sdelphij 327170754Sdelphij case IGNORE_SPACE_CHANGE: 328170754Sdelphij while ((c = *p++) != '\n') 329170754Sdelphij { 330170754Sdelphij if (isspace (c)) 331170754Sdelphij { 332170754Sdelphij do 333170754Sdelphij if ((c = *p++) == '\n') 334170754Sdelphij goto hashing_done; 335170754Sdelphij while (isspace (c)); 336170754Sdelphij 337170754Sdelphij h = HASH (h, ' '); 338170754Sdelphij } 339170754Sdelphij 340170754Sdelphij /* C is now the first non-space. */ 341170754Sdelphij h = HASH (h, c); 342170754Sdelphij } 343170754Sdelphij break; 344170754Sdelphij 345170754Sdelphij case IGNORE_TAB_EXPANSION: 346170754Sdelphij { 347170754Sdelphij size_t column = 0; 348170754Sdelphij while ((c = *p++) != '\n') 349170754Sdelphij { 350170754Sdelphij size_t repetitions = 1; 351170754Sdelphij 352170754Sdelphij switch (c) 353170754Sdelphij { 354170754Sdelphij case '\b': 355170754Sdelphij column -= 0 < column; 356170754Sdelphij break; 357170754Sdelphij 358170754Sdelphij case '\t': 359170754Sdelphij c = ' '; 360170754Sdelphij repetitions = tabsize - column % tabsize; 361170754Sdelphij column = (column + repetitions < column 362170754Sdelphij ? 0 363170754Sdelphij : column + repetitions); 364170754Sdelphij break; 365170754Sdelphij 366170754Sdelphij case '\r': 367170754Sdelphij column = 0; 368170754Sdelphij break; 369170754Sdelphij 370170754Sdelphij default: 371170754Sdelphij column++; 372170754Sdelphij break; 373170754Sdelphij } 374170754Sdelphij 375170754Sdelphij do 376170754Sdelphij h = HASH (h, c); 377170754Sdelphij while (--repetitions != 0); 378170754Sdelphij } 379170754Sdelphij } 380170754Sdelphij break; 381170754Sdelphij 382170754Sdelphij default: 383170754Sdelphij while ((c = *p++) != '\n') 384170754Sdelphij h = HASH (h, c); 385170754Sdelphij break; 386170754Sdelphij } 387170754Sdelphij 388170754Sdelphij hashing_done:; 389170754Sdelphij 390170754Sdelphij bucket = &buckets[h % nbuckets]; 391170754Sdelphij length = p - ip - 1; 392170754Sdelphij 393170754Sdelphij if (p == bufend 394170754Sdelphij && current->missing_newline 395170754Sdelphij && ROBUST_OUTPUT_STYLE (output_style)) 396170754Sdelphij { 397170754Sdelphij /* This line is incomplete. If this is significant, 398170754Sdelphij put the line into buckets[-1]. */ 399170754Sdelphij if (ignore_white_space < IGNORE_SPACE_CHANGE) 400170754Sdelphij bucket = &buckets[-1]; 401170754Sdelphij 402170754Sdelphij /* Omit the inserted newline when computing linbuf later. */ 403170754Sdelphij p--; 404170754Sdelphij bufend = suffix_begin = p; 405170754Sdelphij } 406170754Sdelphij 407170754Sdelphij for (i = *bucket; ; i = eqs[i].next) 408170754Sdelphij if (!i) 409170754Sdelphij { 410170754Sdelphij /* Create a new equivalence class in this bucket. */ 411170754Sdelphij i = eqs_index++; 412170754Sdelphij if (i == eqs_alloc) 413170754Sdelphij { 414170754Sdelphij if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 415170754Sdelphij xalloc_die (); 416170754Sdelphij eqs_alloc *= 2; 417170754Sdelphij eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 418170754Sdelphij } 419170754Sdelphij eqs[i].next = *bucket; 420170754Sdelphij eqs[i].hash = h; 421170754Sdelphij eqs[i].line = ip; 422170754Sdelphij eqs[i].length = length; 423170754Sdelphij *bucket = i; 424170754Sdelphij break; 425170754Sdelphij } 426170754Sdelphij else if (eqs[i].hash == h) 427170754Sdelphij { 428170754Sdelphij char const *eqline = eqs[i].line; 429170754Sdelphij 430170754Sdelphij /* Reuse existing class if lines_differ reports the lines 431170754Sdelphij equal. */ 432170754Sdelphij if (eqs[i].length == length) 433170754Sdelphij { 434170754Sdelphij /* Reuse existing equivalence class if the lines are identical. 435170754Sdelphij This detects the common case of exact identity 436170754Sdelphij faster than lines_differ would. */ 437170754Sdelphij if (memcmp (eqline, ip, length) == 0) 438170754Sdelphij break; 439170754Sdelphij if (!same_length_diff_contents_compare_anyway) 440170754Sdelphij continue; 441170754Sdelphij } 442170754Sdelphij else if (!diff_length_compare_anyway) 443170754Sdelphij continue; 444170754Sdelphij 445170754Sdelphij if (! lines_differ (eqline, ip)) 446170754Sdelphij break; 447170754Sdelphij } 448170754Sdelphij 449170754Sdelphij /* Maybe increase the size of the line table. */ 450170754Sdelphij if (line == alloc_lines) 451170754Sdelphij { 452170754Sdelphij /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 453170754Sdelphij if (PTRDIFF_MAX / 3 <= alloc_lines 454170754Sdelphij || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 455170754Sdelphij || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 456170754Sdelphij xalloc_die (); 457170754Sdelphij alloc_lines = 2 * alloc_lines - linbuf_base; 458170754Sdelphij cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 459170754Sdelphij linbuf += linbuf_base; 460170754Sdelphij linbuf = xrealloc (linbuf, 461170754Sdelphij (alloc_lines - linbuf_base) * sizeof *linbuf); 462170754Sdelphij linbuf -= linbuf_base; 463170754Sdelphij } 464170754Sdelphij linbuf[line] = ip; 465170754Sdelphij cureqs[line] = i; 466170754Sdelphij ++line; 467170754Sdelphij } 468170754Sdelphij 469170754Sdelphij current->buffered_lines = line; 470170754Sdelphij 471170754Sdelphij for (i = 0; ; i++) 472170754Sdelphij { 473170754Sdelphij /* Record the line start for lines in the suffix that we care about. 474170754Sdelphij Record one more line start than lines, 475170754Sdelphij so that we can compute the length of any buffered line. */ 476170754Sdelphij if (line == alloc_lines) 477170754Sdelphij { 478170754Sdelphij /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 479170754Sdelphij if (PTRDIFF_MAX / 3 <= alloc_lines 480170754Sdelphij || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 481170754Sdelphij || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 482170754Sdelphij xalloc_die (); 483170754Sdelphij alloc_lines = 2 * alloc_lines - linbuf_base; 484170754Sdelphij linbuf += linbuf_base; 485170754Sdelphij linbuf = xrealloc (linbuf, 486170754Sdelphij (alloc_lines - linbuf_base) * sizeof *linbuf); 487170754Sdelphij linbuf -= linbuf_base; 488170754Sdelphij } 489170754Sdelphij linbuf[line] = p; 490170754Sdelphij 491170754Sdelphij if (p == bufend) 492170754Sdelphij break; 493170754Sdelphij 494170754Sdelphij if (context <= i && no_diff_means_no_output) 495170754Sdelphij break; 496170754Sdelphij 497170754Sdelphij line++; 498170754Sdelphij 499170754Sdelphij while (*p++ != '\n') 500170754Sdelphij continue; 501170754Sdelphij } 502170754Sdelphij 503170754Sdelphij /* Done with cache in local variables. */ 504170754Sdelphij current->linbuf = linbuf; 505170754Sdelphij current->valid_lines = line; 506170754Sdelphij current->alloc_lines = alloc_lines; 507170754Sdelphij current->equivs = cureqs; 508170754Sdelphij equivs = eqs; 509170754Sdelphij equivs_alloc = eqs_alloc; 510170754Sdelphij equivs_index = eqs_index; 511170754Sdelphij} 512170754Sdelphij 513170754Sdelphij/* Prepare the text. Make sure the text end is initialized. 514170754Sdelphij Make sure text ends in a newline, 515170754Sdelphij but remember that we had to add one. 516170754Sdelphij Strip trailing CRs, if that was requested. */ 517170754Sdelphij 518170754Sdelphijstatic void 519170754Sdelphijprepare_text (struct file_data *current) 520170754Sdelphij{ 521170754Sdelphij size_t buffered = current->buffered; 522170754Sdelphij char *p = FILE_BUFFER (current); 523170754Sdelphij char *dst; 524170754Sdelphij 525170754Sdelphij if (buffered == 0 || p[buffered - 1] == '\n') 526170754Sdelphij current->missing_newline = false; 527170754Sdelphij else 528170754Sdelphij { 529170754Sdelphij p[buffered++] = '\n'; 530170754Sdelphij current->missing_newline = true; 531170754Sdelphij } 532170754Sdelphij 533170754Sdelphij if (!p) 534170754Sdelphij return; 535170754Sdelphij 536170754Sdelphij /* Don't use uninitialized storage when planting or using sentinels. */ 537170754Sdelphij memset (p + buffered, 0, sizeof (word)); 538170754Sdelphij 539170754Sdelphij if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 540170754Sdelphij { 541170754Sdelphij char const *src = dst; 542170754Sdelphij char const *srclim = p + buffered; 543170754Sdelphij 544170754Sdelphij do 545170754Sdelphij dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 546170754Sdelphij while (src < srclim); 547170754Sdelphij 548170754Sdelphij buffered -= src - dst; 549170754Sdelphij } 550170754Sdelphij 551170754Sdelphij current->buffered = buffered; 552170754Sdelphij} 553170754Sdelphij 554170754Sdelphij/* We have found N lines in a buffer of size S; guess the 555170754Sdelphij proportionate number of lines that will be found in a buffer of 556170754Sdelphij size T. However, do not guess a number of lines so large that the 557170754Sdelphij resulting line table might cause overflow in size calculations. */ 558170754Sdelphijstatic lin 559170754Sdelphijguess_lines (lin n, size_t s, size_t t) 560170754Sdelphij{ 561170754Sdelphij size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 562170754Sdelphij lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 563170754Sdelphij return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 564170754Sdelphij} 565170754Sdelphij 566170754Sdelphij/* Given a vector of two file_data objects, find the identical 567170754Sdelphij prefixes and suffixes of each object. */ 568170754Sdelphij 569170754Sdelphijstatic void 570170754Sdelphijfind_identical_ends (struct file_data filevec[]) 571170754Sdelphij{ 572170754Sdelphij word *w0, *w1; 573170754Sdelphij char *p0, *p1, *buffer0, *buffer1; 574170754Sdelphij char const *end0, *beg0; 575170754Sdelphij char const **linbuf0, **linbuf1; 576170754Sdelphij lin i, lines; 577170754Sdelphij size_t n0, n1; 578170754Sdelphij lin alloc_lines0, alloc_lines1; 579170754Sdelphij lin buffered_prefix, prefix_count, prefix_mask; 580170754Sdelphij lin middle_guess, suffix_guess; 581170754Sdelphij 582170754Sdelphij slurp (&filevec[0]); 583170754Sdelphij prepare_text (&filevec[0]); 584170754Sdelphij if (filevec[0].desc != filevec[1].desc) 585170754Sdelphij { 586170754Sdelphij slurp (&filevec[1]); 587170754Sdelphij prepare_text (&filevec[1]); 588170754Sdelphij } 589170754Sdelphij else 590170754Sdelphij { 591170754Sdelphij filevec[1].buffer = filevec[0].buffer; 592170754Sdelphij filevec[1].bufsize = filevec[0].bufsize; 593170754Sdelphij filevec[1].buffered = filevec[0].buffered; 594170754Sdelphij filevec[1].missing_newline = filevec[0].missing_newline; 595170754Sdelphij } 596170754Sdelphij 597170754Sdelphij /* Find identical prefix. */ 598170754Sdelphij 599170754Sdelphij w0 = filevec[0].buffer; 600170754Sdelphij w1 = filevec[1].buffer; 601170754Sdelphij p0 = buffer0 = (char *) w0; 602170754Sdelphij p1 = buffer1 = (char *) w1; 603170754Sdelphij n0 = filevec[0].buffered; 604170754Sdelphij n1 = filevec[1].buffered; 605170754Sdelphij 606170754Sdelphij if (p0 == p1) 607170754Sdelphij /* The buffers are the same; sentinels won't work. */ 608170754Sdelphij p0 = p1 += n1; 609170754Sdelphij else 610170754Sdelphij { 611170754Sdelphij /* Insert end sentinels, in this case characters that are guaranteed 612170754Sdelphij to make the equality test false, and thus terminate the loop. */ 613170754Sdelphij 614170754Sdelphij if (n0 < n1) 615170754Sdelphij p0[n0] = ~p1[n0]; 616170754Sdelphij else 617170754Sdelphij p1[n1] = ~p0[n1]; 618170754Sdelphij 619170754Sdelphij /* Loop until first mismatch, or to the sentinel characters. */ 620170754Sdelphij 621170754Sdelphij /* Compare a word at a time for speed. */ 622170754Sdelphij while (*w0 == *w1) 623170754Sdelphij w0++, w1++; 624170754Sdelphij 625170754Sdelphij /* Do the last few bytes of comparison a byte at a time. */ 626170754Sdelphij p0 = (char *) w0; 627170754Sdelphij p1 = (char *) w1; 628170754Sdelphij while (*p0 == *p1) 629170754Sdelphij p0++, p1++; 630170754Sdelphij 631170754Sdelphij /* Don't mistakenly count missing newline as part of prefix. */ 632170754Sdelphij if (ROBUST_OUTPUT_STYLE (output_style) 633170754Sdelphij && ((buffer0 + n0 - filevec[0].missing_newline < p0) 634170754Sdelphij != 635170754Sdelphij (buffer1 + n1 - filevec[1].missing_newline < p1))) 636170754Sdelphij p0--, p1--; 637170754Sdelphij } 638170754Sdelphij 639170754Sdelphij /* Now P0 and P1 point at the first nonmatching characters. */ 640170754Sdelphij 641170754Sdelphij /* Skip back to last line-beginning in the prefix, 642170754Sdelphij and then discard up to HORIZON_LINES lines from the prefix. */ 643170754Sdelphij i = horizon_lines; 644170754Sdelphij while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 645170754Sdelphij p0--, p1--; 646170754Sdelphij 647170754Sdelphij /* Record the prefix. */ 648170754Sdelphij filevec[0].prefix_end = p0; 649170754Sdelphij filevec[1].prefix_end = p1; 650170754Sdelphij 651170754Sdelphij /* Find identical suffix. */ 652170754Sdelphij 653170754Sdelphij /* P0 and P1 point beyond the last chars not yet compared. */ 654170754Sdelphij p0 = buffer0 + n0; 655170754Sdelphij p1 = buffer1 + n1; 656170754Sdelphij 657170754Sdelphij if (! ROBUST_OUTPUT_STYLE (output_style) 658170754Sdelphij || filevec[0].missing_newline == filevec[1].missing_newline) 659170754Sdelphij { 660170754Sdelphij end0 = p0; /* Addr of last char in file 0. */ 661170754Sdelphij 662170754Sdelphij /* Get value of P0 at which we should stop scanning backward: 663170754Sdelphij this is when either P0 or P1 points just past the last char 664170754Sdelphij of the identical prefix. */ 665170754Sdelphij beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 666170754Sdelphij 667170754Sdelphij /* Scan back until chars don't match or we reach that point. */ 668170754Sdelphij for (; p0 != beg0; p0--, p1--) 669170754Sdelphij if (*p0 != *p1) 670170754Sdelphij { 671170754Sdelphij /* Point at the first char of the matching suffix. */ 672170754Sdelphij beg0 = p0; 673170754Sdelphij break; 674170754Sdelphij } 675170754Sdelphij 676170754Sdelphij /* Are we at a line-beginning in both files? If not, add the rest of 677170754Sdelphij this line to the main body. Discard up to HORIZON_LINES lines from 678170754Sdelphij the identical suffix. Also, discard one extra line, 679170754Sdelphij because shift_boundaries may need it. */ 680170754Sdelphij i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 681170754Sdelphij && 682170754Sdelphij (buffer1 == p1 || p1[-1] == '\n')); 683170754Sdelphij while (i-- && p0 != end0) 684170754Sdelphij while (*p0++ != '\n') 685170754Sdelphij continue; 686170754Sdelphij 687170754Sdelphij p1 += p0 - beg0; 688170754Sdelphij } 689170754Sdelphij 690170754Sdelphij /* Record the suffix. */ 691170754Sdelphij filevec[0].suffix_begin = p0; 692170754Sdelphij filevec[1].suffix_begin = p1; 693170754Sdelphij 694170754Sdelphij /* Calculate number of lines of prefix to save. 695170754Sdelphij 696170754Sdelphij prefix_count == 0 means save the whole prefix; 697170754Sdelphij we need this for options like -D that output the whole file, 698170754Sdelphij or for enormous contexts (to avoid worrying about arithmetic overflow). 699170754Sdelphij We also need it for options like -F that output some preceding line; 700170754Sdelphij at least we will need to find the last few lines, 701170754Sdelphij but since we don't know how many, it's easiest to find them all. 702170754Sdelphij 703170754Sdelphij Otherwise, prefix_count != 0. Save just prefix_count lines at start 704170754Sdelphij of the line buffer; they'll be moved to the proper location later. 705170754Sdelphij Handle 1 more line than the context says (because we count 1 too many), 706170754Sdelphij rounded up to the next power of 2 to speed index computation. */ 707170754Sdelphij 708170754Sdelphij if (no_diff_means_no_output && ! function_regexp.fastmap 709170754Sdelphij && context < LIN_MAX / 4 && context < n0) 710170754Sdelphij { 711170754Sdelphij middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 712170754Sdelphij suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 713170754Sdelphij for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 714170754Sdelphij continue; 715170754Sdelphij alloc_lines0 = (prefix_count + middle_guess 716170754Sdelphij + MIN (context, suffix_guess)); 717170754Sdelphij } 718170754Sdelphij else 719170754Sdelphij { 720170754Sdelphij prefix_count = 0; 721170754Sdelphij alloc_lines0 = guess_lines (0, 0, n0); 722170754Sdelphij } 723170754Sdelphij 724170754Sdelphij prefix_mask = prefix_count - 1; 725170754Sdelphij lines = 0; 726170754Sdelphij linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 727170754Sdelphij p0 = buffer0; 728170754Sdelphij 729170754Sdelphij /* If the prefix is needed, find the prefix lines. */ 730170754Sdelphij if (! (no_diff_means_no_output 731170754Sdelphij && filevec[0].prefix_end == p0 732170754Sdelphij && filevec[1].prefix_end == p1)) 733170754Sdelphij { 734170754Sdelphij end0 = filevec[0].prefix_end; 735170754Sdelphij while (p0 != end0) 736170754Sdelphij { 737170754Sdelphij lin l = lines++ & prefix_mask; 738170754Sdelphij if (l == alloc_lines0) 739170754Sdelphij { 740170754Sdelphij if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 741170754Sdelphij xalloc_die (); 742170754Sdelphij alloc_lines0 *= 2; 743170754Sdelphij linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 744170754Sdelphij } 745170754Sdelphij linbuf0[l] = p0; 746170754Sdelphij while (*p0++ != '\n') 747170754Sdelphij continue; 748170754Sdelphij } 749170754Sdelphij } 750170754Sdelphij buffered_prefix = prefix_count && context < lines ? context : lines; 751170754Sdelphij 752170754Sdelphij /* Allocate line buffer 1. */ 753170754Sdelphij 754170754Sdelphij middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 755170754Sdelphij suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 756170754Sdelphij alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 757170754Sdelphij if (alloc_lines1 < buffered_prefix 758170754Sdelphij || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 759170754Sdelphij xalloc_die (); 760170754Sdelphij linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 761170754Sdelphij 762170754Sdelphij if (buffered_prefix != lines) 763170754Sdelphij { 764170754Sdelphij /* Rotate prefix lines to proper location. */ 765170754Sdelphij for (i = 0; i < buffered_prefix; i++) 766170754Sdelphij linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 767170754Sdelphij for (i = 0; i < buffered_prefix; i++) 768170754Sdelphij linbuf0[i] = linbuf1[i]; 769170754Sdelphij } 770170754Sdelphij 771170754Sdelphij /* Initialize line buffer 1 from line buffer 0. */ 772170754Sdelphij for (i = 0; i < buffered_prefix; i++) 773170754Sdelphij linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 774170754Sdelphij 775170754Sdelphij /* Record the line buffer, adjusted so that 776170754Sdelphij linbuf[0] points at the first differing line. */ 777170754Sdelphij filevec[0].linbuf = linbuf0 + buffered_prefix; 778170754Sdelphij filevec[1].linbuf = linbuf1 + buffered_prefix; 779170754Sdelphij filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 780170754Sdelphij filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 781170754Sdelphij filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 782170754Sdelphij filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 783170754Sdelphij} 784170754Sdelphij 785170754Sdelphij/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 786170754Sdelphij than 2**k. This table is derived from Chris K. Caldwell's list 787170754Sdelphij <http://www.utm.edu/research/primes/lists/2small/>. */ 788170754Sdelphij 789170754Sdelphijstatic unsigned char const prime_offset[] = 790170754Sdelphij{ 791170754Sdelphij 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 792170754Sdelphij 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 793170754Sdelphij 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 794170754Sdelphij 55, 93, 1, 57, 25 795170754Sdelphij}; 796170754Sdelphij 797170754Sdelphij/* Verify that this host's size_t is not too wide for the above table. */ 798170754Sdelphij 799170754Sdelphijverify (enough_prime_offsets, 800170754Sdelphij sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 801170754Sdelphij 802170754Sdelphij/* Given a vector of two file_data objects, read the file associated 803170754Sdelphij with each one, and build the table of equivalence classes. 804170754Sdelphij Return nonzero if either file appears to be a binary file. 805170754Sdelphij If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 806170754Sdelphij 807170754Sdelphijbool 808170754Sdelphijread_files (struct file_data filevec[], bool pretend_binary) 809170754Sdelphij{ 810170754Sdelphij int i; 811170754Sdelphij bool skip_test = text | pretend_binary; 812170754Sdelphij bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 813170754Sdelphij 814170754Sdelphij if (filevec[0].desc != filevec[1].desc) 815170754Sdelphij appears_binary |= sip (&filevec[1], skip_test | appears_binary); 816170754Sdelphij else 817170754Sdelphij { 818170754Sdelphij filevec[1].buffer = filevec[0].buffer; 819170754Sdelphij filevec[1].bufsize = filevec[0].bufsize; 820170754Sdelphij filevec[1].buffered = filevec[0].buffered; 821170754Sdelphij } 822170754Sdelphij if (appears_binary) 823170754Sdelphij { 824170754Sdelphij set_binary_mode (filevec[0].desc, true); 825170754Sdelphij set_binary_mode (filevec[1].desc, true); 826170754Sdelphij return true; 827170754Sdelphij } 828170754Sdelphij 829170754Sdelphij find_identical_ends (filevec); 830170754Sdelphij 831170754Sdelphij equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 832170754Sdelphij if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 833170754Sdelphij xalloc_die (); 834170754Sdelphij equivs = xmalloc (equivs_alloc * sizeof *equivs); 835170754Sdelphij /* Equivalence class 0 is permanently safe for lines that were not 836170754Sdelphij hashed. Real equivalence classes start at 1. */ 837170754Sdelphij equivs_index = 1; 838170754Sdelphij 839170754Sdelphij /* Allocate (one plus) a prime number of hash buckets. Use a prime 840170754Sdelphij number between 1/3 and 2/3 of the value of equiv_allocs, 841170754Sdelphij approximately. */ 842170754Sdelphij for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 843170754Sdelphij continue; 844170754Sdelphij nbuckets = ((size_t) 1 << i) - prime_offset[i]; 845170754Sdelphij if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 846170754Sdelphij xalloc_die (); 847170754Sdelphij buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 848170754Sdelphij buckets++; 849170754Sdelphij 850170754Sdelphij for (i = 0; i < 2; i++) 851170754Sdelphij find_and_hash_each_line (&filevec[i]); 852170754Sdelphij 853170754Sdelphij filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 854170754Sdelphij 855170754Sdelphij free (equivs); 856170754Sdelphij free (buckets - 1); 857170754Sdelphij 858170754Sdelphij return false; 859170754Sdelphij} 860