1/* File I/O for GNU DIFF. 2 3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002 4 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 GNU DIFF is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GNU DIFF is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING. 20 If not, write to the Free Software Foundation, 21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 22 23#include "diff.h" 24#include <cmpbuf.h> 25#include <regex.h> 26#include <setmode.h> 27#include <xalloc.h> 28 29/* Rotate an unsigned value to the left. */ 30#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 31 32/* Given a hash value and a new character, return a new hash value. */ 33#define HASH(h, c) ((c) + ROL (h, 7)) 34 35/* The type of a hash value. */ 36typedef size_t hash_value; 37verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value)); 38 39/* Lines are put into equivalence classes of lines that match in lines_differ. 40 Each equivalence class is represented by one of these structures, 41 but only while the classes are being computed. 42 Afterward, each class is represented by a number. */ 43struct equivclass 44{ 45 lin next; /* Next item in this bucket. */ 46 hash_value hash; /* Hash of lines in this class. */ 47 char const *line; /* A line that fits this class. */ 48 size_t length; /* That line's length, not counting its newline. */ 49}; 50 51/* Hash-table: array of buckets, each being a chain of equivalence classes. 52 buckets[-1] is reserved for incomplete lines. */ 53static lin *buckets; 54 55/* Number of buckets in the hash table array, not counting buckets[-1]. */ 56static size_t nbuckets; 57 58/* Array in which the equivalence classes are allocated. 59 The bucket-chains go through the elements in this array. 60 The number of an equivalence class is its index in this array. */ 61static struct equivclass *equivs; 62 63/* Index of first free element in the array `equivs'. */ 64static lin equivs_index; 65 66/* Number of elements allocated in the array `equivs'. */ 67static lin equivs_alloc; 68 69/* Read a block of data into a file buffer, checking for EOF and error. */ 70 71void 72file_block_read (struct file_data *current, size_t size) 73{ 74 if (size && ! current->eof) 75 { 76 size_t s = block_read (current->desc, 77 FILE_BUFFER (current) + current->buffered, size); 78 if (s == SIZE_MAX) 79 pfatal_with_name (current->name); 80 current->buffered += s; 81 current->eof = s < size; 82 } 83} 84 85/* Check for binary files and compare them for exact identity. */ 86 87/* Return 1 if BUF contains a non text character. 88 SIZE is the number of characters in BUF. */ 89 90#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 91 92/* Get ready to read the current file. 93 Return nonzero if SKIP_TEST is zero, 94 and if it appears to be a binary file. */ 95 96static bool 97sip (struct file_data *current, bool skip_test) 98{ 99 /* If we have a nonexistent file at this stage, treat it as empty. */ 100 if (current->desc < 0) 101 { 102 /* Leave room for a sentinel. */ 103 current->bufsize = sizeof (word); 104 current->buffer = xmalloc (current->bufsize); 105 } 106 else 107 { 108 current->bufsize = buffer_lcm (sizeof (word), 109 STAT_BLOCKSIZE (current->stat), 110 PTRDIFF_MAX - 2 * sizeof (word)); 111 current->buffer = xmalloc (current->bufsize); 112 113 if (! skip_test) 114 { 115 /* Check first part of file to see if it's a binary file. */ 116 117 bool was_binary = set_binary_mode (current->desc, 1); 118 off_t buffered; 119 file_block_read (current, current->bufsize); 120 buffered = current->buffered; 121 122 if (! was_binary) 123 { 124 /* Revert to text mode and seek back to the beginning to 125 reread the file. Use relative seek, since file 126 descriptors like stdin might not start at offset 127 zero. */ 128 129 if (lseek (current->desc, - buffered, SEEK_CUR) == -1) 130 pfatal_with_name (current->name); 131 set_binary_mode (current->desc, 0); 132 current->buffered = 0; 133 current->eof = 0; 134 } 135 136 return binary_file_p (current->buffer, buffered); 137 } 138 } 139 140 current->buffered = 0; 141 current->eof = 0; 142 return 0; 143} 144 145/* Slurp the rest of the current file completely into memory. */ 146 147static void 148slurp (struct file_data *current) 149{ 150 size_t cc; 151 152 if (current->desc < 0) 153 { 154 /* The file is nonexistent. */ 155 return; 156 } 157 158 if (S_ISREG (current->stat.st_mode)) 159 { 160 /* It's a regular file; slurp in the rest all at once. */ 161 162 /* Get the size out of the stat block. 163 Allocate just enough room for appended newline plus word sentinel, 164 plus word-alignment since we want the buffer word-aligned. */ 165 size_t file_size = current->stat.st_size; 166 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 167 if (file_size != current->stat.st_size || cc < file_size 168 || PTRDIFF_MAX <= cc) 169 xalloc_die (); 170 171 if (current->bufsize < cc) 172 { 173 current->bufsize = cc; 174 current->buffer = xrealloc (current->buffer, cc); 175 } 176 177 /* Try to read at least 1 more byte than the size indicates, to 178 detect whether the file is growing. This is a nicety for 179 users who run 'diff' on files while they are changing. */ 180 181 if (current->buffered <= file_size) 182 { 183 file_block_read (current, file_size + 1 - current->buffered); 184 if (current->buffered <= file_size) 185 return; 186 } 187 } 188 189 /* It's not a regular file, or it's a growing regular file; read it, 190 growing the buffer as needed. */ 191 192 file_block_read (current, current->bufsize - current->buffered); 193 194 if (current->buffered) 195 { 196 while (current->buffered == current->bufsize) 197 { 198 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 199 xalloc_die (); 200 current->bufsize *= 2; 201 current->buffer = xrealloc (current->buffer, current->bufsize); 202 file_block_read (current, current->bufsize - current->buffered); 203 } 204 205 /* Allocate just enough room for appended newline plus word 206 sentinel, plus word-alignment. */ 207 cc = current->buffered + 2 * sizeof (word); 208 current->bufsize = cc - cc % sizeof (word); 209 current->buffer = xrealloc (current->buffer, current->bufsize); 210 } 211} 212 213/* Split the file into lines, simultaneously computing the equivalence 214 class for each line. */ 215 216static void 217find_and_hash_each_line (struct file_data *current) 218{ 219 hash_value h; 220 unsigned char const *p = (unsigned char const *) current->prefix_end; 221 unsigned char c; 222 lin i, *bucket; 223 size_t length; 224 225 /* Cache often-used quantities in local variables to help the compiler. */ 226 char const **linbuf = current->linbuf; 227 lin alloc_lines = current->alloc_lines; 228 lin line = 0; 229 lin linbuf_base = current->linbuf_base; 230 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 231 struct equivclass *eqs = equivs; 232 lin eqs_index = equivs_index; 233 lin eqs_alloc = equivs_alloc; 234 char const *suffix_begin = current->suffix_begin; 235 char const *bufend = FILE_BUFFER (current) + current->buffered; 236 bool diff_length_compare_anyway = 237 ignore_white_space != IGNORE_NO_WHITE_SPACE; 238 bool same_length_diff_contents_compare_anyway = 239 diff_length_compare_anyway | ignore_case; 240 241 while ((char const *) p < suffix_begin) 242 { 243 char const *ip = (char const *) p; 244 245 h = 0; 246 247 /* Hash this line until we find a newline. */ 248 if (ignore_case) 249 switch (ignore_white_space) 250 { 251 case IGNORE_ALL_SPACE: 252 while ((c = *p++) != '\n') 253 if (! ISSPACE (c)) 254 h = HASH (h, TOLOWER (c)); 255 break; 256 257 case IGNORE_SPACE_CHANGE: 258 while ((c = *p++) != '\n') 259 { 260 if (ISSPACE (c)) 261 { 262 do 263 if ((c = *p++) == '\n') 264 goto hashing_done; 265 while (ISSPACE (c)); 266 267 h = HASH (h, ' '); 268 } 269 270 /* C is now the first non-space. */ 271 h = HASH (h, TOLOWER (c)); 272 } 273 break; 274 275 case IGNORE_TAB_EXPANSION: 276 { 277 size_t column = 0; 278 while ((c = *p++) != '\n') 279 { 280 int repetitions = 1; 281 282 switch (c) 283 { 284 case '\b': 285 column -= 0 < column; 286 break; 287 288 case '\t': 289 c = ' '; 290 repetitions = TAB_WIDTH - column % TAB_WIDTH; 291 column += repetitions; 292 break; 293 294 case '\r': 295 column = 0; 296 break; 297 298 default: 299 c = TOLOWER (c); 300 column++; 301 break; 302 } 303 304 do 305 h = HASH (h, c); 306 while (--repetitions != 0); 307 } 308 } 309 break; 310 311 default: 312 while ((c = *p++) != '\n') 313 h = HASH (h, TOLOWER (c)); 314 break; 315 } 316 else 317 switch (ignore_white_space) 318 { 319 case IGNORE_ALL_SPACE: 320 while ((c = *p++) != '\n') 321 if (! ISSPACE (c)) 322 h = HASH (h, c); 323 break; 324 325 case IGNORE_SPACE_CHANGE: 326 while ((c = *p++) != '\n') 327 { 328 if (ISSPACE (c)) 329 { 330 do 331 if ((c = *p++) == '\n') 332 goto hashing_done; 333 while (ISSPACE (c)); 334 335 h = HASH (h, ' '); 336 } 337 338 /* C is now the first non-space. */ 339 h = HASH (h, c); 340 } 341 break; 342 343 case IGNORE_TAB_EXPANSION: 344 { 345 size_t column = 0; 346 while ((c = *p++) != '\n') 347 { 348 int repetitions = 1; 349 350 switch (c) 351 { 352 case '\b': 353 column -= 0 < column; 354 break; 355 356 case '\t': 357 c = ' '; 358 repetitions = TAB_WIDTH - column % TAB_WIDTH; 359 column += repetitions; 360 break; 361 362 case '\r': 363 column = 0; 364 break; 365 366 default: 367 column++; 368 break; 369 } 370 371 do 372 h = HASH (h, c); 373 while (--repetitions != 0); 374 } 375 } 376 break; 377 378 default: 379 while ((c = *p++) != '\n') 380 h = HASH (h, c); 381 break; 382 } 383 384 hashing_done:; 385 386 bucket = &buckets[h % nbuckets]; 387 length = (char const *) p - ip - 1; 388 389 if ((char const *) p == bufend 390 && current->missing_newline 391 && ROBUST_OUTPUT_STYLE (output_style)) 392 { 393 /* This line is incomplete. If this is significant, 394 put the line into buckets[-1]. */ 395 if (ignore_white_space < IGNORE_SPACE_CHANGE) 396 bucket = &buckets[-1]; 397 398 /* Omit the inserted newline when computing linbuf later. */ 399 p--; 400 bufend = suffix_begin = (char const *) p; 401 } 402 403 for (i = *bucket; ; i = eqs[i].next) 404 if (!i) 405 { 406 /* Create a new equivalence class in this bucket. */ 407 i = eqs_index++; 408 if (i == eqs_alloc) 409 { 410 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 411 xalloc_die (); 412 eqs_alloc *= 2; 413 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 414 } 415 eqs[i].next = *bucket; 416 eqs[i].hash = h; 417 eqs[i].line = ip; 418 eqs[i].length = length; 419 *bucket = i; 420 break; 421 } 422 else if (eqs[i].hash == h) 423 { 424 char const *eqline = eqs[i].line; 425 426 /* Reuse existing class if lines_differ reports the lines 427 equal. */ 428 if (eqs[i].length == length) 429 { 430 /* Reuse existing equivalence class if the lines are identical. 431 This detects the common case of exact identity 432 faster than lines_differ would. */ 433 if (memcmp (eqline, ip, length) == 0) 434 break; 435 if (!same_length_diff_contents_compare_anyway) 436 continue; 437 } 438 else if (!diff_length_compare_anyway) 439 continue; 440 441 if (! lines_differ (eqline, ip)) 442 break; 443 } 444 445 /* Maybe increase the size of the line table. */ 446 if (line == alloc_lines) 447 { 448 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 449 if (PTRDIFF_MAX / 3 <= alloc_lines 450 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 451 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 452 xalloc_die (); 453 alloc_lines = 2 * alloc_lines - linbuf_base; 454 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 455 linbuf += linbuf_base; 456 linbuf = xrealloc (linbuf, 457 (alloc_lines - linbuf_base) * sizeof *linbuf); 458 linbuf -= linbuf_base; 459 } 460 linbuf[line] = ip; 461 cureqs[line] = i; 462 ++line; 463 } 464 465 current->buffered_lines = line; 466 467 for (i = 0; ; i++) 468 { 469 /* Record the line start for lines in the suffix that we care about. 470 Record one more line start than lines, 471 so that we can compute the length of any buffered line. */ 472 if (line == alloc_lines) 473 { 474 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 475 if (PTRDIFF_MAX / 3 <= alloc_lines 476 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 477 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 478 xalloc_die (); 479 alloc_lines = 2 * alloc_lines - linbuf_base; 480 linbuf += linbuf_base; 481 linbuf = xrealloc (linbuf, 482 (alloc_lines - linbuf_base) * sizeof *linbuf); 483 linbuf -= linbuf_base; 484 } 485 linbuf[line] = (char const *) p; 486 487 if ((char const *) p == bufend) 488 break; 489 490 if (context <= i && no_diff_means_no_output) 491 break; 492 493 line++; 494 495 while (*p++ != '\n') 496 continue; 497 } 498 499 /* Done with cache in local variables. */ 500 current->linbuf = linbuf; 501 current->valid_lines = line; 502 current->alloc_lines = alloc_lines; 503 current->equivs = cureqs; 504 equivs = eqs; 505 equivs_alloc = eqs_alloc; 506 equivs_index = eqs_index; 507} 508 509/* Prepare the text. Make sure the text end is initialized. 510 Make sure text ends in a newline, 511 but remember that we had to add one. 512 Strip trailing CRs, if that was requested. */ 513 514static void 515prepare_text (struct file_data *current) 516{ 517 size_t buffered = current->buffered; 518 char *p = FILE_BUFFER (current); 519 char *dst; 520 521 if (buffered == 0 || p[buffered - 1] == '\n') 522 current->missing_newline = 0; 523 else 524 { 525 p[buffered++] = '\n'; 526 current->missing_newline = 1; 527 } 528 529 if (!p) 530 return; 531 532 /* Don't use uninitialized storage when planting or using sentinels. */ 533 memset (p + buffered, 0, sizeof (word)); 534 535 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 536 { 537 char const *src = dst; 538 char const *srclim = p + buffered; 539 540 do 541 dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 542 while (src < srclim); 543 544 buffered -= src - dst; 545 } 546 547 current->buffered = buffered; 548} 549 550/* We have found N lines in a buffer of size S; guess the 551 proportionate number of lines that will be found in a buffer of 552 size T. However, do not guess a number of lines so large that the 553 resulting line table might cause overflow in size calculations. */ 554static lin 555guess_lines (lin n, size_t s, size_t t) 556{ 557 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 558 lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 559 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 560} 561 562/* Given a vector of two file_data objects, find the identical 563 prefixes and suffixes of each object. */ 564 565static void 566find_identical_ends (struct file_data filevec[]) 567{ 568 word *w0, *w1; 569 char *p0, *p1, *buffer0, *buffer1; 570 char const *end0, *beg0; 571 char const **linbuf0, **linbuf1; 572 lin i, lines; 573 size_t n0, n1; 574 lin alloc_lines0, alloc_lines1; 575 lin buffered_prefix, prefix_count, prefix_mask; 576 lin middle_guess, suffix_guess; 577 578 slurp (&filevec[0]); 579 prepare_text (&filevec[0]); 580 if (filevec[0].desc != filevec[1].desc) 581 { 582 slurp (&filevec[1]); 583 prepare_text (&filevec[1]); 584 } 585 else 586 { 587 filevec[1].buffer = filevec[0].buffer; 588 filevec[1].bufsize = filevec[0].bufsize; 589 filevec[1].buffered = filevec[0].buffered; 590 filevec[1].missing_newline = filevec[0].missing_newline; 591 } 592 593 /* Find identical prefix. */ 594 595 w0 = filevec[0].buffer; 596 w1 = filevec[1].buffer; 597 p0 = buffer0 = (char *) w0; 598 p1 = buffer1 = (char *) w1; 599 n0 = filevec[0].buffered; 600 n1 = filevec[1].buffered; 601 602 if (p0 == p1) 603 /* The buffers are the same; sentinels won't work. */ 604 p0 = p1 += n1; 605 else 606 { 607 /* Insert end sentinels, in this case characters that are guaranteed 608 to make the equality test false, and thus terminate the loop. */ 609 610 if (n0 < n1) 611 p0[n0] = ~p1[n0]; 612 else 613 p1[n1] = ~p0[n1]; 614 615 /* Loop until first mismatch, or to the sentinel characters. */ 616 617 /* Compare a word at a time for speed. */ 618 while (*w0 == *w1) 619 w0++, w1++; 620 621 /* Do the last few bytes of comparison a byte at a time. */ 622 p0 = (char *) w0; 623 p1 = (char *) w1; 624 while (*p0 == *p1) 625 p0++, p1++; 626 627 /* Don't mistakenly count missing newline as part of prefix. */ 628 if (ROBUST_OUTPUT_STYLE (output_style) 629 && ((buffer0 + n0 - filevec[0].missing_newline < p0) 630 != 631 (buffer1 + n1 - filevec[1].missing_newline < p1))) 632 p0--, p1--; 633 } 634 635 /* Now P0 and P1 point at the first nonmatching characters. */ 636 637 /* Skip back to last line-beginning in the prefix, 638 and then discard up to HORIZON_LINES lines from the prefix. */ 639 i = horizon_lines; 640 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 641 p0--, p1--; 642 643 /* Record the prefix. */ 644 filevec[0].prefix_end = p0; 645 filevec[1].prefix_end = p1; 646 647 /* Find identical suffix. */ 648 649 /* P0 and P1 point beyond the last chars not yet compared. */ 650 p0 = buffer0 + n0; 651 p1 = buffer1 + n1; 652 653 if (! ROBUST_OUTPUT_STYLE (output_style) 654 || filevec[0].missing_newline == filevec[1].missing_newline) 655 { 656 end0 = p0; /* Addr of last char in file 0. */ 657 658 /* Get value of P0 at which we should stop scanning backward: 659 this is when either P0 or P1 points just past the last char 660 of the identical prefix. */ 661 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 662 663 /* Scan back until chars don't match or we reach that point. */ 664 for (; p0 != beg0; p0--, p1--) 665 if (*p0 != *p1) 666 { 667 /* Point at the first char of the matching suffix. */ 668 beg0 = p0; 669 break; 670 } 671 672 /* Are we at a line-beginning in both files? If not, add the rest of 673 this line to the main body. Discard up to HORIZON_LINES lines from 674 the identical suffix. Also, discard one extra line, 675 because shift_boundaries may need it. */ 676 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 677 && 678 (buffer1 == p1 || p1[-1] == '\n')); 679 while (i-- && p0 != end0) 680 while (*p0++ != '\n') 681 continue; 682 683 p1 += p0 - beg0; 684 } 685 686 /* Record the suffix. */ 687 filevec[0].suffix_begin = p0; 688 filevec[1].suffix_begin = p1; 689 690 /* Calculate number of lines of prefix to save. 691 692 prefix_count == 0 means save the whole prefix; 693 we need this for options like -D that output the whole file, 694 or for enormous contexts (to avoid worrying about arithmetic overflow). 695 We also need it for options like -F that output some preceding line; 696 at least we will need to find the last few lines, 697 but since we don't know how many, it's easiest to find them all. 698 699 Otherwise, prefix_count != 0. Save just prefix_count lines at start 700 of the line buffer; they'll be moved to the proper location later. 701 Handle 1 more line than the context says (because we count 1 too many), 702 rounded up to the next power of 2 to speed index computation. */ 703 704 if (no_diff_means_no_output && ! function_regexp.fastmap 705 && context < LIN_MAX / 4 && context < n0) 706 { 707 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 708 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 709 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 710 continue; 711 alloc_lines0 = (prefix_count + middle_guess 712 + MIN (context, suffix_guess)); 713 } 714 else 715 { 716 prefix_count = 0; 717 alloc_lines0 = guess_lines (0, 0, n0); 718 } 719 720 prefix_mask = prefix_count - 1; 721 lines = 0; 722 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 723 p0 = buffer0; 724 725 /* If the prefix is needed, find the prefix lines. */ 726 if (! (no_diff_means_no_output 727 && filevec[0].prefix_end == p0 728 && filevec[1].prefix_end == p1)) 729 { 730 end0 = filevec[0].prefix_end; 731 while (p0 != end0) 732 { 733 lin l = lines++ & prefix_mask; 734 if (l == alloc_lines0) 735 { 736 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 737 xalloc_die (); 738 alloc_lines0 *= 2; 739 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 740 } 741 linbuf0[l] = p0; 742 while (*p0++ != '\n') 743 continue; 744 } 745 } 746 buffered_prefix = prefix_count && context < lines ? context : lines; 747 748 /* Allocate line buffer 1. */ 749 750 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 751 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 752 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 753 if (alloc_lines1 < buffered_prefix 754 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 755 xalloc_die (); 756 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 757 758 if (buffered_prefix != lines) 759 { 760 /* Rotate prefix lines to proper location. */ 761 for (i = 0; i < buffered_prefix; i++) 762 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 763 for (i = 0; i < buffered_prefix; i++) 764 linbuf0[i] = linbuf1[i]; 765 } 766 767 /* Initialize line buffer 1 from line buffer 0. */ 768 for (i = 0; i < buffered_prefix; i++) 769 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 770 771 /* Record the line buffer, adjusted so that 772 linbuf[0] points at the first differing line. */ 773 filevec[0].linbuf = linbuf0 + buffered_prefix; 774 filevec[1].linbuf = linbuf1 + buffered_prefix; 775 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 776 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 777 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 778 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 779} 780 781/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 782 than 2**k. This table is derived from Chris K. Caldwell's list 783 <http://www.utm.edu/research/primes/lists/2small/>. */ 784 785static unsigned char const prime_offset[] = 786{ 787 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 788 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 789 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 790 55, 93, 1, 57, 25 791}; 792 793/* Verify that this host's size_t is not too wide for the above table. */ 794 795verify (enough_prime_offsets, 796 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 797 798/* Given a vector of two file_data objects, read the file associated 799 with each one, and build the table of equivalence classes. 800 Return nonzero if either file appears to be a binary file. 801 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 802 803bool 804read_files (struct file_data filevec[], bool pretend_binary) 805{ 806 int i; 807 bool skip_test = text | pretend_binary; 808 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 809 810 if (filevec[0].desc != filevec[1].desc) 811 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 812 else 813 { 814 filevec[1].buffer = filevec[0].buffer; 815 filevec[1].bufsize = filevec[0].bufsize; 816 filevec[1].buffered = filevec[0].buffered; 817 } 818 if (appears_binary) 819 { 820 set_binary_mode (filevec[0].desc, 1); 821 set_binary_mode (filevec[1].desc, 1); 822 return 1; 823 } 824 825 find_identical_ends (filevec); 826 827 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 828 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 829 xalloc_die (); 830 equivs = xmalloc (equivs_alloc * sizeof *equivs); 831 /* Equivalence class 0 is permanently safe for lines that were not 832 hashed. Real equivalence classes start at 1. */ 833 equivs_index = 1; 834 835 /* Allocate (one plus) a prime number of hash buckets. Use a prime 836 number between 1/3 and 2/3 of the value of equiv_allocs, 837 approximately. */ 838 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 839 continue; 840 nbuckets = ((size_t) 1 << i) - prime_offset[i]; 841 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 842 xalloc_die (); 843 buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 844 buckets++; 845 846 for (i = 0; i < 2; i++) 847 find_and_hash_each_line (&filevec[i]); 848 849 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 850 851 free (equivs); 852 free (buckets - 1); 853 854 return 0; 855} 856