io.c revision 32785
1/* File I/O for GNU DIFF. 2 Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc. 3 4This file is part of GNU DIFF. 5 6GNU DIFF is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 2, or (at your option) 9any later version. 10 11GNU DIFF is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with GNU DIFF; see the file COPYING. If not, write to 18the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ 19 20#include "diff.h" 21 22/* Rotate a value n bits to the left. */ 23#define UINT_BIT (sizeof (unsigned) * CHAR_BIT) 24#define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n))) 25 26/* Given a hash value and a new character, return a new hash value. */ 27#define HASH(h, c) ((c) + ROL (h, 7)) 28 29/* Guess remaining number of lines from number N of lines so far, 30 size S so far, and total size T. */ 31#define GUESS_LINES(n,s,t) (((t) - (s)) / ((n) < 10 ? 32 : (s) / ((n)-1)) + 5) 32 33/* Type used for fast prefix comparison in find_identical_ends. */ 34#ifndef word 35#define word int 36#endif 37 38/* Lines are put into equivalence classes (of lines that match in line_cmp). 39 Each equivalence class is represented by one of these structures, 40 but only while the classes are being computed. 41 Afterward, each class is represented by a number. */ 42struct equivclass 43{ 44 int next; /* Next item in this bucket. */ 45 unsigned hash; /* Hash of lines in this class. */ 46 char const *line; /* A line that fits this class. */ 47 size_t length; /* That line's length, not counting its newline. */ 48}; 49 50/* Hash-table: array of buckets, each being a chain of equivalence classes. 51 buckets[-1] is reserved for incomplete lines. */ 52static int *buckets; 53 54/* Number of buckets in the hash table array, not counting buckets[-1]. */ 55static int nbuckets; 56 57/* Array in which the equivalence classes are allocated. 58 The bucket-chains go through the elements in this array. 59 The number of an equivalence class is its index in this array. */ 60static struct equivclass *equivs; 61 62/* Index of first free element in the array `equivs'. */ 63static int equivs_index; 64 65/* Number of elements allocated in the array `equivs'. */ 66static int equivs_alloc; 67 68static void find_and_hash_each_line PARAMS((struct file_data *)); 69static void find_identical_ends PARAMS((struct file_data[])); 70static void prepare_text_end PARAMS((struct file_data *)); 71 72/* Check for binary files and compare them for exact identity. */ 73 74/* Return 1 if BUF contains a non text character. 75 SIZE is the number of characters in BUF. */ 76 77#define binary_file_p(buf, size) (memchr (buf, '\0', size) != 0) 78 79/* Get ready to read the current file. 80 Return nonzero if SKIP_TEST is zero, 81 and if it appears to be a binary file. */ 82 83int 84sip (current, skip_test) 85 struct file_data *current; 86 int skip_test; 87{ 88 /* If we have a nonexistent file at this stage, treat it as empty. */ 89 if (current->desc < 0) 90 { 91 /* Leave room for a sentinel. */ 92 current->bufsize = sizeof (word); 93 current->buffer = xmalloc (current->bufsize); 94 } 95 else 96 { 97 current->bufsize = STAT_BLOCKSIZE (current->stat); 98 current->buffer = xmalloc (current->bufsize); 99 100 if (! skip_test) 101 { 102 /* Check first part of file to see if it's a binary file. */ 103#if HAVE_SETMODE 104 int oldmode = setmode (current->desc, O_BINARY); 105#endif 106 size_t n = read (current->desc, current->buffer, current->bufsize); 107 if (n == -1) 108 pfatal_with_name (current->name); 109 current->buffered_chars = n; 110#if HAVE_SETMODE 111 if (oldmode != O_BINARY) 112 { 113 if (lseek (current->desc, - (off_t) n, SEEK_CUR) == -1) 114 pfatal_with_name (current->name); 115 setmode (current->desc, oldmode); 116 current->buffered_chars = 0; 117 } 118#endif 119 return binary_file_p (current->buffer, n); 120 } 121 } 122 123 current->buffered_chars = 0; 124 return 0; 125} 126 127/* Slurp the rest of the current file completely into memory. */ 128 129void 130slurp (current) 131 struct file_data *current; 132{ 133 size_t cc; 134 135 if (current->desc < 0) 136 /* The file is nonexistent. */ 137 ; 138 else if (S_ISREG (current->stat.st_mode)) 139 { 140 /* It's a regular file; slurp in the rest all at once. */ 141 142 /* Get the size out of the stat block. 143 Allocate enough room for appended newline and sentinel. */ 144 cc = current->stat.st_size + 1 + sizeof (word); 145 if (current->bufsize < cc) 146 { 147 current->bufsize = cc; 148 current->buffer = xrealloc (current->buffer, cc); 149 } 150 151 if (current->buffered_chars < current->stat.st_size) 152 { 153 cc = read (current->desc, 154 current->buffer + current->buffered_chars, 155 current->stat.st_size - current->buffered_chars); 156 if (cc == -1) 157 pfatal_with_name (current->name); 158 current->buffered_chars += cc; 159 } 160 } 161 /* It's not a regular file; read it, growing the buffer as needed. */ 162 else if (always_text_flag || current->buffered_chars != 0) 163 { 164 for (;;) 165 { 166 if (current->buffered_chars == current->bufsize) 167 { 168 current->bufsize = current->bufsize * 2; 169 current->buffer = xrealloc (current->buffer, current->bufsize); 170 } 171 cc = read (current->desc, 172 current->buffer + current->buffered_chars, 173 current->bufsize - current->buffered_chars); 174 if (cc == 0) 175 break; 176 if (cc == -1) 177 pfatal_with_name (current->name); 178 current->buffered_chars += cc; 179 } 180 /* Allocate just enough room for appended newline and sentinel. */ 181 current->bufsize = current->buffered_chars + 1 + sizeof (word); 182 current->buffer = xrealloc (current->buffer, current->bufsize); 183 } 184} 185 186/* Split the file into lines, simultaneously computing the equivalence class for 187 each line. */ 188 189static void 190find_and_hash_each_line (current) 191 struct file_data *current; 192{ 193 unsigned h; 194 unsigned char const *p = (unsigned char const *) current->prefix_end; 195 unsigned char c; 196 int i, *bucket; 197 size_t length; 198 199 /* Cache often-used quantities in local variables to help the compiler. */ 200 char const **linbuf = current->linbuf; 201 int alloc_lines = current->alloc_lines; 202 int line = 0; 203 int linbuf_base = current->linbuf_base; 204 int *cureqs = (int *) xmalloc (alloc_lines * sizeof (int)); 205 struct equivclass *eqs = equivs; 206 int eqs_index = equivs_index; 207 int eqs_alloc = equivs_alloc; 208 char const *suffix_begin = current->suffix_begin; 209 char const *bufend = current->buffer + current->buffered_chars; 210 int use_line_cmp = ignore_some_line_changes; 211 212 while ((char const *) p < suffix_begin) 213 { 214 char const *ip = (char const *) p; 215 216 /* Compute the equivalence class for this line. */ 217 218 h = 0; 219 220 /* Hash this line until we find a newline. */ 221 if (ignore_case_flag) 222 { 223 if (ignore_all_space_flag) 224 while ((c = *p++) != '\n') 225 { 226 if (! ISSPACE (c)) 227 h = HASH (h, ISUPPER (c) ? tolower (c) : c); 228 } 229 else if (ignore_space_change_flag) 230 while ((c = *p++) != '\n') 231 { 232 if (ISSPACE (c)) 233 { 234 for (;;) 235 { 236 c = *p++; 237 if (!ISSPACE (c)) 238 break; 239 if (c == '\n') 240 goto hashing_done; 241 } 242 h = HASH (h, ' '); 243 } 244 /* C is now the first non-space. */ 245 h = HASH (h, ISUPPER (c) ? tolower (c) : c); 246 } 247 else 248 while ((c = *p++) != '\n') 249 h = HASH (h, ISUPPER (c) ? tolower (c) : c); 250 } 251 else 252 { 253 if (ignore_all_space_flag) 254 while ((c = *p++) != '\n') 255 { 256 if (! ISSPACE (c)) 257 h = HASH (h, c); 258 } 259 else if (ignore_space_change_flag) 260 while ((c = *p++) != '\n') 261 { 262 if (ISSPACE (c)) 263 { 264 for (;;) 265 { 266 c = *p++; 267 if (!ISSPACE (c)) 268 break; 269 if (c == '\n') 270 goto hashing_done; 271 } 272 h = HASH (h, ' '); 273 } 274 /* C is now the first non-space. */ 275 h = HASH (h, c); 276 } 277 else 278 while ((c = *p++) != '\n') 279 h = HASH (h, c); 280 } 281 hashing_done:; 282 283 bucket = &buckets[h % nbuckets]; 284 length = (char const *) p - ip - 1; 285 286 if ((char const *) p == bufend 287 && current->missing_newline 288 && ROBUST_OUTPUT_STYLE (output_style)) 289 { 290 /* This line is incomplete. If this is significant, 291 put the line into bucket[-1]. */ 292 if (! (ignore_space_change_flag | ignore_all_space_flag)) 293 bucket = &buckets[-1]; 294 295 /* Omit the inserted newline when computing linbuf later. */ 296 p--; 297 bufend = suffix_begin = (char const *) p; 298 } 299 300 for (i = *bucket; ; i = eqs[i].next) 301 if (!i) 302 { 303 /* Create a new equivalence class in this bucket. */ 304 i = eqs_index++; 305 if (i == eqs_alloc) 306 eqs = (struct equivclass *) 307 xrealloc (eqs, (eqs_alloc*=2) * sizeof(*eqs)); 308 eqs[i].next = *bucket; 309 eqs[i].hash = h; 310 eqs[i].line = ip; 311 eqs[i].length = length; 312 *bucket = i; 313 break; 314 } 315 else if (eqs[i].hash == h) 316 { 317 char const *eqline = eqs[i].line; 318 319 /* Reuse existing equivalence class if the lines are identical. 320 This detects the common case of exact identity 321 faster than complete comparison would. */ 322 if (eqs[i].length == length && memcmp (eqline, ip, length) == 0) 323 break; 324 325 /* Reuse existing class if line_cmp reports the lines equal. */ 326 if (use_line_cmp && line_cmp (eqline, ip) == 0) 327 break; 328 } 329 330 /* Maybe increase the size of the line table. */ 331 if (line == alloc_lines) 332 { 333 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 334 alloc_lines = 2 * alloc_lines - linbuf_base; 335 cureqs = (int *) xrealloc (cureqs, alloc_lines * sizeof (*cureqs)); 336 linbuf = (char const **) xrealloc (linbuf + linbuf_base, 337 (alloc_lines - linbuf_base) 338 * sizeof (*linbuf)) 339 - linbuf_base; 340 } 341 linbuf[line] = ip; 342 cureqs[line] = i; 343 ++line; 344 } 345 346 current->buffered_lines = line; 347 348 for (i = 0; ; i++) 349 { 350 /* Record the line start for lines in the suffix that we care about. 351 Record one more line start than lines, 352 so that we can compute the length of any buffered line. */ 353 if (line == alloc_lines) 354 { 355 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 356 alloc_lines = 2 * alloc_lines - linbuf_base; 357 linbuf = (char const **) xrealloc (linbuf + linbuf_base, 358 (alloc_lines - linbuf_base) 359 * sizeof (*linbuf)) 360 - linbuf_base; 361 } 362 linbuf[line] = (char const *) p; 363 364 if ((char const *) p == bufend) 365 break; 366 367 if (context <= i && no_diff_means_no_output) 368 break; 369 370 line++; 371 372 while (*p++ != '\n') 373 ; 374 } 375 376 /* Done with cache in local variables. */ 377 current->linbuf = linbuf; 378 current->valid_lines = line; 379 current->alloc_lines = alloc_lines; 380 current->equivs = cureqs; 381 equivs = eqs; 382 equivs_alloc = eqs_alloc; 383 equivs_index = eqs_index; 384} 385 386/* Prepare the end of the text. Make sure it's initialized. 387 Make sure text ends in a newline, 388 but remember that we had to add one. */ 389 390static void 391prepare_text_end (current) 392 struct file_data *current; 393{ 394 size_t buffered_chars = current->buffered_chars; 395 char *p = current->buffer; 396 397 if (buffered_chars == 0 || p[buffered_chars - 1] == '\n') 398 current->missing_newline = 0; 399 else 400 { 401 p[buffered_chars++] = '\n'; 402 current->buffered_chars = buffered_chars; 403 current->missing_newline = 1; 404 } 405 406 /* Don't use uninitialized storage when planting or using sentinels. */ 407 if (p) 408 bzero (p + buffered_chars, sizeof (word)); 409} 410 411/* Given a vector of two file_data objects, find the identical 412 prefixes and suffixes of each object. */ 413 414static void 415find_identical_ends (filevec) 416 struct file_data filevec[]; 417{ 418 word *w0, *w1; 419 char *p0, *p1, *buffer0, *buffer1; 420 char const *end0, *beg0; 421 char const **linbuf0, **linbuf1; 422 int i, lines; 423 size_t n0, n1, tem; 424 int alloc_lines0, alloc_lines1; 425 int buffered_prefix, prefix_count, prefix_mask; 426 427 slurp (&filevec[0]); 428 if (filevec[0].desc != filevec[1].desc) 429 slurp (&filevec[1]); 430 else 431 { 432 filevec[1].buffer = filevec[0].buffer; 433 filevec[1].bufsize = filevec[0].bufsize; 434 filevec[1].buffered_chars = filevec[0].buffered_chars; 435 } 436 for (i = 0; i < 2; i++) 437 prepare_text_end (&filevec[i]); 438 439 /* Find identical prefix. */ 440 441 p0 = buffer0 = filevec[0].buffer; 442 p1 = buffer1 = filevec[1].buffer; 443 444 n0 = filevec[0].buffered_chars; 445 n1 = filevec[1].buffered_chars; 446 447 if (p0 == p1) 448 /* The buffers are the same; sentinels won't work. */ 449 p0 = p1 += n1; 450 else 451 { 452 /* Insert end sentinels, in this case characters that are guaranteed 453 to make the equality test false, and thus terminate the loop. */ 454 455 if (n0 < n1) 456 p0[n0] = ~p1[n0]; 457 else 458 p1[n1] = ~p0[n1]; 459 460 /* Loop until first mismatch, or to the sentinel characters. */ 461 462 /* Compare a word at a time for speed. */ 463 w0 = (word *) p0; 464 w1 = (word *) p1; 465 while (*w0++ == *w1++) 466 ; 467 --w0, --w1; 468 469 /* Do the last few bytes of comparison a byte at a time. */ 470 p0 = (char *) w0; 471 p1 = (char *) w1; 472 while (*p0++ == *p1++) 473 ; 474 --p0, --p1; 475 476 /* Don't mistakenly count missing newline as part of prefix. */ 477 if (ROBUST_OUTPUT_STYLE (output_style) 478 && (buffer0 + n0 - filevec[0].missing_newline < p0) 479 != 480 (buffer1 + n1 - filevec[1].missing_newline < p1)) 481 --p0, --p1; 482 } 483 484 /* Now P0 and P1 point at the first nonmatching characters. */ 485 486 /* Skip back to last line-beginning in the prefix, 487 and then discard up to HORIZON_LINES lines from the prefix. */ 488 i = horizon_lines; 489 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 490 --p0, --p1; 491 492 /* Record the prefix. */ 493 filevec[0].prefix_end = p0; 494 filevec[1].prefix_end = p1; 495 496 /* Find identical suffix. */ 497 498 /* P0 and P1 point beyond the last chars not yet compared. */ 499 p0 = buffer0 + n0; 500 p1 = buffer1 + n1; 501 502 if (! ROBUST_OUTPUT_STYLE (output_style) 503 || filevec[0].missing_newline == filevec[1].missing_newline) 504 { 505 end0 = p0; /* Addr of last char in file 0. */ 506 507 /* Get value of P0 at which we should stop scanning backward: 508 this is when either P0 or P1 points just past the last char 509 of the identical prefix. */ 510 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 511 512 /* Scan back until chars don't match or we reach that point. */ 513 while (p0 != beg0) 514 if (*--p0 != *--p1) 515 { 516 /* Point at the first char of the matching suffix. */ 517 ++p0, ++p1; 518 beg0 = p0; 519 break; 520 } 521 522 /* Are we at a line-beginning in both files? If not, add the rest of 523 this line to the main body. Discard up to HORIZON_LINES lines from 524 the identical suffix. Also, discard one extra line, 525 because shift_boundaries may need it. */ 526 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 527 && 528 (buffer1 == p1 || p1[-1] == '\n')); 529 while (i-- && p0 != end0) 530 while (*p0++ != '\n') 531 ; 532 533 p1 += p0 - beg0; 534 } 535 536 /* Record the suffix. */ 537 filevec[0].suffix_begin = p0; 538 filevec[1].suffix_begin = p1; 539 540 /* Calculate number of lines of prefix to save. 541 542 prefix_count == 0 means save the whole prefix; 543 we need this with for options like -D that output the whole file. 544 We also need it for options like -F that output some preceding line; 545 at least we will need to find the last few lines, 546 but since we don't know how many, it's easiest to find them all. 547 548 Otherwise, prefix_count != 0. Save just prefix_count lines at start 549 of the line buffer; they'll be moved to the proper location later. 550 Handle 1 more line than the context says (because we count 1 too many), 551 rounded up to the next power of 2 to speed index computation. */ 552 553 if (no_diff_means_no_output && ! function_regexp_list) 554 { 555 for (prefix_count = 1; prefix_count < context + 1; prefix_count *= 2) 556 ; 557 prefix_mask = prefix_count - 1; 558 alloc_lines0 559 = prefix_count 560 + GUESS_LINES (0, 0, p0 - filevec[0].prefix_end) 561 + context; 562 } 563 else 564 { 565 prefix_count = 0; 566 prefix_mask = ~0; 567 alloc_lines0 = GUESS_LINES (0, 0, n0); 568 } 569 570 lines = 0; 571 linbuf0 = (char const **) xmalloc (alloc_lines0 * sizeof (*linbuf0)); 572 573 /* If the prefix is needed, find the prefix lines. */ 574 if (! (no_diff_means_no_output 575 && filevec[0].prefix_end == p0 576 && filevec[1].prefix_end == p1)) 577 { 578 p0 = buffer0; 579 end0 = filevec[0].prefix_end; 580 while (p0 != end0) 581 { 582 int l = lines++ & prefix_mask; 583 if (l == alloc_lines0) 584 linbuf0 = (char const **) xrealloc (linbuf0, (alloc_lines0 *= 2) 585 * sizeof(*linbuf0)); 586 linbuf0[l] = p0; 587 while (*p0++ != '\n') 588 ; 589 } 590 } 591 buffered_prefix = prefix_count && context < lines ? context : lines; 592 593 /* Allocate line buffer 1. */ 594 tem = prefix_count ? filevec[1].suffix_begin - buffer1 : n1; 595 596 alloc_lines1 597 = (buffered_prefix 598 + GUESS_LINES (lines, filevec[1].prefix_end - buffer1, tem) 599 + context); 600 linbuf1 = (char const **) xmalloc (alloc_lines1 * sizeof (*linbuf1)); 601 602 if (buffered_prefix != lines) 603 { 604 /* Rotate prefix lines to proper location. */ 605 for (i = 0; i < buffered_prefix; i++) 606 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 607 for (i = 0; i < buffered_prefix; i++) 608 linbuf0[i] = linbuf1[i]; 609 } 610 611 /* Initialize line buffer 1 from line buffer 0. */ 612 for (i = 0; i < buffered_prefix; i++) 613 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 614 615 /* Record the line buffer, adjusted so that 616 linbuf*[0] points at the first differing line. */ 617 filevec[0].linbuf = linbuf0 + buffered_prefix; 618 filevec[1].linbuf = linbuf1 + buffered_prefix; 619 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 620 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 621 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 622 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 623} 624 625/* Largest primes less than some power of two, for nbuckets. Values range 626 from useful to preposterous. If one of these numbers isn't prime 627 after all, don't blame it on me, blame it on primes (6) . . . */ 628static int const primes[] = 629{ 630 509, 631 1021, 632 2039, 633 4093, 634 8191, 635 16381, 636 32749, 637#if 32767 < INT_MAX 638 65521, 639 131071, 640 262139, 641 524287, 642 1048573, 643 2097143, 644 4194301, 645 8388593, 646 16777213, 647 33554393, 648 67108859, /* Preposterously large . . . */ 649 134217689, 650 268435399, 651 536870909, 652 1073741789, 653 2147483647, 654#endif 655 0 656}; 657 658/* Given a vector of two file_data objects, read the file associated 659 with each one, and build the table of equivalence classes. 660 Return 1 if either file appears to be a binary file. 661 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 662 663int 664read_files (filevec, pretend_binary) 665 struct file_data filevec[]; 666 int pretend_binary; 667{ 668 int i; 669 int skip_test = always_text_flag | pretend_binary; 670 int appears_binary = pretend_binary | sip (&filevec[0], skip_test); 671 672 if (filevec[0].desc != filevec[1].desc) 673 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 674 else 675 { 676 filevec[1].buffer = filevec[0].buffer; 677 filevec[1].bufsize = filevec[0].bufsize; 678 filevec[1].buffered_chars = filevec[0].buffered_chars; 679 } 680 if (appears_binary) 681 { 682#if HAVE_SETMODE 683 setmode (filevec[0].desc, O_BINARY); 684 setmode (filevec[1].desc, O_BINARY); 685#endif 686 return 1; 687 } 688 689 find_identical_ends (filevec); 690 691 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 692 equivs = (struct equivclass *) xmalloc (equivs_alloc * sizeof (struct equivclass)); 693 /* Equivalence class 0 is permanently safe for lines that were not 694 hashed. Real equivalence classes start at 1. */ 695 equivs_index = 1; 696 697 for (i = 0; primes[i] < equivs_alloc / 3; i++) 698 if (! primes[i]) 699 abort (); 700 nbuckets = primes[i]; 701 702 buckets = (int *) xmalloc ((nbuckets + 1) * sizeof (*buckets)); 703 bzero (buckets++, (nbuckets + 1) * sizeof (*buckets)); 704 705 for (i = 0; i < 2; i++) 706 find_and_hash_each_line (&filevec[i]); 707 708 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 709 710 free (equivs); 711 free (buckets - 1); 712 713 return 0; 714} 715