1// -*- C++ -*- 2/* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004 3 Free Software Foundation, Inc. 4 Written by James Clark (jjc@jclark.com) 5 6This file is part of groff. 7 8groff is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 2, or (at your option) any later 11version. 12 13groff is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License along 19with groff; see the file COPYING. If not, write to the Free Software 20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21 22#include "lib.h" 23 24#include <stdlib.h> 25#include <assert.h> 26#include <errno.h> 27 28#include "posix.h" 29#include "errarg.h" 30#include "error.h" 31#include "stringclass.h" 32#include "cset.h" 33#include "cmap.h" 34 35#include "defs.h" 36#include "index.h" 37 38#include "nonposix.h" 39 40extern "C" const char *Version_string; 41 42#define DEFAULT_HASH_TABLE_SIZE 997 43#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX" 44 45// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). 46 47#define MALLOC_OVERHEAD 16 48 49#ifdef BLOCK_SIZE 50#undef BLOCK_SIZE 51#endif 52 53const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *) 54 - sizeof(int)) / sizeof(int)); 55struct block { 56 block *next; 57 int used; 58 int v[BLOCK_SIZE]; 59 60 block(block *p = 0) : next(p), used(0) { } 61}; 62 63struct block; 64 65union table_entry { 66 block *ptr; 67 int count; 68}; 69 70struct word_list { 71 word_list *next; 72 char *str; 73 int len; 74 word_list(const char *, int, word_list *); 75}; 76 77table_entry *hash_table; 78int hash_table_size = DEFAULT_HASH_TABLE_SIZE; 79// We make this the same size as hash_table so we only have to do one 80// mod per key. 81static word_list **common_words_table = 0; 82char *key_buffer; 83 84FILE *indxfp; 85int ntags = 0; 86string filenames; 87char *temp_index_file = 0; 88 89const char *ignore_fields = "XYZ"; 90const char *common_words_file = COMMON_WORDS_FILE; 91int n_ignore_words = 100; 92int truncate_len = 6; 93int shortest_len = 3; 94int max_keys_per_item = 100; 95 96static void usage(FILE *stream); 97static void write_hash_table(); 98static void init_hash_table(); 99static void read_common_words_file(); 100static int store_key(char *s, int len); 101static void possibly_store_key(char *s, int len); 102static int do_whole_file(const char *filename); 103static int do_file(const char *filename); 104static void store_reference(int filename_index, int pos, int len); 105static void check_integer_arg(char opt, const char *arg, int min, int *res); 106static void store_filename(const char *); 107static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp); 108static char *get_cwd(); 109 110extern "C" { 111 void cleanup(); 112 void catch_fatal_signals(); 113 void ignore_fatal_signals(); 114} 115 116int main(int argc, char **argv) 117{ 118 program_name = argv[0]; 119 static char stderr_buf[BUFSIZ]; 120 setbuf(stderr, stderr_buf); 121 122 const char *base_name = 0; 123 typedef int (*parser_t)(const char *); 124 parser_t parser = do_file; 125 const char *directory = 0; 126 const char *foption = 0; 127 int opt; 128 static const struct option long_options[] = { 129 { "help", no_argument, 0, CHAR_MAX + 1 }, 130 { "version", no_argument, 0, 'v' }, 131 { NULL, 0, 0, 0 } 132 }; 133 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw", 134 long_options, NULL)) 135 != EOF) 136 switch (opt) { 137 case 'c': 138 common_words_file = optarg; 139 break; 140 case 'd': 141 directory = optarg; 142 break; 143 case 'f': 144 foption = optarg; 145 break; 146 case 'h': 147 check_integer_arg('h', optarg, 1, &hash_table_size); 148 if (!is_prime(hash_table_size)) { 149 while (!is_prime(++hash_table_size)) 150 ; 151 warning("%1 not prime: using %2 instead", optarg, hash_table_size); 152 } 153 break; 154 case 'i': 155 ignore_fields = optarg; 156 break; 157 case 'k': 158 check_integer_arg('k', optarg, 1, &max_keys_per_item); 159 break; 160 case 'l': 161 check_integer_arg('l', optarg, 0, &shortest_len); 162 break; 163 case 'n': 164 check_integer_arg('n', optarg, 0, &n_ignore_words); 165 break; 166 case 'o': 167 base_name = optarg; 168 break; 169 case 't': 170 check_integer_arg('t', optarg, 1, &truncate_len); 171 break; 172 case 'w': 173 parser = do_whole_file; 174 break; 175 case 'v': 176 printf("GNU indxbib (groff) version %s\n", Version_string); 177 exit(0); 178 break; 179 case CHAR_MAX + 1: // --help 180 usage(stdout); 181 exit(0); 182 break; 183 case '?': 184 usage(stderr); 185 exit(1); 186 break; 187 default: 188 assert(0); 189 break; 190 } 191 if (optind >= argc && foption == 0) 192 fatal("no files and no -f option"); 193 if (!directory) { 194 char *path = get_cwd(); 195 store_filename(path); 196 a_delete path; 197 } 198 else 199 store_filename(directory); 200 init_hash_table(); 201 store_filename(common_words_file); 202 store_filename(ignore_fields); 203 key_buffer = new char[truncate_len]; 204 read_common_words_file(); 205 if (!base_name) 206 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME; 207 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1; 208 const char *sep = &DIR_SEPS[1]; 209 while (*sep) { 210 p1 = strrchr(base_name, *sep); 211 if (p1 && (!p || p1 > p)) 212 p = p1; 213 sep++; 214 } 215 size_t name_max; 216 if (p) { 217 char *dir = strsave(base_name); 218 dir[p - base_name] = '\0'; 219 name_max = file_name_max(dir); 220 a_delete dir; 221 } 222 else 223 name_max = file_name_max("."); 224 const char *filename = p ? p + 1 : base_name; 225 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max) 226 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX); 227 if (p) { 228 p++; 229 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)]; 230 memcpy(temp_index_file, base_name, p - base_name); 231 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE); 232 } 233 else { 234 temp_index_file = strsave(TEMP_INDEX_TEMPLATE); 235 } 236 catch_fatal_signals(); 237 int fd = mkstemp(temp_index_file); 238 if (fd < 0) 239 fatal("can't create temporary index file: %1", strerror(errno)); 240 indxfp = fdopen(fd, FOPEN_WB); 241 if (indxfp == 0) 242 fatal("fdopen failed"); 243 if (fseek(indxfp, sizeof(index_header), 0) < 0) 244 fatal("can't seek past index header: %1", strerror(errno)); 245 int failed = 0; 246 if (foption) { 247 FILE *fp = stdin; 248 if (strcmp(foption, "-") != 0) { 249 errno = 0; 250 fp = fopen(foption, "r"); 251 if (!fp) 252 fatal("can't open `%1': %2", foption, strerror(errno)); 253 } 254 string path; 255 int lineno = 1; 256 for (;;) { 257 int c; 258 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) { 259 if (c == '\0') 260 error_with_file_and_line(foption, lineno, 261 "nul character in pathname ignored"); 262 else 263 path += c; 264 } 265 if (path.length() > 0) { 266 path += '\0'; 267 if (!(*parser)(path.contents())) 268 failed = 1; 269 path.clear(); 270 } 271 if (c == EOF) 272 break; 273 lineno++; 274 } 275 if (fp != stdin) 276 fclose(fp); 277 } 278 for (int i = optind; i < argc; i++) 279 if (!(*parser)(argv[i])) 280 failed = 1; 281 write_hash_table(); 282 if (fclose(indxfp) < 0) 283 fatal("error closing temporary index file: %1", strerror(errno)); 284 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)]; 285 strcpy(index_file, base_name); 286 strcat(index_file, INDEX_SUFFIX); 287#ifdef HAVE_RENAME 288#ifdef __EMX__ 289 if (access(index_file, R_OK) == 0) 290 unlink(index_file); 291#endif /* __EMX__ */ 292 if (rename(temp_index_file, index_file) < 0) { 293#ifdef __MSDOS__ 294 // RENAME could fail on plain MSDOS filesystems because 295 // INDEX_FILE is an invalid filename, e.g. it has multiple dots. 296 char *fname = p ? index_file + (p - base_name) : 0; 297 char *dot = 0; 298 299 // Replace the dot with an underscore and try again. 300 if (fname 301 && (dot = strchr(fname, '.')) != 0 302 && strcmp(dot, INDEX_SUFFIX) != 0) 303 *dot = '_'; 304 if (rename(temp_index_file, index_file) < 0) 305#endif 306 fatal("can't rename temporary index file: %1", strerror(errno)); 307 } 308#else /* not HAVE_RENAME */ 309 ignore_fatal_signals(); 310 if (unlink(index_file) < 0) { 311 if (errno != ENOENT) 312 fatal("can't unlink `%1': %2", index_file, strerror(errno)); 313 } 314 if (link(temp_index_file, index_file) < 0) 315 fatal("can't link temporary index file: %1", strerror(errno)); 316 if (unlink(temp_index_file) < 0) 317 fatal("can't unlink temporary index file: %1", strerror(errno)); 318#endif /* not HAVE_RENAME */ 319 temp_index_file = 0; 320 return failed; 321} 322 323static void usage(FILE *stream) 324{ 325 fprintf(stream, 326"usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n" 327" [-l n] [-n n] [-o base] [-t n] [files...]\n", 328 program_name); 329} 330 331static void check_integer_arg(char opt, const char *arg, int min, int *res) 332{ 333 char *ptr; 334 long n = strtol(arg, &ptr, 10); 335 if (n == 0 && ptr == arg) 336 error("argument to -%1 not an integer", opt); 337 else if (n < min) 338 error("argument to -%1 must not be less than %2", opt, min); 339 else { 340 if (n > INT_MAX) 341 error("argument to -%1 greater than maximum integer", opt); 342 else if (*ptr != '\0') 343 error("junk after integer argument to -%1", opt); 344 *res = int(n); 345 } 346} 347 348static char *get_cwd() 349{ 350 char *buf; 351 int size = 12; 352 353 for (;;) { 354 buf = new char[size]; 355 if (getcwd(buf, size)) 356 break; 357 if (errno != ERANGE) 358 fatal("cannot get current working directory: %1", strerror(errno)); 359 a_delete buf; 360 if (size == INT_MAX) 361 fatal("current working directory longer than INT_MAX"); 362 if (size > INT_MAX/2) 363 size = INT_MAX; 364 else 365 size *= 2; 366 } 367 return buf; 368} 369 370word_list::word_list(const char *s, int n, word_list *p) 371: next(p), len(n) 372{ 373 str = new char[n]; 374 memcpy(str, s, n); 375} 376 377static void read_common_words_file() 378{ 379 if (n_ignore_words <= 0) 380 return; 381 errno = 0; 382 FILE *fp = fopen(common_words_file, "r"); 383 if (!fp) 384 fatal("can't open `%1': %2", common_words_file, strerror(errno)); 385 common_words_table = new word_list * [hash_table_size]; 386 for (int i = 0; i < hash_table_size; i++) 387 common_words_table[i] = 0; 388 int count = 0; 389 int key_len = 0; 390 for (;;) { 391 int c = getc(fp); 392 while (c != EOF && !csalnum(c)) 393 c = getc(fp); 394 if (c == EOF) 395 break; 396 do { 397 if (key_len < truncate_len) 398 key_buffer[key_len++] = cmlower(c); 399 c = getc(fp); 400 } while (c != EOF && csalnum(c)); 401 if (key_len >= shortest_len) { 402 int h = hash(key_buffer, key_len) % hash_table_size; 403 common_words_table[h] = new word_list(key_buffer, key_len, 404 common_words_table[h]); 405 } 406 if (++count >= n_ignore_words) 407 break; 408 key_len = 0; 409 if (c == EOF) 410 break; 411 } 412 n_ignore_words = count; 413 fclose(fp); 414} 415 416static int do_whole_file(const char *filename) 417{ 418 errno = 0; 419 FILE *fp = fopen(filename, "r"); 420 if (!fp) { 421 error("can't open `%1': %2", filename, strerror(errno)); 422 return 0; 423 } 424 int count = 0; 425 int key_len = 0; 426 int c; 427 while ((c = getc(fp)) != EOF) { 428 if (csalnum(c)) { 429 key_len = 1; 430 key_buffer[0] = c; 431 while ((c = getc(fp)) != EOF) { 432 if (!csalnum(c)) 433 break; 434 if (key_len < truncate_len) 435 key_buffer[key_len++] = c; 436 } 437 if (store_key(key_buffer, key_len)) { 438 if (++count >= max_keys_per_item) 439 break; 440 } 441 if (c == EOF) 442 break; 443 } 444 } 445 store_reference(filenames.length(), 0, 0); 446 store_filename(filename); 447 fclose(fp); 448 return 1; 449} 450 451static int do_file(const char *filename) 452{ 453 errno = 0; 454 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on 455 // byte counts to be consistent with fseek. 456 FILE *fp = fopen(filename, FOPEN_RB); 457 if (fp == 0) { 458 error("can't open `%1': %2", filename, strerror(errno)); 459 return 0; 460 } 461 int filename_index = filenames.length(); 462 store_filename(filename); 463 464 enum { 465 START, // at the start of the file; also in between references 466 BOL, // in the middle of a reference, at the beginning of the line 467 PERCENT, // seen a percent at the beginning of the line 468 IGNORE, // ignoring a field 469 IGNORE_BOL, // at the beginning of a line ignoring a field 470 KEY, // in the middle of a key 471 DISCARD, // after truncate_len bytes of a key 472 MIDDLE // in between keys 473 } state = START; 474 475 // In states START, BOL, IGNORE_BOL, space_count how many spaces at 476 // the beginning have been seen. In states PERCENT, IGNORE, KEY, 477 // MIDDLE space_count must be 0. 478 int space_count = 0; 479 int byte_count = 0; // bytes read 480 int key_len = 0; 481 int ref_start = -1; // position of start of current reference 482 for (;;) { 483 int c = getc(fp); 484 if (c == EOF) 485 break; 486 // We opened the file in binary mode, so we need to skip 487 // every CR character before a Newline. 488 if (c == '\r') { 489 int peek = getc(fp); 490 if (peek == '\n') { 491 byte_count++; 492 c = peek; 493 } 494 else 495 ungetc(peek, fp); 496 } 497#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__) 498 else if (c == 0x1a) // ^Z means EOF in text files 499 break; 500#endif 501 byte_count++; 502 switch (state) { 503 case START: 504 if (c == ' ' || c == '\t') { 505 space_count++; 506 break; 507 } 508 if (c == '\n') { 509 space_count = 0; 510 break; 511 } 512 ref_start = byte_count - space_count - 1; 513 space_count = 0; 514 if (c == '%') 515 state = PERCENT; 516 else if (csalnum(c)) { 517 state = KEY; 518 key_buffer[0] = c; 519 key_len = 1; 520 } 521 else 522 state = MIDDLE; 523 break; 524 case BOL: 525 switch (c) { 526 case '%': 527 if (space_count > 0) { 528 space_count = 0; 529 state = MIDDLE; 530 } 531 else 532 state = PERCENT; 533 break; 534 case ' ': 535 case '\t': 536 space_count++; 537 break; 538 case '\n': 539 store_reference(filename_index, ref_start, 540 byte_count - 1 - space_count - ref_start); 541 state = START; 542 space_count = 0; 543 break; 544 default: 545 space_count = 0; 546 if (csalnum(c)) { 547 state = KEY; 548 key_buffer[0] = c; 549 key_len = 1; 550 } 551 else 552 state = MIDDLE; 553 } 554 break; 555 case PERCENT: 556 if (strchr(ignore_fields, c) != 0) 557 state = IGNORE; 558 else if (c == '\n') 559 state = BOL; 560 else 561 state = MIDDLE; 562 break; 563 case IGNORE: 564 if (c == '\n') 565 state = IGNORE_BOL; 566 break; 567 case IGNORE_BOL: 568 switch (c) { 569 case '%': 570 if (space_count > 0) { 571 state = IGNORE; 572 space_count = 0; 573 } 574 else 575 state = PERCENT; 576 break; 577 case ' ': 578 case '\t': 579 space_count++; 580 break; 581 case '\n': 582 store_reference(filename_index, ref_start, 583 byte_count - 1 - space_count - ref_start); 584 state = START; 585 space_count = 0; 586 break; 587 default: 588 space_count = 0; 589 state = IGNORE; 590 } 591 break; 592 case KEY: 593 if (csalnum(c)) { 594 if (key_len < truncate_len) 595 key_buffer[key_len++] = c; 596 else 597 state = DISCARD; 598 } 599 else { 600 possibly_store_key(key_buffer, key_len); 601 key_len = 0; 602 if (c == '\n') 603 state = BOL; 604 else 605 state = MIDDLE; 606 } 607 break; 608 case DISCARD: 609 if (!csalnum(c)) { 610 possibly_store_key(key_buffer, key_len); 611 key_len = 0; 612 if (c == '\n') 613 state = BOL; 614 else 615 state = MIDDLE; 616 } 617 break; 618 case MIDDLE: 619 if (csalnum(c)) { 620 state = KEY; 621 key_buffer[0] = c; 622 key_len = 1; 623 } 624 else if (c == '\n') 625 state = BOL; 626 break; 627 default: 628 assert(0); 629 } 630 } 631 switch (state) { 632 case START: 633 break; 634 case DISCARD: 635 case KEY: 636 possibly_store_key(key_buffer, key_len); 637 // fall through 638 case BOL: 639 case PERCENT: 640 case IGNORE_BOL: 641 case IGNORE: 642 case MIDDLE: 643 store_reference(filename_index, ref_start, 644 byte_count - ref_start - space_count); 645 break; 646 default: 647 assert(0); 648 } 649 fclose(fp); 650 return 1; 651} 652 653static void store_reference(int filename_index, int pos, int len) 654{ 655 tag t; 656 t.filename_index = filename_index; 657 t.start = pos; 658 t.length = len; 659 fwrite_or_die(&t, sizeof(t), 1, indxfp); 660 ntags++; 661} 662 663static void store_filename(const char *fn) 664{ 665 filenames += fn; 666 filenames += '\0'; 667} 668 669static void init_hash_table() 670{ 671 hash_table = new table_entry[hash_table_size]; 672 for (int i = 0; i < hash_table_size; i++) 673 hash_table[i].ptr = 0; 674} 675 676static void possibly_store_key(char *s, int len) 677{ 678 static int last_tagno = -1; 679 static int key_count; 680 if (last_tagno != ntags) { 681 last_tagno = ntags; 682 key_count = 0; 683 } 684 if (key_count < max_keys_per_item) { 685 if (store_key(s, len)) 686 key_count++; 687 } 688} 689 690static int store_key(char *s, int len) 691{ 692 if (len < shortest_len) 693 return 0; 694 int is_number = 1; 695 for (int i = 0; i < len; i++) 696 if (!csdigit(s[i])) { 697 is_number = 0; 698 s[i] = cmlower(s[i]); 699 } 700 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9')) 701 return 0; 702 int h = hash(s, len) % hash_table_size; 703 if (common_words_table) { 704 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next) 705 if (len == ptr->len && memcmp(s, ptr->str, len) == 0) 706 return 0; 707 } 708 table_entry *pp = hash_table + h; 709 if (!pp->ptr) 710 pp->ptr = new block; 711 else if (pp->ptr->v[pp->ptr->used - 1] == ntags) 712 return 1; 713 else if (pp->ptr->used >= BLOCK_SIZE) 714 pp->ptr = new block(pp->ptr); 715 pp->ptr->v[(pp->ptr->used)++] = ntags; 716 return 1; 717} 718 719static void write_hash_table() 720{ 721 const int minus_one = -1; 722 int li = 0; 723 for (int i = 0; i < hash_table_size; i++) { 724 block *ptr = hash_table[i].ptr; 725 if (!ptr) 726 hash_table[i].count = -1; 727 else { 728 hash_table[i].count = li; 729 block *rev = 0; 730 while (ptr) { 731 block *tem = ptr; 732 ptr = ptr->next; 733 tem->next = rev; 734 rev = tem; 735 } 736 while (rev) { 737 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp); 738 li += rev->used; 739 block *tem = rev; 740 rev = rev->next; 741 delete tem; 742 } 743 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp); 744 li += 1; 745 } 746 } 747 if (sizeof(table_entry) == sizeof(int)) 748 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp); 749 else { 750 // write it out word by word 751 for (int i = 0; i < hash_table_size; i++) 752 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp); 753 } 754 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp); 755 if (fseek(indxfp, 0, 0) < 0) 756 fatal("error seeking on index file: %1", strerror(errno)); 757 index_header h; 758 h.magic = INDEX_MAGIC; 759 h.version = INDEX_VERSION; 760 h.tags_size = ntags; 761 h.lists_size = li; 762 h.table_size = hash_table_size; 763 h.strings_size = filenames.length(); 764 h.truncate = truncate_len; 765 h.shortest = shortest_len; 766 h.common = n_ignore_words; 767 fwrite_or_die(&h, sizeof(h), 1, indxfp); 768} 769 770static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp) 771{ 772 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems) 773 fatal("fwrite failed: %1", strerror(errno)); 774} 775 776void fatal_error_exit() 777{ 778 cleanup(); 779 exit(3); 780} 781 782extern "C" { 783 784void cleanup() 785{ 786 if (temp_index_file) 787 unlink(temp_index_file); 788} 789 790} 791