1114402Sru// -*- C++ -*- 2151497Sru/* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004 3114402Sru Free Software Foundation, Inc. 4114402Sru Written by James Clark (jjc@jclark.com) 5114402Sru 6114402SruThis file is part of groff. 7114402Sru 8114402Srugroff is free software; you can redistribute it and/or modify it under 9114402Sruthe terms of the GNU General Public License as published by the Free 10114402SruSoftware Foundation; either version 2, or (at your option) any later 11114402Sruversion. 12114402Sru 13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY 14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or 15114402SruFITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16114402Srufor more details. 17114402Sru 18114402SruYou should have received a copy of the GNU General Public License along 19114402Sruwith groff; see the file COPYING. If not, write to the Free Software 20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21114402Sru 22114402Sru#include "lib.h" 23114402Sru 24114402Sru#include <stdlib.h> 25114402Sru#include <assert.h> 26114402Sru#include <errno.h> 27114402Sru 28114402Sru#include "posix.h" 29114402Sru#include "errarg.h" 30114402Sru#include "error.h" 31114402Sru#include "stringclass.h" 32114402Sru#include "cset.h" 33114402Sru#include "cmap.h" 34114402Sru 35114402Sru#include "defs.h" 36114402Sru#include "index.h" 37114402Sru 38114402Sru#include "nonposix.h" 39114402Sru 40114402Sruextern "C" const char *Version_string; 41114402Sru 42114402Sru#define DEFAULT_HASH_TABLE_SIZE 997 43114402Sru#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX" 44114402Sru 45114402Sru// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). 46114402Sru 47114402Sru#define MALLOC_OVERHEAD 16 48114402Sru 49114402Sru#ifdef BLOCK_SIZE 50114402Sru#undef BLOCK_SIZE 51114402Sru#endif 52114402Sru 53114402Sruconst int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *) 54114402Sru - sizeof(int)) / sizeof(int)); 55114402Srustruct block { 56114402Sru block *next; 57114402Sru int used; 58114402Sru int v[BLOCK_SIZE]; 59114402Sru 60114402Sru block(block *p = 0) : next(p), used(0) { } 61114402Sru}; 62114402Sru 63114402Srustruct block; 64114402Sru 65114402Sruunion table_entry { 66114402Sru block *ptr; 67114402Sru int count; 68114402Sru}; 69114402Sru 70114402Srustruct word_list { 71114402Sru word_list *next; 72114402Sru char *str; 73114402Sru int len; 74114402Sru word_list(const char *, int, word_list *); 75114402Sru}; 76114402Sru 77114402Srutable_entry *hash_table; 78114402Sruint hash_table_size = DEFAULT_HASH_TABLE_SIZE; 79114402Sru// We make this the same size as hash_table so we only have to do one 80114402Sru// mod per key. 81114402Srustatic word_list **common_words_table = 0; 82114402Sruchar *key_buffer; 83114402Sru 84114402SruFILE *indxfp; 85114402Sruint ntags = 0; 86114402Srustring filenames; 87114402Sruchar *temp_index_file = 0; 88114402Sru 89114402Sruconst char *ignore_fields = "XYZ"; 90114402Sruconst char *common_words_file = COMMON_WORDS_FILE; 91114402Sruint n_ignore_words = 100; 92114402Sruint truncate_len = 6; 93114402Sruint shortest_len = 3; 94114402Sruint max_keys_per_item = 100; 95114402Sru 96114402Srustatic void usage(FILE *stream); 97114402Srustatic void write_hash_table(); 98114402Srustatic void init_hash_table(); 99114402Srustatic void read_common_words_file(); 100114402Srustatic int store_key(char *s, int len); 101114402Srustatic void possibly_store_key(char *s, int len); 102114402Srustatic int do_whole_file(const char *filename); 103114402Srustatic int do_file(const char *filename); 104114402Srustatic void store_reference(int filename_index, int pos, int len); 105114402Srustatic void check_integer_arg(char opt, const char *arg, int min, int *res); 106114402Srustatic void store_filename(const char *); 107114402Srustatic void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp); 108114402Srustatic char *get_cwd(); 109114402Sru 110114402Sruextern "C" { 111114402Sru void cleanup(); 112114402Sru void catch_fatal_signals(); 113114402Sru void ignore_fatal_signals(); 114114402Sru} 115114402Sru 116114402Sruint main(int argc, char **argv) 117114402Sru{ 118114402Sru program_name = argv[0]; 119114402Sru static char stderr_buf[BUFSIZ]; 120114402Sru setbuf(stderr, stderr_buf); 121114402Sru 122151497Sru const char *base_name = 0; 123114402Sru typedef int (*parser_t)(const char *); 124114402Sru parser_t parser = do_file; 125114402Sru const char *directory = 0; 126114402Sru const char *foption = 0; 127114402Sru int opt; 128114402Sru static const struct option long_options[] = { 129114402Sru { "help", no_argument, 0, CHAR_MAX + 1 }, 130114402Sru { "version", no_argument, 0, 'v' }, 131114402Sru { NULL, 0, 0, 0 } 132114402Sru }; 133114402Sru while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw", 134114402Sru long_options, NULL)) 135114402Sru != EOF) 136114402Sru switch (opt) { 137114402Sru case 'c': 138114402Sru common_words_file = optarg; 139114402Sru break; 140114402Sru case 'd': 141114402Sru directory = optarg; 142114402Sru break; 143114402Sru case 'f': 144114402Sru foption = optarg; 145114402Sru break; 146114402Sru case 'h': 147114402Sru check_integer_arg('h', optarg, 1, &hash_table_size); 148114402Sru if (!is_prime(hash_table_size)) { 149114402Sru while (!is_prime(++hash_table_size)) 150114402Sru ; 151114402Sru warning("%1 not prime: using %2 instead", optarg, hash_table_size); 152114402Sru } 153114402Sru break; 154114402Sru case 'i': 155114402Sru ignore_fields = optarg; 156114402Sru break; 157114402Sru case 'k': 158114402Sru check_integer_arg('k', optarg, 1, &max_keys_per_item); 159114402Sru break; 160114402Sru case 'l': 161114402Sru check_integer_arg('l', optarg, 0, &shortest_len); 162114402Sru break; 163114402Sru case 'n': 164114402Sru check_integer_arg('n', optarg, 0, &n_ignore_words); 165114402Sru break; 166114402Sru case 'o': 167151497Sru base_name = optarg; 168114402Sru break; 169114402Sru case 't': 170114402Sru check_integer_arg('t', optarg, 1, &truncate_len); 171114402Sru break; 172114402Sru case 'w': 173114402Sru parser = do_whole_file; 174114402Sru break; 175114402Sru case 'v': 176114402Sru printf("GNU indxbib (groff) version %s\n", Version_string); 177114402Sru exit(0); 178114402Sru break; 179114402Sru case CHAR_MAX + 1: // --help 180114402Sru usage(stdout); 181114402Sru exit(0); 182114402Sru break; 183114402Sru case '?': 184114402Sru usage(stderr); 185114402Sru exit(1); 186114402Sru break; 187114402Sru default: 188114402Sru assert(0); 189114402Sru break; 190114402Sru } 191114402Sru if (optind >= argc && foption == 0) 192114402Sru fatal("no files and no -f option"); 193114402Sru if (!directory) { 194114402Sru char *path = get_cwd(); 195114402Sru store_filename(path); 196114402Sru a_delete path; 197114402Sru } 198114402Sru else 199114402Sru store_filename(directory); 200114402Sru init_hash_table(); 201114402Sru store_filename(common_words_file); 202114402Sru store_filename(ignore_fields); 203114402Sru key_buffer = new char[truncate_len]; 204114402Sru read_common_words_file(); 205151497Sru if (!base_name) 206151497Sru base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME; 207151497Sru const char *p = strrchr(base_name, DIR_SEPS[0]), *p1; 208114402Sru const char *sep = &DIR_SEPS[1]; 209114402Sru while (*sep) { 210151497Sru p1 = strrchr(base_name, *sep); 211114402Sru if (p1 && (!p || p1 > p)) 212114402Sru p = p1; 213114402Sru sep++; 214114402Sru } 215114402Sru size_t name_max; 216114402Sru if (p) { 217151497Sru char *dir = strsave(base_name); 218151497Sru dir[p - base_name] = '\0'; 219114402Sru name_max = file_name_max(dir); 220114402Sru a_delete dir; 221114402Sru } 222114402Sru else 223114402Sru name_max = file_name_max("."); 224151497Sru const char *filename = p ? p + 1 : base_name; 225114402Sru if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max) 226114402Sru fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX); 227114402Sru if (p) { 228114402Sru p++; 229151497Sru temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)]; 230151497Sru memcpy(temp_index_file, base_name, p - base_name); 231151497Sru strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE); 232114402Sru } 233114402Sru else { 234114402Sru temp_index_file = strsave(TEMP_INDEX_TEMPLATE); 235114402Sru } 236114402Sru catch_fatal_signals(); 237114402Sru int fd = mkstemp(temp_index_file); 238114402Sru if (fd < 0) 239114402Sru fatal("can't create temporary index file: %1", strerror(errno)); 240114402Sru indxfp = fdopen(fd, FOPEN_WB); 241114402Sru if (indxfp == 0) 242114402Sru fatal("fdopen failed"); 243114402Sru if (fseek(indxfp, sizeof(index_header), 0) < 0) 244114402Sru fatal("can't seek past index header: %1", strerror(errno)); 245114402Sru int failed = 0; 246114402Sru if (foption) { 247114402Sru FILE *fp = stdin; 248114402Sru if (strcmp(foption, "-") != 0) { 249114402Sru errno = 0; 250114402Sru fp = fopen(foption, "r"); 251114402Sru if (!fp) 252114402Sru fatal("can't open `%1': %2", foption, strerror(errno)); 253114402Sru } 254114402Sru string path; 255114402Sru int lineno = 1; 256114402Sru for (;;) { 257114402Sru int c; 258114402Sru for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) { 259114402Sru if (c == '\0') 260114402Sru error_with_file_and_line(foption, lineno, 261114402Sru "nul character in pathname ignored"); 262114402Sru else 263114402Sru path += c; 264114402Sru } 265114402Sru if (path.length() > 0) { 266114402Sru path += '\0'; 267114402Sru if (!(*parser)(path.contents())) 268114402Sru failed = 1; 269114402Sru path.clear(); 270114402Sru } 271114402Sru if (c == EOF) 272114402Sru break; 273114402Sru lineno++; 274114402Sru } 275114402Sru if (fp != stdin) 276114402Sru fclose(fp); 277114402Sru } 278114402Sru for (int i = optind; i < argc; i++) 279114402Sru if (!(*parser)(argv[i])) 280114402Sru failed = 1; 281114402Sru write_hash_table(); 282114402Sru if (fclose(indxfp) < 0) 283114402Sru fatal("error closing temporary index file: %1", strerror(errno)); 284151497Sru char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)]; 285151497Sru strcpy(index_file, base_name); 286114402Sru strcat(index_file, INDEX_SUFFIX); 287114402Sru#ifdef HAVE_RENAME 288114402Sru#ifdef __EMX__ 289114402Sru if (access(index_file, R_OK) == 0) 290114402Sru unlink(index_file); 291114402Sru#endif /* __EMX__ */ 292114402Sru if (rename(temp_index_file, index_file) < 0) { 293114402Sru#ifdef __MSDOS__ 294114402Sru // RENAME could fail on plain MSDOS filesystems because 295114402Sru // INDEX_FILE is an invalid filename, e.g. it has multiple dots. 296151497Sru char *fname = p ? index_file + (p - base_name) : 0; 297114402Sru char *dot = 0; 298114402Sru 299114402Sru // Replace the dot with an underscore and try again. 300114402Sru if (fname 301114402Sru && (dot = strchr(fname, '.')) != 0 302114402Sru && strcmp(dot, INDEX_SUFFIX) != 0) 303114402Sru *dot = '_'; 304114402Sru if (rename(temp_index_file, index_file) < 0) 305114402Sru#endif 306114402Sru fatal("can't rename temporary index file: %1", strerror(errno)); 307114402Sru } 308114402Sru#else /* not HAVE_RENAME */ 309114402Sru ignore_fatal_signals(); 310114402Sru if (unlink(index_file) < 0) { 311114402Sru if (errno != ENOENT) 312114402Sru fatal("can't unlink `%1': %2", index_file, strerror(errno)); 313114402Sru } 314114402Sru if (link(temp_index_file, index_file) < 0) 315114402Sru fatal("can't link temporary index file: %1", strerror(errno)); 316114402Sru if (unlink(temp_index_file) < 0) 317114402Sru fatal("can't unlink temporary index file: %1", strerror(errno)); 318114402Sru#endif /* not HAVE_RENAME */ 319114402Sru temp_index_file = 0; 320114402Sru return failed; 321114402Sru} 322114402Sru 323114402Srustatic void usage(FILE *stream) 324114402Sru{ 325114402Sru fprintf(stream, 326114402Sru"usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n" 327114402Sru" [-l n] [-n n] [-o base] [-t n] [files...]\n", 328114402Sru program_name); 329114402Sru} 330114402Sru 331114402Srustatic void check_integer_arg(char opt, const char *arg, int min, int *res) 332114402Sru{ 333114402Sru char *ptr; 334114402Sru long n = strtol(arg, &ptr, 10); 335114402Sru if (n == 0 && ptr == arg) 336114402Sru error("argument to -%1 not an integer", opt); 337114402Sru else if (n < min) 338114402Sru error("argument to -%1 must not be less than %2", opt, min); 339114402Sru else { 340114402Sru if (n > INT_MAX) 341114402Sru error("argument to -%1 greater than maximum integer", opt); 342114402Sru else if (*ptr != '\0') 343114402Sru error("junk after integer argument to -%1", opt); 344114402Sru *res = int(n); 345114402Sru } 346114402Sru} 347114402Sru 348114402Srustatic char *get_cwd() 349114402Sru{ 350114402Sru char *buf; 351114402Sru int size = 12; 352114402Sru 353114402Sru for (;;) { 354114402Sru buf = new char[size]; 355114402Sru if (getcwd(buf, size)) 356114402Sru break; 357114402Sru if (errno != ERANGE) 358114402Sru fatal("cannot get current working directory: %1", strerror(errno)); 359114402Sru a_delete buf; 360114402Sru if (size == INT_MAX) 361114402Sru fatal("current working directory longer than INT_MAX"); 362114402Sru if (size > INT_MAX/2) 363114402Sru size = INT_MAX; 364114402Sru else 365114402Sru size *= 2; 366114402Sru } 367114402Sru return buf; 368114402Sru} 369114402Sru 370114402Sruword_list::word_list(const char *s, int n, word_list *p) 371114402Sru: next(p), len(n) 372114402Sru{ 373114402Sru str = new char[n]; 374114402Sru memcpy(str, s, n); 375114402Sru} 376114402Sru 377114402Srustatic void read_common_words_file() 378114402Sru{ 379114402Sru if (n_ignore_words <= 0) 380114402Sru return; 381114402Sru errno = 0; 382114402Sru FILE *fp = fopen(common_words_file, "r"); 383114402Sru if (!fp) 384114402Sru fatal("can't open `%1': %2", common_words_file, strerror(errno)); 385114402Sru common_words_table = new word_list * [hash_table_size]; 386114402Sru for (int i = 0; i < hash_table_size; i++) 387114402Sru common_words_table[i] = 0; 388114402Sru int count = 0; 389114402Sru int key_len = 0; 390114402Sru for (;;) { 391114402Sru int c = getc(fp); 392114402Sru while (c != EOF && !csalnum(c)) 393114402Sru c = getc(fp); 394114402Sru if (c == EOF) 395114402Sru break; 396114402Sru do { 397114402Sru if (key_len < truncate_len) 398114402Sru key_buffer[key_len++] = cmlower(c); 399114402Sru c = getc(fp); 400114402Sru } while (c != EOF && csalnum(c)); 401114402Sru if (key_len >= shortest_len) { 402114402Sru int h = hash(key_buffer, key_len) % hash_table_size; 403114402Sru common_words_table[h] = new word_list(key_buffer, key_len, 404114402Sru common_words_table[h]); 405114402Sru } 406114402Sru if (++count >= n_ignore_words) 407114402Sru break; 408114402Sru key_len = 0; 409114402Sru if (c == EOF) 410114402Sru break; 411114402Sru } 412114402Sru n_ignore_words = count; 413114402Sru fclose(fp); 414114402Sru} 415114402Sru 416114402Srustatic int do_whole_file(const char *filename) 417114402Sru{ 418114402Sru errno = 0; 419114402Sru FILE *fp = fopen(filename, "r"); 420114402Sru if (!fp) { 421114402Sru error("can't open `%1': %2", filename, strerror(errno)); 422114402Sru return 0; 423114402Sru } 424114402Sru int count = 0; 425114402Sru int key_len = 0; 426114402Sru int c; 427114402Sru while ((c = getc(fp)) != EOF) { 428114402Sru if (csalnum(c)) { 429114402Sru key_len = 1; 430114402Sru key_buffer[0] = c; 431114402Sru while ((c = getc(fp)) != EOF) { 432114402Sru if (!csalnum(c)) 433114402Sru break; 434114402Sru if (key_len < truncate_len) 435114402Sru key_buffer[key_len++] = c; 436114402Sru } 437114402Sru if (store_key(key_buffer, key_len)) { 438114402Sru if (++count >= max_keys_per_item) 439114402Sru break; 440114402Sru } 441114402Sru if (c == EOF) 442114402Sru break; 443114402Sru } 444114402Sru } 445114402Sru store_reference(filenames.length(), 0, 0); 446114402Sru store_filename(filename); 447114402Sru fclose(fp); 448114402Sru return 1; 449114402Sru} 450114402Sru 451114402Srustatic int do_file(const char *filename) 452114402Sru{ 453114402Sru errno = 0; 454114402Sru // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on 455114402Sru // byte counts to be consistent with fseek. 456114402Sru FILE *fp = fopen(filename, FOPEN_RB); 457114402Sru if (fp == 0) { 458114402Sru error("can't open `%1': %2", filename, strerror(errno)); 459114402Sru return 0; 460114402Sru } 461114402Sru int filename_index = filenames.length(); 462114402Sru store_filename(filename); 463114402Sru 464114402Sru enum { 465114402Sru START, // at the start of the file; also in between references 466114402Sru BOL, // in the middle of a reference, at the beginning of the line 467114402Sru PERCENT, // seen a percent at the beginning of the line 468114402Sru IGNORE, // ignoring a field 469114402Sru IGNORE_BOL, // at the beginning of a line ignoring a field 470114402Sru KEY, // in the middle of a key 471114402Sru DISCARD, // after truncate_len bytes of a key 472114402Sru MIDDLE // in between keys 473114402Sru } state = START; 474114402Sru 475114402Sru // In states START, BOL, IGNORE_BOL, space_count how many spaces at 476114402Sru // the beginning have been seen. In states PERCENT, IGNORE, KEY, 477114402Sru // MIDDLE space_count must be 0. 478114402Sru int space_count = 0; 479114402Sru int byte_count = 0; // bytes read 480114402Sru int key_len = 0; 481114402Sru int ref_start = -1; // position of start of current reference 482114402Sru for (;;) { 483114402Sru int c = getc(fp); 484114402Sru if (c == EOF) 485114402Sru break; 486114402Sru // We opened the file in binary mode, so we need to skip 487114402Sru // every CR character before a Newline. 488114402Sru if (c == '\r') { 489114402Sru int peek = getc(fp); 490114402Sru if (peek == '\n') { 491114402Sru byte_count++; 492114402Sru c = peek; 493114402Sru } 494114402Sru else 495114402Sru ungetc(peek, fp); 496114402Sru } 497114402Sru#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__) 498114402Sru else if (c == 0x1a) // ^Z means EOF in text files 499114402Sru break; 500114402Sru#endif 501114402Sru byte_count++; 502114402Sru switch (state) { 503114402Sru case START: 504114402Sru if (c == ' ' || c == '\t') { 505114402Sru space_count++; 506114402Sru break; 507114402Sru } 508114402Sru if (c == '\n') { 509114402Sru space_count = 0; 510114402Sru break; 511114402Sru } 512114402Sru ref_start = byte_count - space_count - 1; 513114402Sru space_count = 0; 514114402Sru if (c == '%') 515114402Sru state = PERCENT; 516114402Sru else if (csalnum(c)) { 517114402Sru state = KEY; 518114402Sru key_buffer[0] = c; 519114402Sru key_len = 1; 520114402Sru } 521114402Sru else 522114402Sru state = MIDDLE; 523114402Sru break; 524114402Sru case BOL: 525114402Sru switch (c) { 526114402Sru case '%': 527114402Sru if (space_count > 0) { 528114402Sru space_count = 0; 529114402Sru state = MIDDLE; 530114402Sru } 531114402Sru else 532114402Sru state = PERCENT; 533114402Sru break; 534114402Sru case ' ': 535114402Sru case '\t': 536114402Sru space_count++; 537114402Sru break; 538114402Sru case '\n': 539114402Sru store_reference(filename_index, ref_start, 540114402Sru byte_count - 1 - space_count - ref_start); 541114402Sru state = START; 542114402Sru space_count = 0; 543114402Sru break; 544114402Sru default: 545114402Sru space_count = 0; 546114402Sru if (csalnum(c)) { 547114402Sru state = KEY; 548114402Sru key_buffer[0] = c; 549114402Sru key_len = 1; 550114402Sru } 551114402Sru else 552114402Sru state = MIDDLE; 553114402Sru } 554114402Sru break; 555114402Sru case PERCENT: 556114402Sru if (strchr(ignore_fields, c) != 0) 557114402Sru state = IGNORE; 558114402Sru else if (c == '\n') 559114402Sru state = BOL; 560114402Sru else 561114402Sru state = MIDDLE; 562114402Sru break; 563114402Sru case IGNORE: 564114402Sru if (c == '\n') 565114402Sru state = IGNORE_BOL; 566114402Sru break; 567114402Sru case IGNORE_BOL: 568114402Sru switch (c) { 569114402Sru case '%': 570114402Sru if (space_count > 0) { 571114402Sru state = IGNORE; 572114402Sru space_count = 0; 573114402Sru } 574114402Sru else 575114402Sru state = PERCENT; 576114402Sru break; 577114402Sru case ' ': 578114402Sru case '\t': 579114402Sru space_count++; 580114402Sru break; 581114402Sru case '\n': 582114402Sru store_reference(filename_index, ref_start, 583114402Sru byte_count - 1 - space_count - ref_start); 584114402Sru state = START; 585114402Sru space_count = 0; 586114402Sru break; 587114402Sru default: 588114402Sru space_count = 0; 589114402Sru state = IGNORE; 590114402Sru } 591114402Sru break; 592114402Sru case KEY: 593114402Sru if (csalnum(c)) { 594114402Sru if (key_len < truncate_len) 595114402Sru key_buffer[key_len++] = c; 596114402Sru else 597114402Sru state = DISCARD; 598114402Sru } 599114402Sru else { 600114402Sru possibly_store_key(key_buffer, key_len); 601114402Sru key_len = 0; 602114402Sru if (c == '\n') 603114402Sru state = BOL; 604114402Sru else 605114402Sru state = MIDDLE; 606114402Sru } 607114402Sru break; 608114402Sru case DISCARD: 609114402Sru if (!csalnum(c)) { 610114402Sru possibly_store_key(key_buffer, key_len); 611114402Sru key_len = 0; 612114402Sru if (c == '\n') 613114402Sru state = BOL; 614114402Sru else 615114402Sru state = MIDDLE; 616114402Sru } 617114402Sru break; 618114402Sru case MIDDLE: 619114402Sru if (csalnum(c)) { 620114402Sru state = KEY; 621114402Sru key_buffer[0] = c; 622114402Sru key_len = 1; 623114402Sru } 624114402Sru else if (c == '\n') 625114402Sru state = BOL; 626114402Sru break; 627114402Sru default: 628114402Sru assert(0); 629114402Sru } 630114402Sru } 631114402Sru switch (state) { 632114402Sru case START: 633114402Sru break; 634114402Sru case DISCARD: 635114402Sru case KEY: 636114402Sru possibly_store_key(key_buffer, key_len); 637114402Sru // fall through 638114402Sru case BOL: 639114402Sru case PERCENT: 640114402Sru case IGNORE_BOL: 641114402Sru case IGNORE: 642114402Sru case MIDDLE: 643114402Sru store_reference(filename_index, ref_start, 644114402Sru byte_count - ref_start - space_count); 645114402Sru break; 646114402Sru default: 647114402Sru assert(0); 648114402Sru } 649114402Sru fclose(fp); 650114402Sru return 1; 651114402Sru} 652114402Sru 653114402Srustatic void store_reference(int filename_index, int pos, int len) 654114402Sru{ 655114402Sru tag t; 656114402Sru t.filename_index = filename_index; 657114402Sru t.start = pos; 658114402Sru t.length = len; 659114402Sru fwrite_or_die(&t, sizeof(t), 1, indxfp); 660114402Sru ntags++; 661114402Sru} 662114402Sru 663114402Srustatic void store_filename(const char *fn) 664114402Sru{ 665114402Sru filenames += fn; 666114402Sru filenames += '\0'; 667114402Sru} 668114402Sru 669114402Srustatic void init_hash_table() 670114402Sru{ 671114402Sru hash_table = new table_entry[hash_table_size]; 672114402Sru for (int i = 0; i < hash_table_size; i++) 673114402Sru hash_table[i].ptr = 0; 674114402Sru} 675114402Sru 676114402Srustatic void possibly_store_key(char *s, int len) 677114402Sru{ 678114402Sru static int last_tagno = -1; 679114402Sru static int key_count; 680114402Sru if (last_tagno != ntags) { 681114402Sru last_tagno = ntags; 682114402Sru key_count = 0; 683114402Sru } 684114402Sru if (key_count < max_keys_per_item) { 685114402Sru if (store_key(s, len)) 686114402Sru key_count++; 687114402Sru } 688114402Sru} 689114402Sru 690114402Srustatic int store_key(char *s, int len) 691114402Sru{ 692114402Sru if (len < shortest_len) 693114402Sru return 0; 694114402Sru int is_number = 1; 695114402Sru for (int i = 0; i < len; i++) 696114402Sru if (!csdigit(s[i])) { 697114402Sru is_number = 0; 698114402Sru s[i] = cmlower(s[i]); 699114402Sru } 700114402Sru if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9')) 701114402Sru return 0; 702114402Sru int h = hash(s, len) % hash_table_size; 703114402Sru if (common_words_table) { 704114402Sru for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next) 705114402Sru if (len == ptr->len && memcmp(s, ptr->str, len) == 0) 706114402Sru return 0; 707114402Sru } 708114402Sru table_entry *pp = hash_table + h; 709114402Sru if (!pp->ptr) 710114402Sru pp->ptr = new block; 711114402Sru else if (pp->ptr->v[pp->ptr->used - 1] == ntags) 712114402Sru return 1; 713114402Sru else if (pp->ptr->used >= BLOCK_SIZE) 714114402Sru pp->ptr = new block(pp->ptr); 715114402Sru pp->ptr->v[(pp->ptr->used)++] = ntags; 716114402Sru return 1; 717114402Sru} 718114402Sru 719114402Srustatic void write_hash_table() 720114402Sru{ 721114402Sru const int minus_one = -1; 722114402Sru int li = 0; 723114402Sru for (int i = 0; i < hash_table_size; i++) { 724114402Sru block *ptr = hash_table[i].ptr; 725114402Sru if (!ptr) 726114402Sru hash_table[i].count = -1; 727114402Sru else { 728114402Sru hash_table[i].count = li; 729114402Sru block *rev = 0; 730114402Sru while (ptr) { 731114402Sru block *tem = ptr; 732114402Sru ptr = ptr->next; 733114402Sru tem->next = rev; 734114402Sru rev = tem; 735114402Sru } 736114402Sru while (rev) { 737114402Sru fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp); 738114402Sru li += rev->used; 739114402Sru block *tem = rev; 740114402Sru rev = rev->next; 741114402Sru delete tem; 742114402Sru } 743114402Sru fwrite_or_die(&minus_one, sizeof(int), 1, indxfp); 744114402Sru li += 1; 745114402Sru } 746114402Sru } 747114402Sru if (sizeof(table_entry) == sizeof(int)) 748114402Sru fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp); 749114402Sru else { 750114402Sru // write it out word by word 751114402Sru for (int i = 0; i < hash_table_size; i++) 752114402Sru fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp); 753114402Sru } 754114402Sru fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp); 755114402Sru if (fseek(indxfp, 0, 0) < 0) 756114402Sru fatal("error seeking on index file: %1", strerror(errno)); 757114402Sru index_header h; 758114402Sru h.magic = INDEX_MAGIC; 759114402Sru h.version = INDEX_VERSION; 760114402Sru h.tags_size = ntags; 761114402Sru h.lists_size = li; 762114402Sru h.table_size = hash_table_size; 763114402Sru h.strings_size = filenames.length(); 764114402Sru h.truncate = truncate_len; 765114402Sru h.shortest = shortest_len; 766114402Sru h.common = n_ignore_words; 767114402Sru fwrite_or_die(&h, sizeof(h), 1, indxfp); 768114402Sru} 769114402Sru 770114402Srustatic void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp) 771114402Sru{ 772114402Sru if (fwrite(ptr, size, nitems, fp) != (size_t)nitems) 773114402Sru fatal("fwrite failed: %1", strerror(errno)); 774114402Sru} 775114402Sru 776114402Sruvoid fatal_error_exit() 777114402Sru{ 778114402Sru cleanup(); 779114402Sru exit(3); 780114402Sru} 781114402Sru 782114402Sruextern "C" { 783114402Sru 784114402Sruvoid cleanup() 785114402Sru{ 786114402Sru if (temp_index_file) 787114402Sru unlink(temp_index_file); 788114402Sru} 789114402Sru 790114402Sru} 791