1114402Sru// -*- C++ -*-
2151497Sru/* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004
3114402Sru   Free Software Foundation, Inc.
4114402Sru     Written by James Clark (jjc@jclark.com)
5114402Sru
6114402SruThis file is part of groff.
7114402Sru
8114402Srugroff is free software; you can redistribute it and/or modify it under
9114402Sruthe terms of the GNU General Public License as published by the Free
10114402SruSoftware Foundation; either version 2, or (at your option) any later
11114402Sruversion.
12114402Sru
13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY
14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or
15114402SruFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16114402Srufor more details.
17114402Sru
18114402SruYou should have received a copy of the GNU General Public License along
19114402Sruwith groff; see the file COPYING.  If not, write to the Free Software
20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21114402Sru
22114402Sru#include "lib.h"
23114402Sru
24114402Sru#include <stdlib.h>
25114402Sru#include <assert.h>
26114402Sru#include <errno.h>
27114402Sru
28114402Sru#include "posix.h"
29114402Sru#include "errarg.h"
30114402Sru#include "error.h"
31114402Sru#include "stringclass.h"
32114402Sru#include "cset.h"
33114402Sru#include "cmap.h"
34114402Sru
35114402Sru#include "defs.h"
36114402Sru#include "index.h"
37114402Sru
38114402Sru#include "nonposix.h"
39114402Sru
40114402Sruextern "C" const char *Version_string;
41114402Sru
42114402Sru#define DEFAULT_HASH_TABLE_SIZE 997
43114402Sru#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
44114402Sru
45114402Sru// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
46114402Sru
47114402Sru#define MALLOC_OVERHEAD 16
48114402Sru
49114402Sru#ifdef BLOCK_SIZE
50114402Sru#undef BLOCK_SIZE
51114402Sru#endif
52114402Sru
53114402Sruconst int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
54114402Sru			 - sizeof(int)) / sizeof(int));
55114402Srustruct block {
56114402Sru  block *next;
57114402Sru  int used;
58114402Sru  int v[BLOCK_SIZE];
59114402Sru
60114402Sru  block(block *p = 0) : next(p), used(0) { }
61114402Sru};
62114402Sru
63114402Srustruct block;
64114402Sru
65114402Sruunion table_entry {
66114402Sru  block *ptr;
67114402Sru  int count;
68114402Sru};
69114402Sru
// Node of a chain of common words sharing one hash bucket.  The word is
// stored as LEN bytes without a terminating NUL.
struct word_list {
  word_list *next;      // next word in the same bucket, or 0
  char *str;            // private copy of the word (not NUL-terminated)
  int len;              // number of bytes in str

  // Copy the N bytes at S and link the new node in front of P.
  word_list(const char *s, int n, word_list *p);
};
76114402Sru
77114402Srutable_entry *hash_table;
78114402Sruint hash_table_size = DEFAULT_HASH_TABLE_SIZE;
79114402Sru// We make this the same size as hash_table so we only have to do one
80114402Sru// mod per key.
81114402Srustatic word_list **common_words_table = 0;
82114402Sruchar *key_buffer;
83114402Sru
84114402SruFILE *indxfp;
85114402Sruint ntags = 0;
86114402Srustring filenames;
87114402Sruchar *temp_index_file = 0;
88114402Sru
89114402Sruconst char *ignore_fields = "XYZ";
90114402Sruconst char *common_words_file = COMMON_WORDS_FILE;
91114402Sruint n_ignore_words = 100;
92114402Sruint truncate_len = 6;
93114402Sruint shortest_len = 3;
94114402Sruint max_keys_per_item = 100;
95114402Sru
// Command-line helpers.
static void usage(FILE *stream);
static void check_integer_arg(char opt, const char *arg, int min, int *res);
static char *get_cwd();

// Table set-up and output.
static void init_hash_table();
static void read_common_words_file();
static void write_hash_table();
static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);

// Input parsing and key/tag recording.
static int do_file(const char *filename);
static int do_whole_file(const char *filename);
static int store_key(char *s, int len);
static void possibly_store_key(char *s, int len);
static void store_reference(int filename_index, int pos, int len);
static void store_filename(const char *);

// Signal handling and temporary-file removal, with C linkage.
extern "C" {
  void cleanup();
  void catch_fatal_signals();
  void ignore_fatal_signals();
}
115114402Sru
116114402Sruint main(int argc, char **argv)
117114402Sru{
118114402Sru  program_name = argv[0];
119114402Sru  static char stderr_buf[BUFSIZ];
120114402Sru  setbuf(stderr, stderr_buf);
121114402Sru
122151497Sru  const char *base_name = 0;
123114402Sru  typedef int (*parser_t)(const char *);
124114402Sru  parser_t parser = do_file;
125114402Sru  const char *directory = 0;
126114402Sru  const char *foption = 0;
127114402Sru  int opt;
128114402Sru  static const struct option long_options[] = {
129114402Sru    { "help", no_argument, 0, CHAR_MAX + 1 },
130114402Sru    { "version", no_argument, 0, 'v' },
131114402Sru    { NULL, 0, 0, 0 }
132114402Sru  };
133114402Sru  while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
134114402Sru			    long_options, NULL))
135114402Sru	 != EOF)
136114402Sru    switch (opt) {
137114402Sru    case 'c':
138114402Sru      common_words_file = optarg;
139114402Sru      break;
140114402Sru    case 'd':
141114402Sru      directory = optarg;
142114402Sru      break;
143114402Sru    case 'f':
144114402Sru      foption = optarg;
145114402Sru      break;
146114402Sru    case 'h':
147114402Sru      check_integer_arg('h', optarg, 1, &hash_table_size);
148114402Sru      if (!is_prime(hash_table_size)) {
149114402Sru	while (!is_prime(++hash_table_size))
150114402Sru	  ;
151114402Sru	warning("%1 not prime: using %2 instead", optarg, hash_table_size);
152114402Sru      }
153114402Sru      break;
154114402Sru    case 'i':
155114402Sru      ignore_fields = optarg;
156114402Sru      break;
157114402Sru    case 'k':
158114402Sru      check_integer_arg('k', optarg, 1, &max_keys_per_item);
159114402Sru      break;
160114402Sru    case 'l':
161114402Sru      check_integer_arg('l', optarg, 0, &shortest_len);
162114402Sru      break;
163114402Sru    case 'n':
164114402Sru      check_integer_arg('n', optarg, 0, &n_ignore_words);
165114402Sru      break;
166114402Sru    case 'o':
167151497Sru      base_name = optarg;
168114402Sru      break;
169114402Sru    case 't':
170114402Sru      check_integer_arg('t', optarg, 1, &truncate_len);
171114402Sru      break;
172114402Sru    case 'w':
173114402Sru      parser = do_whole_file;
174114402Sru      break;
175114402Sru    case 'v':
176114402Sru      printf("GNU indxbib (groff) version %s\n", Version_string);
177114402Sru      exit(0);
178114402Sru      break;
179114402Sru    case CHAR_MAX + 1: // --help
180114402Sru      usage(stdout);
181114402Sru      exit(0);
182114402Sru      break;
183114402Sru    case '?':
184114402Sru      usage(stderr);
185114402Sru      exit(1);
186114402Sru      break;
187114402Sru    default:
188114402Sru      assert(0);
189114402Sru      break;
190114402Sru    }
191114402Sru  if (optind >= argc && foption == 0)
192114402Sru    fatal("no files and no -f option");
193114402Sru  if (!directory) {
194114402Sru    char *path = get_cwd();
195114402Sru    store_filename(path);
196114402Sru    a_delete path;
197114402Sru  }
198114402Sru  else
199114402Sru    store_filename(directory);
200114402Sru  init_hash_table();
201114402Sru  store_filename(common_words_file);
202114402Sru  store_filename(ignore_fields);
203114402Sru  key_buffer = new char[truncate_len];
204114402Sru  read_common_words_file();
205151497Sru  if (!base_name)
206151497Sru    base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
207151497Sru  const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
208114402Sru  const char *sep = &DIR_SEPS[1];
209114402Sru  while (*sep) {
210151497Sru    p1 = strrchr(base_name, *sep);
211114402Sru    if (p1 && (!p || p1 > p))
212114402Sru      p = p1;
213114402Sru    sep++;
214114402Sru  }
215114402Sru  size_t name_max;
216114402Sru  if (p) {
217151497Sru    char *dir = strsave(base_name);
218151497Sru    dir[p - base_name] = '\0';
219114402Sru    name_max = file_name_max(dir);
220114402Sru    a_delete dir;
221114402Sru  }
222114402Sru  else
223114402Sru    name_max = file_name_max(".");
224151497Sru  const char *filename = p ? p + 1 : base_name;
225114402Sru  if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
226114402Sru    fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
227114402Sru  if (p) {
228114402Sru    p++;
229151497Sru    temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
230151497Sru    memcpy(temp_index_file, base_name, p - base_name);
231151497Sru    strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
232114402Sru  }
233114402Sru  else {
234114402Sru    temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
235114402Sru  }
236114402Sru  catch_fatal_signals();
237114402Sru  int fd = mkstemp(temp_index_file);
238114402Sru  if (fd < 0)
239114402Sru    fatal("can't create temporary index file: %1", strerror(errno));
240114402Sru  indxfp = fdopen(fd, FOPEN_WB);
241114402Sru  if (indxfp == 0)
242114402Sru    fatal("fdopen failed");
243114402Sru  if (fseek(indxfp, sizeof(index_header), 0) < 0)
244114402Sru    fatal("can't seek past index header: %1", strerror(errno));
245114402Sru  int failed = 0;
246114402Sru  if (foption) {
247114402Sru    FILE *fp = stdin;
248114402Sru    if (strcmp(foption, "-") != 0) {
249114402Sru      errno = 0;
250114402Sru      fp = fopen(foption, "r");
251114402Sru      if (!fp)
252114402Sru	fatal("can't open `%1': %2", foption, strerror(errno));
253114402Sru    }
254114402Sru    string path;
255114402Sru    int lineno = 1;
256114402Sru    for (;;) {
257114402Sru      int c;
258114402Sru      for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
259114402Sru	if (c == '\0')
260114402Sru	  error_with_file_and_line(foption, lineno,
261114402Sru				   "nul character in pathname ignored");
262114402Sru	else
263114402Sru	  path += c;
264114402Sru      }
265114402Sru      if (path.length() > 0) {
266114402Sru	path += '\0';
267114402Sru	if (!(*parser)(path.contents()))
268114402Sru	  failed = 1;
269114402Sru	path.clear();
270114402Sru      }
271114402Sru      if (c == EOF)
272114402Sru	break;
273114402Sru      lineno++;
274114402Sru    }
275114402Sru    if (fp != stdin)
276114402Sru      fclose(fp);
277114402Sru  }
278114402Sru  for (int i = optind; i < argc; i++)
279114402Sru    if (!(*parser)(argv[i]))
280114402Sru      failed = 1;
281114402Sru  write_hash_table();
282114402Sru  if (fclose(indxfp) < 0)
283114402Sru    fatal("error closing temporary index file: %1", strerror(errno));
284151497Sru  char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
285151497Sru  strcpy(index_file, base_name);
286114402Sru  strcat(index_file, INDEX_SUFFIX);
287114402Sru#ifdef HAVE_RENAME
288114402Sru#ifdef __EMX__
289114402Sru  if (access(index_file, R_OK) == 0)
290114402Sru    unlink(index_file);
291114402Sru#endif /* __EMX__ */
292114402Sru  if (rename(temp_index_file, index_file) < 0) {
293114402Sru#ifdef __MSDOS__
294114402Sru    // RENAME could fail on plain MSDOS filesystems because
295114402Sru    // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
296151497Sru    char *fname = p ? index_file + (p - base_name) : 0;
297114402Sru    char *dot = 0;
298114402Sru
299114402Sru    // Replace the dot with an underscore and try again.
300114402Sru    if (fname
301114402Sru        && (dot = strchr(fname, '.')) != 0
302114402Sru        && strcmp(dot, INDEX_SUFFIX) != 0)
303114402Sru      *dot = '_';
304114402Sru    if (rename(temp_index_file, index_file) < 0)
305114402Sru#endif
306114402Sru    fatal("can't rename temporary index file: %1", strerror(errno));
307114402Sru  }
308114402Sru#else /* not HAVE_RENAME */
309114402Sru  ignore_fatal_signals();
310114402Sru  if (unlink(index_file) < 0) {
311114402Sru    if (errno != ENOENT)
312114402Sru      fatal("can't unlink `%1': %2", index_file, strerror(errno));
313114402Sru  }
314114402Sru  if (link(temp_index_file, index_file) < 0)
315114402Sru    fatal("can't link temporary index file: %1", strerror(errno));
316114402Sru  if (unlink(temp_index_file) < 0)
317114402Sru    fatal("can't unlink temporary index file: %1", strerror(errno));
318114402Sru#endif /* not HAVE_RENAME */
319114402Sru  temp_index_file = 0;
320114402Sru  return failed;
321114402Sru}
322114402Sru
323114402Srustatic void usage(FILE *stream)
324114402Sru{
325114402Sru  fprintf(stream,
326114402Sru"usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
327114402Sru"       [-l n] [-n n] [-o base] [-t n] [files...]\n",
328114402Sru	  program_name);
329114402Sru}
330114402Sru
331114402Srustatic void check_integer_arg(char opt, const char *arg, int min, int *res)
332114402Sru{
333114402Sru  char *ptr;
334114402Sru  long n = strtol(arg, &ptr, 10);
335114402Sru  if (n == 0 && ptr == arg)
336114402Sru    error("argument to -%1 not an integer", opt);
337114402Sru  else if (n < min)
338114402Sru    error("argument to -%1 must not be less than %2", opt, min);
339114402Sru  else {
340114402Sru    if (n > INT_MAX)
341114402Sru      error("argument to -%1 greater than maximum integer", opt);
342114402Sru    else if (*ptr != '\0')
343114402Sru      error("junk after integer argument to -%1", opt);
344114402Sru    *res = int(n);
345114402Sru  }
346114402Sru}
347114402Sru
348114402Srustatic char *get_cwd()
349114402Sru{
350114402Sru  char *buf;
351114402Sru  int size = 12;
352114402Sru
353114402Sru  for (;;) {
354114402Sru    buf = new char[size];
355114402Sru    if (getcwd(buf, size))
356114402Sru      break;
357114402Sru    if (errno != ERANGE)
358114402Sru      fatal("cannot get current working directory: %1", strerror(errno));
359114402Sru    a_delete buf;
360114402Sru    if (size == INT_MAX)
361114402Sru      fatal("current working directory longer than INT_MAX");
362114402Sru    if (size > INT_MAX/2)
363114402Sru      size = INT_MAX;
364114402Sru    else
365114402Sru      size *= 2;
366114402Sru  }
367114402Sru  return buf;
368114402Sru}
369114402Sru
370114402Sruword_list::word_list(const char *s, int n, word_list *p)
371114402Sru: next(p), len(n)
372114402Sru{
373114402Sru  str = new char[n];
374114402Sru  memcpy(str, s, n);
375114402Sru}
376114402Sru
377114402Srustatic void read_common_words_file()
378114402Sru{
379114402Sru  if (n_ignore_words <= 0)
380114402Sru    return;
381114402Sru  errno = 0;
382114402Sru  FILE *fp = fopen(common_words_file, "r");
383114402Sru  if (!fp)
384114402Sru    fatal("can't open `%1': %2", common_words_file, strerror(errno));
385114402Sru  common_words_table = new word_list * [hash_table_size];
386114402Sru  for (int i = 0; i < hash_table_size; i++)
387114402Sru    common_words_table[i] = 0;
388114402Sru  int count = 0;
389114402Sru  int key_len = 0;
390114402Sru  for (;;) {
391114402Sru    int c = getc(fp);
392114402Sru    while (c != EOF && !csalnum(c))
393114402Sru      c = getc(fp);
394114402Sru    if (c == EOF)
395114402Sru      break;
396114402Sru    do {
397114402Sru      if (key_len < truncate_len)
398114402Sru	key_buffer[key_len++] = cmlower(c);
399114402Sru      c = getc(fp);
400114402Sru    } while (c != EOF && csalnum(c));
401114402Sru    if (key_len >= shortest_len) {
402114402Sru      int h = hash(key_buffer, key_len) % hash_table_size;
403114402Sru      common_words_table[h] = new word_list(key_buffer, key_len,
404114402Sru					    common_words_table[h]);
405114402Sru    }
406114402Sru    if (++count >= n_ignore_words)
407114402Sru      break;
408114402Sru    key_len = 0;
409114402Sru    if (c == EOF)
410114402Sru      break;
411114402Sru  }
412114402Sru  n_ignore_words = count;
413114402Sru  fclose(fp);
414114402Sru}
415114402Sru
416114402Srustatic int do_whole_file(const char *filename)
417114402Sru{
418114402Sru  errno = 0;
419114402Sru  FILE *fp = fopen(filename, "r");
420114402Sru  if (!fp) {
421114402Sru    error("can't open `%1': %2", filename, strerror(errno));
422114402Sru    return 0;
423114402Sru  }
424114402Sru  int count = 0;
425114402Sru  int key_len = 0;
426114402Sru  int c;
427114402Sru  while ((c = getc(fp)) != EOF) {
428114402Sru    if (csalnum(c)) {
429114402Sru      key_len = 1;
430114402Sru      key_buffer[0] = c;
431114402Sru      while ((c = getc(fp)) != EOF) {
432114402Sru	if (!csalnum(c))
433114402Sru	  break;
434114402Sru	if (key_len < truncate_len)
435114402Sru	  key_buffer[key_len++] = c;
436114402Sru      }
437114402Sru      if (store_key(key_buffer, key_len)) {
438114402Sru	if (++count >= max_keys_per_item)
439114402Sru	  break;
440114402Sru      }
441114402Sru      if (c == EOF)
442114402Sru	break;
443114402Sru    }
444114402Sru  }
445114402Sru  store_reference(filenames.length(), 0, 0);
446114402Sru  store_filename(filename);
447114402Sru  fclose(fp);
448114402Sru  return 1;
449114402Sru}
450114402Sru
451114402Srustatic int do_file(const char *filename)
452114402Sru{
453114402Sru  errno = 0;
454114402Sru  // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
455114402Sru  // byte counts to be consistent with fseek.
456114402Sru  FILE *fp = fopen(filename, FOPEN_RB);
457114402Sru  if (fp == 0) {
458114402Sru    error("can't open `%1': %2", filename, strerror(errno));
459114402Sru    return 0;
460114402Sru  }
461114402Sru  int filename_index = filenames.length();
462114402Sru  store_filename(filename);
463114402Sru
464114402Sru  enum {
465114402Sru    START,	// at the start of the file; also in between references
466114402Sru    BOL,	// in the middle of a reference, at the beginning of the line
467114402Sru    PERCENT,	// seen a percent at the beginning of the line
468114402Sru    IGNORE,	// ignoring a field
469114402Sru    IGNORE_BOL,	// at the beginning of a line ignoring a field
470114402Sru    KEY,	// in the middle of a key
471114402Sru    DISCARD,	// after truncate_len bytes of a key
472114402Sru    MIDDLE	// in between keys
473114402Sru  } state = START;
474114402Sru
475114402Sru  // In states START, BOL, IGNORE_BOL, space_count how many spaces at
476114402Sru  // the beginning have been seen.  In states PERCENT, IGNORE, KEY,
477114402Sru  // MIDDLE space_count must be 0.
478114402Sru  int space_count = 0;
479114402Sru  int byte_count = 0;		// bytes read
480114402Sru  int key_len = 0;
481114402Sru  int ref_start = -1;		// position of start of current reference
482114402Sru  for (;;) {
483114402Sru    int c = getc(fp);
484114402Sru    if (c == EOF)
485114402Sru      break;
486114402Sru    // We opened the file in binary mode, so we need to skip
487114402Sru    // every CR character before a Newline.
488114402Sru    if (c == '\r') {
489114402Sru      int peek = getc(fp);
490114402Sru      if (peek == '\n') {
491114402Sru	byte_count++;
492114402Sru	c = peek;
493114402Sru      }
494114402Sru      else
495114402Sru	ungetc(peek, fp);
496114402Sru    }
497114402Sru#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
498114402Sru    else if (c == 0x1a)	// ^Z means EOF in text files
499114402Sru      break;
500114402Sru#endif
501114402Sru    byte_count++;
502114402Sru    switch (state) {
503114402Sru    case START:
504114402Sru      if (c == ' ' || c == '\t') {
505114402Sru	space_count++;
506114402Sru	break;
507114402Sru      }
508114402Sru      if (c == '\n') {
509114402Sru	space_count = 0;
510114402Sru	break;
511114402Sru      }
512114402Sru      ref_start = byte_count - space_count - 1;
513114402Sru      space_count = 0;
514114402Sru      if (c == '%')
515114402Sru	state = PERCENT;
516114402Sru      else if (csalnum(c)) {
517114402Sru	state = KEY;
518114402Sru	key_buffer[0] = c;
519114402Sru	key_len = 1;
520114402Sru      }
521114402Sru      else
522114402Sru	state = MIDDLE;
523114402Sru      break;
524114402Sru    case BOL:
525114402Sru      switch (c) {
526114402Sru      case '%':
527114402Sru	if (space_count > 0) {
528114402Sru	  space_count = 0;
529114402Sru	  state = MIDDLE;
530114402Sru	}
531114402Sru	else
532114402Sru	  state = PERCENT;
533114402Sru	break;
534114402Sru      case ' ':
535114402Sru      case '\t':
536114402Sru	space_count++;
537114402Sru	break;
538114402Sru      case '\n':
539114402Sru	store_reference(filename_index, ref_start,
540114402Sru			byte_count - 1 - space_count - ref_start);
541114402Sru	state = START;
542114402Sru	space_count = 0;
543114402Sru	break;
544114402Sru      default:
545114402Sru	space_count = 0;
546114402Sru	if (csalnum(c)) {
547114402Sru	  state = KEY;
548114402Sru	  key_buffer[0] = c;
549114402Sru	  key_len = 1;
550114402Sru	}
551114402Sru	else
552114402Sru	  state = MIDDLE;
553114402Sru      }
554114402Sru      break;
555114402Sru    case PERCENT:
556114402Sru      if (strchr(ignore_fields, c) != 0)
557114402Sru	state = IGNORE;
558114402Sru      else if (c == '\n')
559114402Sru	state = BOL;
560114402Sru      else
561114402Sru	state = MIDDLE;
562114402Sru      break;
563114402Sru    case IGNORE:
564114402Sru      if (c == '\n')
565114402Sru	state = IGNORE_BOL;
566114402Sru      break;
567114402Sru    case IGNORE_BOL:
568114402Sru      switch (c) {
569114402Sru      case '%':
570114402Sru	if (space_count > 0) {
571114402Sru	  state = IGNORE;
572114402Sru	  space_count = 0;
573114402Sru	}
574114402Sru	else
575114402Sru	  state = PERCENT;
576114402Sru	break;
577114402Sru      case ' ':
578114402Sru      case '\t':
579114402Sru	space_count++;
580114402Sru	break;
581114402Sru      case '\n':
582114402Sru	store_reference(filename_index, ref_start,
583114402Sru			byte_count - 1 - space_count - ref_start);
584114402Sru	state = START;
585114402Sru	space_count = 0;
586114402Sru	break;
587114402Sru      default:
588114402Sru	space_count = 0;
589114402Sru	state = IGNORE;
590114402Sru      }
591114402Sru      break;
592114402Sru    case KEY:
593114402Sru      if (csalnum(c)) {
594114402Sru	if (key_len < truncate_len)
595114402Sru	  key_buffer[key_len++] = c;
596114402Sru	else
597114402Sru	  state = DISCARD;
598114402Sru      }
599114402Sru      else {
600114402Sru	possibly_store_key(key_buffer, key_len);
601114402Sru	key_len = 0;
602114402Sru	if (c == '\n')
603114402Sru	  state = BOL;
604114402Sru	else
605114402Sru	  state = MIDDLE;
606114402Sru      }
607114402Sru      break;
608114402Sru    case DISCARD:
609114402Sru      if (!csalnum(c)) {
610114402Sru	possibly_store_key(key_buffer, key_len);
611114402Sru	key_len = 0;
612114402Sru	if (c == '\n')
613114402Sru	  state = BOL;
614114402Sru	else
615114402Sru	  state = MIDDLE;
616114402Sru      }
617114402Sru      break;
618114402Sru    case MIDDLE:
619114402Sru      if (csalnum(c)) {
620114402Sru	state = KEY;
621114402Sru	key_buffer[0] = c;
622114402Sru	key_len = 1;
623114402Sru      }
624114402Sru      else if (c == '\n')
625114402Sru	state = BOL;
626114402Sru      break;
627114402Sru    default:
628114402Sru      assert(0);
629114402Sru    }
630114402Sru  }
631114402Sru  switch (state) {
632114402Sru  case START:
633114402Sru    break;
634114402Sru  case DISCARD:
635114402Sru  case KEY:
636114402Sru    possibly_store_key(key_buffer, key_len);
637114402Sru    // fall through
638114402Sru  case BOL:
639114402Sru  case PERCENT:
640114402Sru  case IGNORE_BOL:
641114402Sru  case IGNORE:
642114402Sru  case MIDDLE:
643114402Sru    store_reference(filename_index, ref_start,
644114402Sru		    byte_count - ref_start - space_count);
645114402Sru    break;
646114402Sru  default:
647114402Sru    assert(0);
648114402Sru  }
649114402Sru  fclose(fp);
650114402Sru  return 1;
651114402Sru}
652114402Sru
653114402Srustatic void store_reference(int filename_index, int pos, int len)
654114402Sru{
655114402Sru  tag t;
656114402Sru  t.filename_index = filename_index;
657114402Sru  t.start = pos;
658114402Sru  t.length = len;
659114402Sru  fwrite_or_die(&t, sizeof(t), 1, indxfp);
660114402Sru  ntags++;
661114402Sru}
662114402Sru
663114402Srustatic void store_filename(const char *fn)
664114402Sru{
665114402Sru  filenames += fn;
666114402Sru  filenames += '\0';
667114402Sru}
668114402Sru
669114402Srustatic void init_hash_table()
670114402Sru{
671114402Sru  hash_table = new table_entry[hash_table_size];
672114402Sru  for (int i = 0; i < hash_table_size; i++)
673114402Sru    hash_table[i].ptr = 0;
674114402Sru}
675114402Sru
676114402Srustatic void possibly_store_key(char *s, int len)
677114402Sru{
678114402Sru  static int last_tagno = -1;
679114402Sru  static int key_count;
680114402Sru  if (last_tagno != ntags) {
681114402Sru    last_tagno = ntags;
682114402Sru    key_count = 0;
683114402Sru  }
684114402Sru  if (key_count < max_keys_per_item) {
685114402Sru    if (store_key(s, len))
686114402Sru      key_count++;
687114402Sru  }
688114402Sru}
689114402Sru
690114402Srustatic int store_key(char *s, int len)
691114402Sru{
692114402Sru  if (len < shortest_len)
693114402Sru    return 0;
694114402Sru  int is_number = 1;
695114402Sru  for (int i = 0; i < len; i++)
696114402Sru    if (!csdigit(s[i])) {
697114402Sru      is_number = 0;
698114402Sru      s[i] = cmlower(s[i]);
699114402Sru    }
700114402Sru  if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
701114402Sru    return 0;
702114402Sru  int h = hash(s, len) % hash_table_size;
703114402Sru  if (common_words_table) {
704114402Sru    for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
705114402Sru      if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
706114402Sru	return 0;
707114402Sru  }
708114402Sru  table_entry *pp =  hash_table + h;
709114402Sru  if (!pp->ptr)
710114402Sru    pp->ptr = new block;
711114402Sru  else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
712114402Sru    return 1;
713114402Sru  else if (pp->ptr->used >= BLOCK_SIZE)
714114402Sru    pp->ptr = new block(pp->ptr);
715114402Sru  pp->ptr->v[(pp->ptr->used)++] = ntags;
716114402Sru  return 1;
717114402Sru}
718114402Sru
719114402Srustatic void write_hash_table()
720114402Sru{
721114402Sru  const int minus_one = -1;
722114402Sru  int li = 0;
723114402Sru  for (int i = 0; i < hash_table_size; i++) {
724114402Sru    block *ptr = hash_table[i].ptr;
725114402Sru    if (!ptr)
726114402Sru      hash_table[i].count = -1;
727114402Sru    else {
728114402Sru      hash_table[i].count = li;
729114402Sru      block *rev = 0;
730114402Sru      while (ptr) {
731114402Sru	block *tem = ptr;
732114402Sru	ptr = ptr->next;
733114402Sru	tem->next = rev;
734114402Sru	rev = tem;
735114402Sru      }
736114402Sru      while (rev) {
737114402Sru	fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
738114402Sru	li += rev->used;
739114402Sru	block *tem = rev;
740114402Sru	rev = rev->next;
741114402Sru	delete tem;
742114402Sru      }
743114402Sru      fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
744114402Sru      li += 1;
745114402Sru    }
746114402Sru  }
747114402Sru  if (sizeof(table_entry) == sizeof(int))
748114402Sru    fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
749114402Sru  else {
750114402Sru    // write it out word by word
751114402Sru    for (int i = 0; i < hash_table_size; i++)
752114402Sru      fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
753114402Sru  }
754114402Sru  fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
755114402Sru  if (fseek(indxfp, 0, 0) < 0)
756114402Sru    fatal("error seeking on index file: %1", strerror(errno));
757114402Sru  index_header h;
758114402Sru  h.magic = INDEX_MAGIC;
759114402Sru  h.version = INDEX_VERSION;
760114402Sru  h.tags_size = ntags;
761114402Sru  h.lists_size = li;
762114402Sru  h.table_size = hash_table_size;
763114402Sru  h.strings_size = filenames.length();
764114402Sru  h.truncate = truncate_len;
765114402Sru  h.shortest = shortest_len;
766114402Sru  h.common = n_ignore_words;
767114402Sru  fwrite_or_die(&h, sizeof(h), 1, indxfp);
768114402Sru}
769114402Sru
770114402Srustatic void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
771114402Sru{
772114402Sru  if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
773114402Sru    fatal("fwrite failed: %1", strerror(errno));
774114402Sru}
775114402Sru
776114402Sruvoid fatal_error_exit()
777114402Sru{
778114402Sru  cleanup();
779114402Sru  exit(3);
780114402Sru}
781114402Sru
782114402Sruextern "C" {
783114402Sru
784114402Sruvoid cleanup()
785114402Sru{
786114402Sru  if (temp_index_file)
787114402Sru    unlink(temp_index_file);
788114402Sru}
789114402Sru
790114402Sru}
791