1// -*- C++ -*-
2/* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004
3   Free Software Foundation, Inc.
4     Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License along
19with groff; see the file COPYING.  If not, write to the Free Software
20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22#include "lib.h"
23
24#include <stdlib.h>
25#include <assert.h>
26#include <errno.h>
27
28#include "posix.h"
29#include "errarg.h"
30#include "error.h"
31#include "stringclass.h"
32#include "cset.h"
33#include "cmap.h"
34
35#include "defs.h"
36#include "index.h"
37
38#include "nonposix.h"
39
40extern "C" const char *Version_string;
41
42#define DEFAULT_HASH_TABLE_SIZE 997
43#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
44
45// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
46
47#define MALLOC_OVERHEAD 16
48
49#ifdef BLOCK_SIZE
50#undef BLOCK_SIZE
51#endif
52
53const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
54			 - sizeof(int)) / sizeof(int));
55struct block {
56  block *next;
57  int used;
58  int v[BLOCK_SIZE];
59
60  block(block *p = 0) : next(p), used(0) { }
61};
62
63struct block;
64
65union table_entry {
66  block *ptr;
67  int count;
68};
69
70struct word_list {
71  word_list *next;
72  char *str;
73  int len;
74  word_list(const char *, int, word_list *);
75};
76
77table_entry *hash_table;
78int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
79// We make this the same size as hash_table so we only have to do one
80// mod per key.
81static word_list **common_words_table = 0;
82char *key_buffer;
83
84FILE *indxfp;
85int ntags = 0;
86string filenames;
87char *temp_index_file = 0;
88
89const char *ignore_fields = "XYZ";
90const char *common_words_file = COMMON_WORDS_FILE;
91int n_ignore_words = 100;
92int truncate_len = 6;
93int shortest_len = 3;
94int max_keys_per_item = 100;
95
96static void usage(FILE *stream);
97static void write_hash_table();
98static void init_hash_table();
99static void read_common_words_file();
100static int store_key(char *s, int len);
101static void possibly_store_key(char *s, int len);
102static int do_whole_file(const char *filename);
103static int do_file(const char *filename);
104static void store_reference(int filename_index, int pos, int len);
105static void check_integer_arg(char opt, const char *arg, int min, int *res);
106static void store_filename(const char *);
107static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
108static char *get_cwd();
109
110extern "C" {
111  void cleanup();
112  void catch_fatal_signals();
113  void ignore_fatal_signals();
114}
115
116int main(int argc, char **argv)
117{
118  program_name = argv[0];
119  static char stderr_buf[BUFSIZ];
120  setbuf(stderr, stderr_buf);
121
122  const char *base_name = 0;
123  typedef int (*parser_t)(const char *);
124  parser_t parser = do_file;
125  const char *directory = 0;
126  const char *foption = 0;
127  int opt;
128  static const struct option long_options[] = {
129    { "help", no_argument, 0, CHAR_MAX + 1 },
130    { "version", no_argument, 0, 'v' },
131    { NULL, 0, 0, 0 }
132  };
133  while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
134			    long_options, NULL))
135	 != EOF)
136    switch (opt) {
137    case 'c':
138      common_words_file = optarg;
139      break;
140    case 'd':
141      directory = optarg;
142      break;
143    case 'f':
144      foption = optarg;
145      break;
146    case 'h':
147      check_integer_arg('h', optarg, 1, &hash_table_size);
148      if (!is_prime(hash_table_size)) {
149	while (!is_prime(++hash_table_size))
150	  ;
151	warning("%1 not prime: using %2 instead", optarg, hash_table_size);
152      }
153      break;
154    case 'i':
155      ignore_fields = optarg;
156      break;
157    case 'k':
158      check_integer_arg('k', optarg, 1, &max_keys_per_item);
159      break;
160    case 'l':
161      check_integer_arg('l', optarg, 0, &shortest_len);
162      break;
163    case 'n':
164      check_integer_arg('n', optarg, 0, &n_ignore_words);
165      break;
166    case 'o':
167      base_name = optarg;
168      break;
169    case 't':
170      check_integer_arg('t', optarg, 1, &truncate_len);
171      break;
172    case 'w':
173      parser = do_whole_file;
174      break;
175    case 'v':
176      printf("GNU indxbib (groff) version %s\n", Version_string);
177      exit(0);
178      break;
179    case CHAR_MAX + 1: // --help
180      usage(stdout);
181      exit(0);
182      break;
183    case '?':
184      usage(stderr);
185      exit(1);
186      break;
187    default:
188      assert(0);
189      break;
190    }
191  if (optind >= argc && foption == 0)
192    fatal("no files and no -f option");
193  if (!directory) {
194    char *path = get_cwd();
195    store_filename(path);
196    a_delete path;
197  }
198  else
199    store_filename(directory);
200  init_hash_table();
201  store_filename(common_words_file);
202  store_filename(ignore_fields);
203  key_buffer = new char[truncate_len];
204  read_common_words_file();
205  if (!base_name)
206    base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
207  const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
208  const char *sep = &DIR_SEPS[1];
209  while (*sep) {
210    p1 = strrchr(base_name, *sep);
211    if (p1 && (!p || p1 > p))
212      p = p1;
213    sep++;
214  }
215  size_t name_max;
216  if (p) {
217    char *dir = strsave(base_name);
218    dir[p - base_name] = '\0';
219    name_max = file_name_max(dir);
220    a_delete dir;
221  }
222  else
223    name_max = file_name_max(".");
224  const char *filename = p ? p + 1 : base_name;
225  if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
226    fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
227  if (p) {
228    p++;
229    temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
230    memcpy(temp_index_file, base_name, p - base_name);
231    strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
232  }
233  else {
234    temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
235  }
236  catch_fatal_signals();
237  int fd = mkstemp(temp_index_file);
238  if (fd < 0)
239    fatal("can't create temporary index file: %1", strerror(errno));
240  indxfp = fdopen(fd, FOPEN_WB);
241  if (indxfp == 0)
242    fatal("fdopen failed");
243  if (fseek(indxfp, sizeof(index_header), 0) < 0)
244    fatal("can't seek past index header: %1", strerror(errno));
245  int failed = 0;
246  if (foption) {
247    FILE *fp = stdin;
248    if (strcmp(foption, "-") != 0) {
249      errno = 0;
250      fp = fopen(foption, "r");
251      if (!fp)
252	fatal("can't open `%1': %2", foption, strerror(errno));
253    }
254    string path;
255    int lineno = 1;
256    for (;;) {
257      int c;
258      for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
259	if (c == '\0')
260	  error_with_file_and_line(foption, lineno,
261				   "nul character in pathname ignored");
262	else
263	  path += c;
264      }
265      if (path.length() > 0) {
266	path += '\0';
267	if (!(*parser)(path.contents()))
268	  failed = 1;
269	path.clear();
270      }
271      if (c == EOF)
272	break;
273      lineno++;
274    }
275    if (fp != stdin)
276      fclose(fp);
277  }
278  for (int i = optind; i < argc; i++)
279    if (!(*parser)(argv[i]))
280      failed = 1;
281  write_hash_table();
282  if (fclose(indxfp) < 0)
283    fatal("error closing temporary index file: %1", strerror(errno));
284  char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
285  strcpy(index_file, base_name);
286  strcat(index_file, INDEX_SUFFIX);
287#ifdef HAVE_RENAME
288#ifdef __EMX__
289  if (access(index_file, R_OK) == 0)
290    unlink(index_file);
291#endif /* __EMX__ */
292  if (rename(temp_index_file, index_file) < 0) {
293#ifdef __MSDOS__
294    // RENAME could fail on plain MSDOS filesystems because
295    // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
296    char *fname = p ? index_file + (p - base_name) : 0;
297    char *dot = 0;
298
299    // Replace the dot with an underscore and try again.
300    if (fname
301        && (dot = strchr(fname, '.')) != 0
302        && strcmp(dot, INDEX_SUFFIX) != 0)
303      *dot = '_';
304    if (rename(temp_index_file, index_file) < 0)
305#endif
306    fatal("can't rename temporary index file: %1", strerror(errno));
307  }
308#else /* not HAVE_RENAME */
309  ignore_fatal_signals();
310  if (unlink(index_file) < 0) {
311    if (errno != ENOENT)
312      fatal("can't unlink `%1': %2", index_file, strerror(errno));
313  }
314  if (link(temp_index_file, index_file) < 0)
315    fatal("can't link temporary index file: %1", strerror(errno));
316  if (unlink(temp_index_file) < 0)
317    fatal("can't unlink temporary index file: %1", strerror(errno));
318#endif /* not HAVE_RENAME */
319  temp_index_file = 0;
320  return failed;
321}
322
323static void usage(FILE *stream)
324{
325  fprintf(stream,
326"usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
327"       [-l n] [-n n] [-o base] [-t n] [files...]\n",
328	  program_name);
329}
330
331static void check_integer_arg(char opt, const char *arg, int min, int *res)
332{
333  char *ptr;
334  long n = strtol(arg, &ptr, 10);
335  if (n == 0 && ptr == arg)
336    error("argument to -%1 not an integer", opt);
337  else if (n < min)
338    error("argument to -%1 must not be less than %2", opt, min);
339  else {
340    if (n > INT_MAX)
341      error("argument to -%1 greater than maximum integer", opt);
342    else if (*ptr != '\0')
343      error("junk after integer argument to -%1", opt);
344    *res = int(n);
345  }
346}
347
348static char *get_cwd()
349{
350  char *buf;
351  int size = 12;
352
353  for (;;) {
354    buf = new char[size];
355    if (getcwd(buf, size))
356      break;
357    if (errno != ERANGE)
358      fatal("cannot get current working directory: %1", strerror(errno));
359    a_delete buf;
360    if (size == INT_MAX)
361      fatal("current working directory longer than INT_MAX");
362    if (size > INT_MAX/2)
363      size = INT_MAX;
364    else
365      size *= 2;
366  }
367  return buf;
368}
369
370word_list::word_list(const char *s, int n, word_list *p)
371: next(p), len(n)
372{
373  str = new char[n];
374  memcpy(str, s, n);
375}
376
377static void read_common_words_file()
378{
379  if (n_ignore_words <= 0)
380    return;
381  errno = 0;
382  FILE *fp = fopen(common_words_file, "r");
383  if (!fp)
384    fatal("can't open `%1': %2", common_words_file, strerror(errno));
385  common_words_table = new word_list * [hash_table_size];
386  for (int i = 0; i < hash_table_size; i++)
387    common_words_table[i] = 0;
388  int count = 0;
389  int key_len = 0;
390  for (;;) {
391    int c = getc(fp);
392    while (c != EOF && !csalnum(c))
393      c = getc(fp);
394    if (c == EOF)
395      break;
396    do {
397      if (key_len < truncate_len)
398	key_buffer[key_len++] = cmlower(c);
399      c = getc(fp);
400    } while (c != EOF && csalnum(c));
401    if (key_len >= shortest_len) {
402      int h = hash(key_buffer, key_len) % hash_table_size;
403      common_words_table[h] = new word_list(key_buffer, key_len,
404					    common_words_table[h]);
405    }
406    if (++count >= n_ignore_words)
407      break;
408    key_len = 0;
409    if (c == EOF)
410      break;
411  }
412  n_ignore_words = count;
413  fclose(fp);
414}
415
416static int do_whole_file(const char *filename)
417{
418  errno = 0;
419  FILE *fp = fopen(filename, "r");
420  if (!fp) {
421    error("can't open `%1': %2", filename, strerror(errno));
422    return 0;
423  }
424  int count = 0;
425  int key_len = 0;
426  int c;
427  while ((c = getc(fp)) != EOF) {
428    if (csalnum(c)) {
429      key_len = 1;
430      key_buffer[0] = c;
431      while ((c = getc(fp)) != EOF) {
432	if (!csalnum(c))
433	  break;
434	if (key_len < truncate_len)
435	  key_buffer[key_len++] = c;
436      }
437      if (store_key(key_buffer, key_len)) {
438	if (++count >= max_keys_per_item)
439	  break;
440      }
441      if (c == EOF)
442	break;
443    }
444  }
445  store_reference(filenames.length(), 0, 0);
446  store_filename(filename);
447  fclose(fp);
448  return 1;
449}
450
451static int do_file(const char *filename)
452{
453  errno = 0;
454  // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
455  // byte counts to be consistent with fseek.
456  FILE *fp = fopen(filename, FOPEN_RB);
457  if (fp == 0) {
458    error("can't open `%1': %2", filename, strerror(errno));
459    return 0;
460  }
461  int filename_index = filenames.length();
462  store_filename(filename);
463
464  enum {
465    START,	// at the start of the file; also in between references
466    BOL,	// in the middle of a reference, at the beginning of the line
467    PERCENT,	// seen a percent at the beginning of the line
468    IGNORE,	// ignoring a field
469    IGNORE_BOL,	// at the beginning of a line ignoring a field
470    KEY,	// in the middle of a key
471    DISCARD,	// after truncate_len bytes of a key
472    MIDDLE	// in between keys
473  } state = START;
474
475  // In states START, BOL, IGNORE_BOL, space_count how many spaces at
476  // the beginning have been seen.  In states PERCENT, IGNORE, KEY,
477  // MIDDLE space_count must be 0.
478  int space_count = 0;
479  int byte_count = 0;		// bytes read
480  int key_len = 0;
481  int ref_start = -1;		// position of start of current reference
482  for (;;) {
483    int c = getc(fp);
484    if (c == EOF)
485      break;
486    // We opened the file in binary mode, so we need to skip
487    // every CR character before a Newline.
488    if (c == '\r') {
489      int peek = getc(fp);
490      if (peek == '\n') {
491	byte_count++;
492	c = peek;
493      }
494      else
495	ungetc(peek, fp);
496    }
497#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
498    else if (c == 0x1a)	// ^Z means EOF in text files
499      break;
500#endif
501    byte_count++;
502    switch (state) {
503    case START:
504      if (c == ' ' || c == '\t') {
505	space_count++;
506	break;
507      }
508      if (c == '\n') {
509	space_count = 0;
510	break;
511      }
512      ref_start = byte_count - space_count - 1;
513      space_count = 0;
514      if (c == '%')
515	state = PERCENT;
516      else if (csalnum(c)) {
517	state = KEY;
518	key_buffer[0] = c;
519	key_len = 1;
520      }
521      else
522	state = MIDDLE;
523      break;
524    case BOL:
525      switch (c) {
526      case '%':
527	if (space_count > 0) {
528	  space_count = 0;
529	  state = MIDDLE;
530	}
531	else
532	  state = PERCENT;
533	break;
534      case ' ':
535      case '\t':
536	space_count++;
537	break;
538      case '\n':
539	store_reference(filename_index, ref_start,
540			byte_count - 1 - space_count - ref_start);
541	state = START;
542	space_count = 0;
543	break;
544      default:
545	space_count = 0;
546	if (csalnum(c)) {
547	  state = KEY;
548	  key_buffer[0] = c;
549	  key_len = 1;
550	}
551	else
552	  state = MIDDLE;
553      }
554      break;
555    case PERCENT:
556      if (strchr(ignore_fields, c) != 0)
557	state = IGNORE;
558      else if (c == '\n')
559	state = BOL;
560      else
561	state = MIDDLE;
562      break;
563    case IGNORE:
564      if (c == '\n')
565	state = IGNORE_BOL;
566      break;
567    case IGNORE_BOL:
568      switch (c) {
569      case '%':
570	if (space_count > 0) {
571	  state = IGNORE;
572	  space_count = 0;
573	}
574	else
575	  state = PERCENT;
576	break;
577      case ' ':
578      case '\t':
579	space_count++;
580	break;
581      case '\n':
582	store_reference(filename_index, ref_start,
583			byte_count - 1 - space_count - ref_start);
584	state = START;
585	space_count = 0;
586	break;
587      default:
588	space_count = 0;
589	state = IGNORE;
590      }
591      break;
592    case KEY:
593      if (csalnum(c)) {
594	if (key_len < truncate_len)
595	  key_buffer[key_len++] = c;
596	else
597	  state = DISCARD;
598      }
599      else {
600	possibly_store_key(key_buffer, key_len);
601	key_len = 0;
602	if (c == '\n')
603	  state = BOL;
604	else
605	  state = MIDDLE;
606      }
607      break;
608    case DISCARD:
609      if (!csalnum(c)) {
610	possibly_store_key(key_buffer, key_len);
611	key_len = 0;
612	if (c == '\n')
613	  state = BOL;
614	else
615	  state = MIDDLE;
616      }
617      break;
618    case MIDDLE:
619      if (csalnum(c)) {
620	state = KEY;
621	key_buffer[0] = c;
622	key_len = 1;
623      }
624      else if (c == '\n')
625	state = BOL;
626      break;
627    default:
628      assert(0);
629    }
630  }
631  switch (state) {
632  case START:
633    break;
634  case DISCARD:
635  case KEY:
636    possibly_store_key(key_buffer, key_len);
637    // fall through
638  case BOL:
639  case PERCENT:
640  case IGNORE_BOL:
641  case IGNORE:
642  case MIDDLE:
643    store_reference(filename_index, ref_start,
644		    byte_count - ref_start - space_count);
645    break;
646  default:
647    assert(0);
648  }
649  fclose(fp);
650  return 1;
651}
652
653static void store_reference(int filename_index, int pos, int len)
654{
655  tag t;
656  t.filename_index = filename_index;
657  t.start = pos;
658  t.length = len;
659  fwrite_or_die(&t, sizeof(t), 1, indxfp);
660  ntags++;
661}
662
663static void store_filename(const char *fn)
664{
665  filenames += fn;
666  filenames += '\0';
667}
668
669static void init_hash_table()
670{
671  hash_table = new table_entry[hash_table_size];
672  for (int i = 0; i < hash_table_size; i++)
673    hash_table[i].ptr = 0;
674}
675
676static void possibly_store_key(char *s, int len)
677{
678  static int last_tagno = -1;
679  static int key_count;
680  if (last_tagno != ntags) {
681    last_tagno = ntags;
682    key_count = 0;
683  }
684  if (key_count < max_keys_per_item) {
685    if (store_key(s, len))
686      key_count++;
687  }
688}
689
690static int store_key(char *s, int len)
691{
692  if (len < shortest_len)
693    return 0;
694  int is_number = 1;
695  for (int i = 0; i < len; i++)
696    if (!csdigit(s[i])) {
697      is_number = 0;
698      s[i] = cmlower(s[i]);
699    }
700  if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
701    return 0;
702  int h = hash(s, len) % hash_table_size;
703  if (common_words_table) {
704    for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
705      if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
706	return 0;
707  }
708  table_entry *pp =  hash_table + h;
709  if (!pp->ptr)
710    pp->ptr = new block;
711  else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
712    return 1;
713  else if (pp->ptr->used >= BLOCK_SIZE)
714    pp->ptr = new block(pp->ptr);
715  pp->ptr->v[(pp->ptr->used)++] = ntags;
716  return 1;
717}
718
719static void write_hash_table()
720{
721  const int minus_one = -1;
722  int li = 0;
723  for (int i = 0; i < hash_table_size; i++) {
724    block *ptr = hash_table[i].ptr;
725    if (!ptr)
726      hash_table[i].count = -1;
727    else {
728      hash_table[i].count = li;
729      block *rev = 0;
730      while (ptr) {
731	block *tem = ptr;
732	ptr = ptr->next;
733	tem->next = rev;
734	rev = tem;
735      }
736      while (rev) {
737	fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
738	li += rev->used;
739	block *tem = rev;
740	rev = rev->next;
741	delete tem;
742      }
743      fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
744      li += 1;
745    }
746  }
747  if (sizeof(table_entry) == sizeof(int))
748    fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
749  else {
750    // write it out word by word
751    for (int i = 0; i < hash_table_size; i++)
752      fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
753  }
754  fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
755  if (fseek(indxfp, 0, 0) < 0)
756    fatal("error seeking on index file: %1", strerror(errno));
757  index_header h;
758  h.magic = INDEX_MAGIC;
759  h.version = INDEX_VERSION;
760  h.tags_size = ntags;
761  h.lists_size = li;
762  h.table_size = hash_table_size;
763  h.strings_size = filenames.length();
764  h.truncate = truncate_len;
765  h.shortest = shortest_len;
766  h.common = n_ignore_words;
767  fwrite_or_die(&h, sizeof(h), 1, indxfp);
768}
769
770static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
771{
772  if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
773    fatal("fwrite failed: %1", strerror(errno));
774}
775
776void fatal_error_exit()
777{
778  cleanup();
779  exit(3);
780}
781
782extern "C" {
783
784void cleanup()
785{
786  if (temp_index_file)
787    unlink(temp_index_file);
788}
789
790}
791