1/*	$NetBSD: index.cpp,v 1.1.1.1 2016/01/13 18:41:48 christos Exp $	*/
2
3// -*- C++ -*-
4/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004
5   Free Software Foundation, Inc.
6     Written by James Clark (jjc@jclark.com)
7
8This file is part of groff.
9
10groff is free software; you can redistribute it and/or modify it under
11the terms of the GNU General Public License as published by the Free
12Software Foundation; either version 2, or (at your option) any later
13version.
14
15groff is distributed in the hope that it will be useful, but WITHOUT ANY
16WARRANTY; without even the implied warranty of MERCHANTABILITY or
17FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18for more details.
19
20You should have received a copy of the GNU General Public License along
21with groff; see the file COPYING.  If not, write to the Free Software
22Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23
24#include "lib.h"
25
26#include <stdlib.h>
27#include <errno.h>
28
29#include "posix.h"
30#include "cset.h"
31#include "cmap.h"
32#include "errarg.h"
33#include "error.h"
34
35#include "refid.h"
36#include "search.h"
37#include "index.h"
38#include "defs.h"
39
40#include "nonposix.h"
41
42// Interface to mmap.
43extern "C" {
44  void *mapread(int fd, int len);
45  int unmap(void *, int len);
46}
47
48#if 0
49const
50#endif
51int minus_one = -1;
52
53int verify_flag = 0;
54
55struct word_list;
56
57class index_search_item : public search_item {
58  search_item *out_of_date_files;
59  index_header header;
60  char *buffer;
61  void *map_addr;
62  int map_len;
63  tag *tags;
64  int *table;
65  int *lists;
66  char *pool;
67  char *key_buffer;
68  char *filename_buffer;
69  int filename_buflen;
70  char **common_words_table;
71  int common_words_table_size;
72  const char *ignore_fields;
73  time_t mtime;
74
75  const char *do_verify();
76  const int *search1(const char **pp, const char *end);
77  const int *search(const char *ptr, int length, int **temp_listp);
78  const char *munge_filename(const char *);
79  void read_common_words_file();
80  void add_out_of_date_file(int fd, const char *filename, int fid);
81public:
82  index_search_item(const char *, int);
83  ~index_search_item();
84  int load(int fd);
85  search_item_iterator *make_search_item_iterator(const char *);
86  int verify();
87  void check_files();
88  int next_filename_id() const;
89  friend class index_search_item_iterator;
90};
91
92class index_search_item_iterator : public search_item_iterator {
93  index_search_item *indx;
94  search_item_iterator *out_of_date_files_iter;
95  search_item *next_out_of_date_file;
96  const int *found_list;
97  int *temp_list;
98  char *buf;
99  int buflen;
100  linear_searcher searcher;
101  char *query;
102  int get_tag(int tagno, const linear_searcher &, const char **, int *,
103	      reference_id *);
104public:
105  index_search_item_iterator(index_search_item *, const char *);
106  ~index_search_item_iterator();
107  int next(const linear_searcher &, const char **, int *, reference_id *);
108};
109
110
111index_search_item::index_search_item(const char *filename, int fid)
112: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
113  map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
114  common_words_table(0)
115{
116}
117
118index_search_item::~index_search_item()
119{
120  if (buffer)
121    free(buffer);
122  if (map_addr) {
123    if (unmap(map_addr, map_len) < 0)
124      error("unmap: %1", strerror(errno));
125  }
126  while (out_of_date_files) {
127    search_item *tem = out_of_date_files;
128    out_of_date_files = out_of_date_files->next;
129    delete tem;
130  }
131  a_delete filename_buffer;
132  a_delete key_buffer;
133  if (common_words_table) {
134    for (int i = 0; i < common_words_table_size; i++)
135      a_delete common_words_table[i];
136    a_delete common_words_table;
137  }
138}
139
140class file_closer {
141  int *fdp;
142public:
143  file_closer(int &fd) : fdp(&fd) { }
144  ~file_closer() { close(*fdp); }
145};
146
147// Tell the compiler that a variable is intentionally unused.
148inline void unused(void *) { }
149
150int index_search_item::load(int fd)
151{
152  file_closer fd_closer(fd);	// close fd on return
153  unused(&fd_closer);
154  struct stat sb;
155  if (fstat(fd, &sb) < 0) {
156    error("can't fstat `%1': %2", name, strerror(errno));
157    return 0;
158  }
159  if (!S_ISREG(sb.st_mode)) {
160    error("`%1' is not a regular file", name);
161    return 0;
162  }
163  mtime = sb.st_mtime;
164  int size = int(sb.st_size);
165  char *addr;
166  map_addr = mapread(fd, size);
167  if (map_addr) {
168    addr = (char *)map_addr;
169    map_len = size;
170  }
171  else {
172    addr = buffer = (char *)malloc(size);
173    if (buffer == 0) {
174      error("can't allocate buffer for `%1'", name);
175      return 0;
176    }
177    char *ptr = buffer;
178    int bytes_to_read = size;
179    while (bytes_to_read > 0) {
180      int nread = read(fd, ptr, bytes_to_read);
181      if (nread == 0) {
182	error("unexpected EOF on `%1'", name);
183	return 0;
184      }
185      if (nread < 0) {
186	error("read error on `%1': %2", name, strerror(errno));
187	return 0;
188      }
189      bytes_to_read -= nread;
190      ptr += nread;
191    }
192  }
193  header = *(index_header *)addr;
194  if (header.magic != INDEX_MAGIC) {
195    error("`%1' is not an index file: wrong magic number", name);
196    return 0;
197  }
198  if (header.version != INDEX_VERSION) {
199    error("version number in `%1' is wrong: was %2, should be %3",
200	  name, header.version, INDEX_VERSION);
201    return 0;
202  }
203  int sz = (header.tags_size * sizeof(tag)
204	    + header.lists_size * sizeof(int)
205	    + header.table_size * sizeof(int)
206	    + header.strings_size
207	    + sizeof(header));
208  if (sz != size) {
209    error("size of `%1' is wrong: was %2, should be %3",
210	  name, size, sz);
211    return 0;
212  }
213  tags = (tag *)(addr + sizeof(header));
214  lists = (int *)(tags + header.tags_size);
215  table = (int *)(lists + header.lists_size);
216  pool = (char *)(table + header.table_size);
217  ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
218  key_buffer = new char[header.truncate];
219  read_common_words_file();
220  return 1;
221}
222
223const char *index_search_item::do_verify()
224{
225  if (tags == 0)
226    return "not loaded";
227  if (lists[header.lists_size - 1] >= 0)
228    return "last list element not negative";
229  int i;
230  for (i = 0; i < header.table_size; i++) {
231    int li = table[i];
232    if (li >= header.lists_size)
233      return "bad list index";
234    if (li >= 0) {
235      for (int *ptr = lists + li; *ptr >= 0; ptr++) {
236	if (*ptr >= header.tags_size)
237	  return "bad tag index";
238	if (*ptr >= ptr[1] && ptr[1] >= 0)
239	  return "list not ordered";
240      }
241    }
242  }
243  for (i = 0; i < header.tags_size; i++) {
244    if (tags[i].filename_index >= header.strings_size)
245      return "bad index in tags";
246    if (tags[i].length < 0)
247      return "bad length in tags";
248    if (tags[i].start < 0)
249      return "bad start in tags";
250  }
251  if (pool[header.strings_size - 1] != '\0')
252    return "last character in pool not nul";
253  return 0;
254}
255
256int index_search_item::verify()
257{
258  const char *reason = do_verify();
259  if (!reason)
260    return 1;
261  error("`%1' is bad: %2", name, reason);
262  return 0;
263}
264
265int index_search_item::next_filename_id() const
266{
267  return filename_id + header.strings_size + 1;
268}
269
270search_item_iterator *index_search_item::make_search_item_iterator(
271  const char *query)
272{
273  return new index_search_item_iterator(this, query);
274}
275
276search_item *make_index_search_item(const char *filename, int fid)
277{
278  char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
279  strcpy(index_filename, filename);
280  strcat(index_filename, INDEX_SUFFIX);
281  int fd = open(index_filename, O_RDONLY | O_BINARY);
282  if (fd < 0)
283    return 0;
284  index_search_item *item = new index_search_item(index_filename, fid);
285  a_delete index_filename;
286  if (!item->load(fd)) {
287    close(fd);
288    delete item;
289    return 0;
290  }
291  else if (verify_flag && !item->verify()) {
292    delete item;
293    return 0;
294  }
295  else {
296    item->check_files();
297    return item;
298  }
299}
300
301
302index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
303						       const char *q)
304: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
305  buf(0), buflen(0),
306  searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
307  query(strsave(q))
308{
309  found_list = indx->search(q, strlen(q), &temp_list);
310  if (!found_list) {
311    found_list = &minus_one;
312    warning("all keys would have been discarded in constructing index `%1'",
313	    indx->name);
314  }
315}
316
317index_search_item_iterator::~index_search_item_iterator()
318{
319  a_delete temp_list;
320  a_delete buf;
321  a_delete query;
322  delete out_of_date_files_iter;
323}
324
325int index_search_item_iterator::next(const linear_searcher &,
326				     const char **pp, int *lenp,
327				     reference_id *ridp)
328{
329  if (found_list) {
330    for (;;) {
331      int tagno = *found_list;
332      if (tagno == -1)
333	break;
334      found_list++;
335      if (get_tag(tagno, searcher, pp, lenp, ridp))
336	return 1;
337    }
338    found_list = 0;
339    next_out_of_date_file = indx->out_of_date_files;
340  }
341  while (next_out_of_date_file) {
342    if (out_of_date_files_iter == 0)
343      out_of_date_files_iter
344	= next_out_of_date_file->make_search_item_iterator(query);
345    if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
346      return 1;
347    delete out_of_date_files_iter;
348    out_of_date_files_iter = 0;
349    next_out_of_date_file = next_out_of_date_file->next;
350  }
351  return 0;
352}
353
354int index_search_item_iterator::get_tag(int tagno,
355					const linear_searcher &searchr,
356					const char **pp, int *lenp,
357					reference_id *ridp)
358{
359  if (tagno < 0 || tagno >= indx->header.tags_size) {
360    error("bad tag number");
361    return 0;
362  }
363  tag *tp = indx->tags + tagno;
364  const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
365  int fd = open(filename, O_RDONLY | O_BINARY);
366  if (fd < 0) {
367    error("can't open `%1': %2", filename, strerror(errno));
368    return 0;
369  }
370  struct stat sb;
371  if (fstat(fd, &sb) < 0) {
372    error("can't fstat: %1", strerror(errno));
373    close(fd);
374    return 0;
375  }
376  time_t mtime = sb.st_mtime;
377  if (mtime > indx->mtime) {
378    indx->add_out_of_date_file(fd, filename,
379			       indx->filename_id + tp->filename_index);
380    return 0;
381  }
382  int res = 0;
383  FILE *fp = fdopen(fd, FOPEN_RB);
384  if (!fp) {
385    error("fdopen failed");
386    close(fd);
387    return 0;
388  }
389  if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
390    error("can't seek on `%1': %2", filename, strerror(errno));
391  else {
392    int length = tp->length;
393    int err = 0;
394    if (length == 0) {
395      if (fstat(fileno(fp), &sb) < 0) {
396	error("can't stat `%1': %2", filename, strerror(errno));
397	err = 1;
398      }
399      else if (!S_ISREG(sb.st_mode)) {
400	error("`%1' is not a regular file", filename);
401	err = 1;
402      }
403      else
404	length = int(sb.st_size);
405    }
406    if (!err) {
407      if (length + 2 > buflen) {
408	a_delete buf;
409	buflen = length + 2;
410	buf = new char[buflen];
411      }
412      if (fread(buf + 1, 1, length, fp) != (size_t)length)
413	error("fread on `%1' failed: %2", filename, strerror(errno));
414      else {
415	buf[0] = '\n';
416	// Remove the CR characters from CRLF pairs.
417	int sidx = 1, didx = 1;
418	for ( ; sidx < length + 1; sidx++, didx++)
419	  {
420	    if (buf[sidx] == '\r')
421	      {
422		if (buf[++sidx] != '\n')
423		  buf[didx++] = '\r';
424		else
425		  length--;
426	      }
427	    if (sidx != didx)
428	      buf[didx] = buf[sidx];
429	  }
430	buf[length + 1] = '\n';
431	res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
432	if (res && ridp)
433	  *ridp = reference_id(indx->filename_id + tp->filename_index,
434			       tp->start);
435      }
436    }
437  }
438  fclose(fp);
439  return res;
440}
441
442const char *index_search_item::munge_filename(const char *filename)
443{
444  if (IS_ABSOLUTE(filename))
445    return filename;
446  const char *cwd = pool;
447  int need_slash = (cwd[0] != 0
448		    && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
449  int len = strlen(cwd) + strlen(filename) + need_slash + 1;
450  if (len > filename_buflen) {
451    a_delete filename_buffer;
452    filename_buflen = len;
453    filename_buffer = new char[len];
454  }
455  strcpy(filename_buffer, cwd);
456  if (need_slash)
457    strcat(filename_buffer, "/");
458  strcat(filename_buffer, filename);
459  return filename_buffer;
460}
461
462const int *index_search_item::search1(const char **pp, const char *end)
463{
464  while (*pp < end && !csalnum(**pp))
465    *pp += 1;
466  if (*pp >= end)
467    return 0;
468  const char *start = *pp;
469  while (*pp < end && csalnum(**pp))
470    *pp += 1;
471  int len = *pp - start;
472  if (len < header.shortest)
473    return 0;
474  if (len > header.truncate)
475    len = header.truncate;
476  int is_number = 1;
477  for (int i = 0; i < len; i++)
478    if (csdigit(start[i]))
479      key_buffer[i] = start[i];
480    else {
481      key_buffer[i] = cmlower(start[i]);
482      is_number = 0;
483    }
484  if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
485    return 0;
486  unsigned hc = hash(key_buffer, len);
487  if (common_words_table) {
488    for (int h = hc % common_words_table_size;
489	 common_words_table[h];
490	 --h) {
491      if (strlen(common_words_table[h]) == (size_t)len
492	  && memcmp(common_words_table[h], key_buffer, len) == 0)
493	return 0;
494      if (h == 0)
495	h = common_words_table_size;
496    }
497  }
498  int li = table[int(hc % header.table_size)];
499  return li < 0 ? &minus_one : lists + li;
500}
501
502static void merge(int *result, const int *s1, const int *s2)
503{
504  for (; *s1 >= 0; s1++) {
505    while (*s2 >= 0 && *s2 < *s1)
506      s2++;
507    if (*s2 == *s1)
508      *result++ = *s2;
509  }
510  *result++ = -1;
511}
512
513const int *index_search_item::search(const char *ptr, int length,
514				     int **temp_listp)
515{
516  const char *end = ptr + length;
517  if (*temp_listp) {
518    a_delete *temp_listp;
519    *temp_listp = 0;
520  }
521  const int *first_list = 0;
522  while (ptr < end && (first_list = search1(&ptr, end)) == 0)
523    ;
524  if (!first_list)
525    return 0;
526  if (*first_list < 0)
527    return first_list;
528  const int *second_list = 0;
529  while (ptr < end && (second_list = search1(&ptr, end)) == 0)
530    ;
531  if (!second_list)
532    return first_list;
533  if (*second_list < 0)
534    return second_list;
535  const int *p;
536  for (p = first_list; *p >= 0; p++)
537    ;
538  int len = p - first_list;
539  for (p = second_list; *p >= 0; p++)
540    ;
541  if (p - second_list < len)
542    len = p - second_list;
543  int *matches = new int[len + 1];
544  merge(matches, first_list, second_list);
545  while (ptr < end) {
546    const int *list = search1(&ptr, end);
547    if (list != 0) {
548      if (*list < 0) {
549	a_delete matches;
550	return list;
551      }
552      merge(matches, matches, list);
553      if (*matches < 0) {
554	a_delete matches;
555	return &minus_one;
556      }
557    }
558  }
559  *temp_listp = matches;
560  return matches;
561}
562
563void index_search_item::read_common_words_file()
564{
565  if (header.common <= 0)
566    return;
567  const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
568  errno = 0;
569  FILE *fp = fopen(common_words_file, "r");
570  if (!fp) {
571    error("can't open `%1': %2", common_words_file, strerror(errno));
572    return;
573  }
574  common_words_table_size = 2*header.common + 1;
575  while (!is_prime(common_words_table_size))
576    common_words_table_size++;
577  common_words_table = new char *[common_words_table_size];
578  for (int i = 0; i < common_words_table_size; i++)
579    common_words_table[i] = 0;
580  int count = 0;
581  int key_len = 0;
582  for (;;) {
583    int c = getc(fp);
584    while (c != EOF && !csalnum(c))
585      c = getc(fp);
586    if (c == EOF)
587      break;
588    do {
589      if (key_len < header.truncate)
590	key_buffer[key_len++] = cmlower(c);
591      c = getc(fp);
592    } while (c != EOF && csalnum(c));
593    if (key_len >= header.shortest) {
594      int h = hash(key_buffer, key_len) % common_words_table_size;
595      while (common_words_table[h]) {
596	if (h == 0)
597	  h = common_words_table_size;
598	--h;
599      }
600      common_words_table[h] = new char[key_len + 1];
601      memcpy(common_words_table[h], key_buffer, key_len);
602      common_words_table[h][key_len] = '\0';
603    }
604    if (++count >= header.common)
605      break;
606    key_len = 0;
607    if (c == EOF)
608      break;
609  }
610  fclose(fp);
611}
612
613void index_search_item::add_out_of_date_file(int fd, const char *filename,
614					     int fid)
615{
616  search_item **pp;
617  for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
618    if ((*pp)->is_named(filename))
619      return;
620  *pp = make_linear_search_item(fd, filename, fid);
621  warning("`%1' modified since `%2' created", filename, name);
622}
623
624void index_search_item::check_files()
625{
626  const char *pool_end = pool + header.strings_size;
627  for (const char *ptr = strchr(ignore_fields, '\0') + 1;
628       ptr < pool_end;
629       ptr = strchr(ptr, '\0') + 1) {
630    const char *path = munge_filename(ptr);
631    struct stat sb;
632    if (stat(path, &sb) < 0)
633      error("can't stat `%1': %2", path, strerror(errno));
634    else if (sb.st_mtime > mtime) {
635      int fd = open(path, O_RDONLY | O_BINARY);
636      if (fd < 0)
637	error("can't open `%1': %2", path, strerror(errno));
638      else
639	add_out_of_date_file(fd, path, filename_id + (ptr - pool));
640    }
641  }
642}
643