1/* $NetBSD: index.cpp,v 1.1.1.1 2016/01/13 18:41:48 christos Exp $ */ 2 3// -*- C++ -*- 4/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004 5 Free Software Foundation, Inc. 6 Written by James Clark (jjc@jclark.com) 7 8This file is part of groff. 9 10groff is free software; you can redistribute it and/or modify it under 11the terms of the GNU General Public License as published by the Free 12Software Foundation; either version 2, or (at your option) any later 13version. 14 15groff is distributed in the hope that it will be useful, but WITHOUT ANY 16WARRANTY; without even the implied warranty of MERCHANTABILITY or 17FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 18for more details. 19 20You should have received a copy of the GNU General Public License along 21with groff; see the file COPYING. If not, write to the Free Software 22Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24#include "lib.h" 25 26#include <stdlib.h> 27#include <errno.h> 28 29#include "posix.h" 30#include "cset.h" 31#include "cmap.h" 32#include "errarg.h" 33#include "error.h" 34 35#include "refid.h" 36#include "search.h" 37#include "index.h" 38#include "defs.h" 39 40#include "nonposix.h" 41 42// Interface to mmap. 43extern "C" { 44 void *mapread(int fd, int len); 45 int unmap(void *, int len); 46} 47 48#if 0 49const 50#endif 51int minus_one = -1; 52 53int verify_flag = 0; 54 55struct word_list; 56 57class index_search_item : public search_item { 58 search_item *out_of_date_files; 59 index_header header; 60 char *buffer; 61 void *map_addr; 62 int map_len; 63 tag *tags; 64 int *table; 65 int *lists; 66 char *pool; 67 char *key_buffer; 68 char *filename_buffer; 69 int filename_buflen; 70 char **common_words_table; 71 int common_words_table_size; 72 const char *ignore_fields; 73 time_t mtime; 74 75 const char *do_verify(); 76 const int *search1(const char **pp, const char *end); 77 const int *search(const char *ptr, int length, int **temp_listp); 78 const char *munge_filename(const char *); 79 void read_common_words_file(); 80 void add_out_of_date_file(int fd, const char *filename, int fid); 81public: 82 index_search_item(const char *, int); 83 ~index_search_item(); 84 int load(int fd); 85 search_item_iterator *make_search_item_iterator(const char *); 86 int verify(); 87 void check_files(); 88 int next_filename_id() const; 89 friend class index_search_item_iterator; 90}; 91 92class index_search_item_iterator : public search_item_iterator { 93 index_search_item *indx; 94 search_item_iterator *out_of_date_files_iter; 95 search_item *next_out_of_date_file; 96 const int *found_list; 97 int *temp_list; 98 char *buf; 99 int buflen; 100 linear_searcher searcher; 101 char *query; 102 int get_tag(int tagno, const linear_searcher &, const char **, int *, 103 reference_id *); 104public: 105 index_search_item_iterator(index_search_item *, const char *); 106 ~index_search_item_iterator(); 107 int next(const linear_searcher &, const char **, int *, reference_id *); 108}; 109 110 111index_search_item::index_search_item(const char *filename, int fid) 112: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0), 113 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0), 114 common_words_table(0) 115{ 116} 117 118index_search_item::~index_search_item() 119{ 120 if (buffer) 121 free(buffer); 122 if (map_addr) { 123 if (unmap(map_addr, map_len) < 0) 124 error("unmap: %1", strerror(errno)); 125 } 126 while (out_of_date_files) { 127 search_item *tem = out_of_date_files; 128 out_of_date_files = out_of_date_files->next; 129 delete tem; 130 } 131 a_delete filename_buffer; 132 a_delete key_buffer; 133 if (common_words_table) { 134 for (int i = 0; i < common_words_table_size; i++) 135 a_delete common_words_table[i]; 136 a_delete common_words_table; 137 } 138} 139 140class file_closer { 141 int *fdp; 142public: 143 file_closer(int &fd) : fdp(&fd) { } 144 ~file_closer() { close(*fdp); } 145}; 146 147// Tell the compiler that a variable is intentionally unused. 148inline void unused(void *) { } 149 150int index_search_item::load(int fd) 151{ 152 file_closer fd_closer(fd); // close fd on return 153 unused(&fd_closer); 154 struct stat sb; 155 if (fstat(fd, &sb) < 0) { 156 error("can't fstat `%1': %2", name, strerror(errno)); 157 return 0; 158 } 159 if (!S_ISREG(sb.st_mode)) { 160 error("`%1' is not a regular file", name); 161 return 0; 162 } 163 mtime = sb.st_mtime; 164 int size = int(sb.st_size); 165 char *addr; 166 map_addr = mapread(fd, size); 167 if (map_addr) { 168 addr = (char *)map_addr; 169 map_len = size; 170 } 171 else { 172 addr = buffer = (char *)malloc(size); 173 if (buffer == 0) { 174 error("can't allocate buffer for `%1'", name); 175 return 0; 176 } 177 char *ptr = buffer; 178 int bytes_to_read = size; 179 while (bytes_to_read > 0) { 180 int nread = read(fd, ptr, bytes_to_read); 181 if (nread == 0) { 182 error("unexpected EOF on `%1'", name); 183 return 0; 184 } 185 if (nread < 0) { 186 error("read error on `%1': %2", name, strerror(errno)); 187 return 0; 188 } 189 bytes_to_read -= nread; 190 ptr += nread; 191 } 192 } 193 header = *(index_header *)addr; 194 if (header.magic != INDEX_MAGIC) { 195 error("`%1' is not an index file: wrong magic number", name); 196 return 0; 197 } 198 if (header.version != INDEX_VERSION) { 199 error("version number in `%1' is wrong: was %2, should be %3", 200 name, header.version, INDEX_VERSION); 201 return 0; 202 } 203 int sz = (header.tags_size * sizeof(tag) 204 + header.lists_size * sizeof(int) 205 + header.table_size * sizeof(int) 206 + header.strings_size 207 + sizeof(header)); 208 if (sz != size) { 209 error("size of `%1' is wrong: was %2, should be %3", 210 name, size, sz); 211 return 0; 212 } 213 tags = (tag *)(addr + sizeof(header)); 214 lists = (int *)(tags + header.tags_size); 215 table = (int *)(lists + header.lists_size); 216 pool = (char *)(table + header.table_size); 217 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1; 218 key_buffer = new char[header.truncate]; 219 read_common_words_file(); 220 return 1; 221} 222 223const char *index_search_item::do_verify() 224{ 225 if (tags == 0) 226 return "not loaded"; 227 if (lists[header.lists_size - 1] >= 0) 228 return "last list element not negative"; 229 int i; 230 for (i = 0; i < header.table_size; i++) { 231 int li = table[i]; 232 if (li >= header.lists_size) 233 return "bad list index"; 234 if (li >= 0) { 235 for (int *ptr = lists + li; *ptr >= 0; ptr++) { 236 if (*ptr >= header.tags_size) 237 return "bad tag index"; 238 if (*ptr >= ptr[1] && ptr[1] >= 0) 239 return "list not ordered"; 240 } 241 } 242 } 243 for (i = 0; i < header.tags_size; i++) { 244 if (tags[i].filename_index >= header.strings_size) 245 return "bad index in tags"; 246 if (tags[i].length < 0) 247 return "bad length in tags"; 248 if (tags[i].start < 0) 249 return "bad start in tags"; 250 } 251 if (pool[header.strings_size - 1] != '\0') 252 return "last character in pool not nul"; 253 return 0; 254} 255 256int index_search_item::verify() 257{ 258 const char *reason = do_verify(); 259 if (!reason) 260 return 1; 261 error("`%1' is bad: %2", name, reason); 262 return 0; 263} 264 265int index_search_item::next_filename_id() const 266{ 267 return filename_id + header.strings_size + 1; 268} 269 270search_item_iterator *index_search_item::make_search_item_iterator( 271 const char *query) 272{ 273 return new index_search_item_iterator(this, query); 274} 275 276search_item *make_index_search_item(const char *filename, int fid) 277{ 278 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)]; 279 strcpy(index_filename, filename); 280 strcat(index_filename, INDEX_SUFFIX); 281 int fd = open(index_filename, O_RDONLY | O_BINARY); 282 if (fd < 0) 283 return 0; 284 index_search_item *item = new index_search_item(index_filename, fid); 285 a_delete index_filename; 286 if (!item->load(fd)) { 287 close(fd); 288 delete item; 289 return 0; 290 } 291 else if (verify_flag && !item->verify()) { 292 delete item; 293 return 0; 294 } 295 else { 296 item->check_files(); 297 return item; 298 } 299} 300 301 302index_search_item_iterator::index_search_item_iterator(index_search_item *ind, 303 const char *q) 304: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0), 305 buf(0), buflen(0), 306 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate), 307 query(strsave(q)) 308{ 309 found_list = indx->search(q, strlen(q), &temp_list); 310 if (!found_list) { 311 found_list = &minus_one; 312 warning("all keys would have been discarded in constructing index `%1'", 313 indx->name); 314 } 315} 316 317index_search_item_iterator::~index_search_item_iterator() 318{ 319 a_delete temp_list; 320 a_delete buf; 321 a_delete query; 322 delete out_of_date_files_iter; 323} 324 325int index_search_item_iterator::next(const linear_searcher &, 326 const char **pp, int *lenp, 327 reference_id *ridp) 328{ 329 if (found_list) { 330 for (;;) { 331 int tagno = *found_list; 332 if (tagno == -1) 333 break; 334 found_list++; 335 if (get_tag(tagno, searcher, pp, lenp, ridp)) 336 return 1; 337 } 338 found_list = 0; 339 next_out_of_date_file = indx->out_of_date_files; 340 } 341 while (next_out_of_date_file) { 342 if (out_of_date_files_iter == 0) 343 out_of_date_files_iter 344 = next_out_of_date_file->make_search_item_iterator(query); 345 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp)) 346 return 1; 347 delete out_of_date_files_iter; 348 out_of_date_files_iter = 0; 349 next_out_of_date_file = next_out_of_date_file->next; 350 } 351 return 0; 352} 353 354int index_search_item_iterator::get_tag(int tagno, 355 const linear_searcher &searchr, 356 const char **pp, int *lenp, 357 reference_id *ridp) 358{ 359 if (tagno < 0 || tagno >= indx->header.tags_size) { 360 error("bad tag number"); 361 return 0; 362 } 363 tag *tp = indx->tags + tagno; 364 const char *filename = indx->munge_filename(indx->pool + tp->filename_index); 365 int fd = open(filename, O_RDONLY | O_BINARY); 366 if (fd < 0) { 367 error("can't open `%1': %2", filename, strerror(errno)); 368 return 0; 369 } 370 struct stat sb; 371 if (fstat(fd, &sb) < 0) { 372 error("can't fstat: %1", strerror(errno)); 373 close(fd); 374 return 0; 375 } 376 time_t mtime = sb.st_mtime; 377 if (mtime > indx->mtime) { 378 indx->add_out_of_date_file(fd, filename, 379 indx->filename_id + tp->filename_index); 380 return 0; 381 } 382 int res = 0; 383 FILE *fp = fdopen(fd, FOPEN_RB); 384 if (!fp) { 385 error("fdopen failed"); 386 close(fd); 387 return 0; 388 } 389 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0) 390 error("can't seek on `%1': %2", filename, strerror(errno)); 391 else { 392 int length = tp->length; 393 int err = 0; 394 if (length == 0) { 395 if (fstat(fileno(fp), &sb) < 0) { 396 error("can't stat `%1': %2", filename, strerror(errno)); 397 err = 1; 398 } 399 else if (!S_ISREG(sb.st_mode)) { 400 error("`%1' is not a regular file", filename); 401 err = 1; 402 } 403 else 404 length = int(sb.st_size); 405 } 406 if (!err) { 407 if (length + 2 > buflen) { 408 a_delete buf; 409 buflen = length + 2; 410 buf = new char[buflen]; 411 } 412 if (fread(buf + 1, 1, length, fp) != (size_t)length) 413 error("fread on `%1' failed: %2", filename, strerror(errno)); 414 else { 415 buf[0] = '\n'; 416 // Remove the CR characters from CRLF pairs. 417 int sidx = 1, didx = 1; 418 for ( ; sidx < length + 1; sidx++, didx++) 419 { 420 if (buf[sidx] == '\r') 421 { 422 if (buf[++sidx] != '\n') 423 buf[didx++] = '\r'; 424 else 425 length--; 426 } 427 if (sidx != didx) 428 buf[didx] = buf[sidx]; 429 } 430 buf[length + 1] = '\n'; 431 res = searchr.search(buf + 1, buf + 2 + length, pp, lenp); 432 if (res && ridp) 433 *ridp = reference_id(indx->filename_id + tp->filename_index, 434 tp->start); 435 } 436 } 437 } 438 fclose(fp); 439 return res; 440} 441 442const char *index_search_item::munge_filename(const char *filename) 443{ 444 if (IS_ABSOLUTE(filename)) 445 return filename; 446 const char *cwd = pool; 447 int need_slash = (cwd[0] != 0 448 && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0); 449 int len = strlen(cwd) + strlen(filename) + need_slash + 1; 450 if (len > filename_buflen) { 451 a_delete filename_buffer; 452 filename_buflen = len; 453 filename_buffer = new char[len]; 454 } 455 strcpy(filename_buffer, cwd); 456 if (need_slash) 457 strcat(filename_buffer, "/"); 458 strcat(filename_buffer, filename); 459 return filename_buffer; 460} 461 462const int *index_search_item::search1(const char **pp, const char *end) 463{ 464 while (*pp < end && !csalnum(**pp)) 465 *pp += 1; 466 if (*pp >= end) 467 return 0; 468 const char *start = *pp; 469 while (*pp < end && csalnum(**pp)) 470 *pp += 1; 471 int len = *pp - start; 472 if (len < header.shortest) 473 return 0; 474 if (len > header.truncate) 475 len = header.truncate; 476 int is_number = 1; 477 for (int i = 0; i < len; i++) 478 if (csdigit(start[i])) 479 key_buffer[i] = start[i]; 480 else { 481 key_buffer[i] = cmlower(start[i]); 482 is_number = 0; 483 } 484 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9')) 485 return 0; 486 unsigned hc = hash(key_buffer, len); 487 if (common_words_table) { 488 for (int h = hc % common_words_table_size; 489 common_words_table[h]; 490 --h) { 491 if (strlen(common_words_table[h]) == (size_t)len 492 && memcmp(common_words_table[h], key_buffer, len) == 0) 493 return 0; 494 if (h == 0) 495 h = common_words_table_size; 496 } 497 } 498 int li = table[int(hc % header.table_size)]; 499 return li < 0 ? &minus_one : lists + li; 500} 501 502static void merge(int *result, const int *s1, const int *s2) 503{ 504 for (; *s1 >= 0; s1++) { 505 while (*s2 >= 0 && *s2 < *s1) 506 s2++; 507 if (*s2 == *s1) 508 *result++ = *s2; 509 } 510 *result++ = -1; 511} 512 513const int *index_search_item::search(const char *ptr, int length, 514 int **temp_listp) 515{ 516 const char *end = ptr + length; 517 if (*temp_listp) { 518 a_delete *temp_listp; 519 *temp_listp = 0; 520 } 521 const int *first_list = 0; 522 while (ptr < end && (first_list = search1(&ptr, end)) == 0) 523 ; 524 if (!first_list) 525 return 0; 526 if (*first_list < 0) 527 return first_list; 528 const int *second_list = 0; 529 while (ptr < end && (second_list = search1(&ptr, end)) == 0) 530 ; 531 if (!second_list) 532 return first_list; 533 if (*second_list < 0) 534 return second_list; 535 const int *p; 536 for (p = first_list; *p >= 0; p++) 537 ; 538 int len = p - first_list; 539 for (p = second_list; *p >= 0; p++) 540 ; 541 if (p - second_list < len) 542 len = p - second_list; 543 int *matches = new int[len + 1]; 544 merge(matches, first_list, second_list); 545 while (ptr < end) { 546 const int *list = search1(&ptr, end); 547 if (list != 0) { 548 if (*list < 0) { 549 a_delete matches; 550 return list; 551 } 552 merge(matches, matches, list); 553 if (*matches < 0) { 554 a_delete matches; 555 return &minus_one; 556 } 557 } 558 } 559 *temp_listp = matches; 560 return matches; 561} 562 563void index_search_item::read_common_words_file() 564{ 565 if (header.common <= 0) 566 return; 567 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1); 568 errno = 0; 569 FILE *fp = fopen(common_words_file, "r"); 570 if (!fp) { 571 error("can't open `%1': %2", common_words_file, strerror(errno)); 572 return; 573 } 574 common_words_table_size = 2*header.common + 1; 575 while (!is_prime(common_words_table_size)) 576 common_words_table_size++; 577 common_words_table = new char *[common_words_table_size]; 578 for (int i = 0; i < common_words_table_size; i++) 579 common_words_table[i] = 0; 580 int count = 0; 581 int key_len = 0; 582 for (;;) { 583 int c = getc(fp); 584 while (c != EOF && !csalnum(c)) 585 c = getc(fp); 586 if (c == EOF) 587 break; 588 do { 589 if (key_len < header.truncate) 590 key_buffer[key_len++] = cmlower(c); 591 c = getc(fp); 592 } while (c != EOF && csalnum(c)); 593 if (key_len >= header.shortest) { 594 int h = hash(key_buffer, key_len) % common_words_table_size; 595 while (common_words_table[h]) { 596 if (h == 0) 597 h = common_words_table_size; 598 --h; 599 } 600 common_words_table[h] = new char[key_len + 1]; 601 memcpy(common_words_table[h], key_buffer, key_len); 602 common_words_table[h][key_len] = '\0'; 603 } 604 if (++count >= header.common) 605 break; 606 key_len = 0; 607 if (c == EOF) 608 break; 609 } 610 fclose(fp); 611} 612 613void index_search_item::add_out_of_date_file(int fd, const char *filename, 614 int fid) 615{ 616 search_item **pp; 617 for (pp = &out_of_date_files; *pp; pp = &(*pp)->next) 618 if ((*pp)->is_named(filename)) 619 return; 620 *pp = make_linear_search_item(fd, filename, fid); 621 warning("`%1' modified since `%2' created", filename, name); 622} 623 624void index_search_item::check_files() 625{ 626 const char *pool_end = pool + header.strings_size; 627 for (const char *ptr = strchr(ignore_fields, '\0') + 1; 628 ptr < pool_end; 629 ptr = strchr(ptr, '\0') + 1) { 630 const char *path = munge_filename(ptr); 631 struct stat sb; 632 if (stat(path, &sb) < 0) 633 error("can't stat `%1': %2", path, strerror(errno)); 634 else if (sb.st_mtime > mtime) { 635 int fd = open(path, O_RDONLY | O_BINARY); 636 if (fd < 0) 637 error("can't open `%1': %2", path, strerror(errno)); 638 else 639 add_out_of_date_file(fd, path, filename_id + (ptr - pool)); 640 } 641 } 642} 643