linear.cpp revision 114402
1// -*- C++ -*- 2/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001 3 Free Software Foundation, Inc. 4 Written by James Clark (jjc@jclark.com) 5 6This file is part of groff. 7 8groff is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 2, or (at your option) any later 11version. 12 13groff is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License along 19with groff; see the file COPYING. If not, write to the Free Software 20Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 21 22#include "lib.h" 23 24#include <stdlib.h> 25#include <assert.h> 26#include <errno.h> 27 28#include "posix.h" 29#include "errarg.h" 30#include "error.h" 31#include "cset.h" 32#include "cmap.h" 33#include "nonposix.h" 34 35#include "refid.h" 36#include "search.h" 37 38class file_buffer { 39 char *buffer; 40 char *bufend; 41public: 42 file_buffer(); 43 ~file_buffer(); 44 int load(int fd, const char *filename); 45 const char *get_start() const; 46 const char *get_end() const; 47}; 48 49typedef unsigned char uchar; 50 51static uchar map[256]; 52static uchar inv_map[256][3]; 53 54struct map_init { 55 map_init(); 56}; 57 58static map_init the_map_init; 59 60map_init::map_init() 61{ 62 int i; 63 for (i = 0; i < 256; i++) 64 map[i] = csalnum(i) ? cmlower(i) : '\0'; 65 for (i = 0; i < 256; i++) { 66 if (cslower(i)) { 67 inv_map[i][0] = i; 68 inv_map[i][1] = cmupper(i); 69 inv_map[i][2] = '\0'; 70 } 71 else if (csdigit(i)) { 72 inv_map[i][0] = i; 73 inv_map[i][1] = 0; 74 } 75 else 76 inv_map[i][0] = '\0'; 77 } 78} 79 80 81class bmpattern { 82 char *pat; 83 int len; 84 int delta[256]; 85public: 86 bmpattern(const char *pattern, int pattern_length); 87 ~bmpattern(); 88 const char *search(const char *p, const char *end) const; 89 int length() const; 90}; 91 92bmpattern::bmpattern(const char *pattern, int pattern_length) 93: len(pattern_length) 94{ 95 pat = new char[len]; 96 int i; 97 for (i = 0; i < len; i++) 98 pat[i] = map[uchar(pattern[i])]; 99 for (i = 0; i < 256; i++) 100 delta[i] = len; 101 for (i = 0; i < len; i++) 102 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++) 103 delta[*inv] = len - i - 1; 104} 105 106const char *bmpattern::search(const char *buf, const char *end) const 107{ 108 int buflen = end - buf; 109 if (len > buflen) 110 return 0; 111 const char *strend; 112 if (buflen > len*4) 113 strend = end - len*4; 114 else 115 strend = buf; 116 const char *k = buf + len - 1; 117 const int *del = delta; 118 const char *pattern = pat; 119 for (;;) { 120 while (k < strend) { 121 int t = del[uchar(*k)]; 122 if (!t) 123 break; 124 k += t; 125 k += del[uchar(*k)]; 126 k += del[uchar(*k)]; 127 } 128 while (k < end && del[uchar(*k)] != 0) 129 k++; 130 if (k == end) 131 break; 132 int j = len - 1; 133 const char *s = k; 134 for (;;) { 135 if (j == 0) 136 return s; 137 if (map[uchar(*--s)] != uchar(pattern[--j])) 138 break; 139 } 140 k++; 141 } 142 return 0; 143} 144 145bmpattern::~bmpattern() 146{ 147 a_delete pat; 148} 149 150inline int bmpattern::length() const 151{ 152 return len; 153} 154 155 156static const char *find_end(const char *bufend, const char *p); 157 158const char *linear_searcher::search_and_check(const bmpattern *key, 159 const char *buf, const char *bufend, const char **start) const 160{ 161 assert(buf[-1] == '\n'); 162 assert(bufend[-1] == '\n'); 163 const char *ptr = buf; 164 for (;;) { 165 const char *found = key->search(ptr, bufend); 166 if (!found) 167 break; 168 if (check_match(buf, bufend, found, key->length(), &ptr, start)) 169 return found; 170 } 171 return 0; 172} 173 174static const char *skip_field(const char *end, const char *p) 175{ 176 for (;;) 177 if (*p++ == '\n') { 178 if (p == end || *p == '%') 179 break; 180 const char *q; 181 for (q = p; *q == ' ' || *q == '\t'; q++) 182 ; 183 if (*q == '\n') 184 break; 185 p = q + 1; 186 } 187 return p; 188} 189 190static const char *find_end(const char *bufend, const char *p) 191{ 192 for (;;) 193 if (*p++ == '\n') { 194 if (p == bufend) 195 break; 196 const char *q; 197 for (q = p; *q == ' ' || *q == '\t'; q++) 198 ; 199 if (*q == '\n') 200 break; 201 p = q + 1; 202 } 203 return p; 204} 205 206 207int linear_searcher::check_match(const char *buf, const char *bufend, 208 const char *match, int matchlen, 209 const char **cont, const char **start) const 210{ 211 *cont = match + 1; 212 // The user is required to supply only the first truncate_len characters 213 // of the key. If truncate_len <= 0, he must supply all the key. 214 if ((truncate_len <= 0 || matchlen < truncate_len) 215 && map[uchar(match[matchlen])] != '\0') 216 return 0; 217 218 // The character before the match must not be an alphanumeric 219 // character (unless the alphanumeric character follows one or two 220 // percent characters at the beginning of the line), nor must it be 221 // a percent character at the beginning of a line, nor a percent 222 // character following a percent character at the beginning of a 223 // line. 224 225 switch (match - buf) { 226 case 0: 227 break; 228 case 1: 229 if (match[-1] == '%' || map[uchar(match[-1])] != '\0') 230 return 0; 231 break; 232 case 2: 233 if (map[uchar(match[-1])] != '\0' && match[-2] != '%') 234 return 0; 235 if (match[-1] == '%' 236 && (match[-2] == '\n' || match[-2] == '%')) 237 return 0; 238 break; 239 default: 240 if (map[uchar(match[-1])] != '\0' 241 && !(match[-2] == '%' 242 && (match[-3] == '\n' 243 || (match[-3] == '%' && match[-4] == '\n')))) 244 return 0; 245 if (match[-1] == '%' 246 && (match[-2] == '\n' 247 || (match[-2] == '%' && match[-3] == '\n'))) 248 return 0; 249 } 250 251 const char *p = match; 252 int had_percent = 0; 253 for (;;) { 254 if (*p == '\n') { 255 if (!had_percent && p[1] == '%') { 256 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) { 257 *cont = skip_field(bufend, match + matchlen); 258 return 0; 259 } 260 if (!start) 261 break; 262 had_percent = 1; 263 } 264 if (p <= buf) { 265 if (start) 266 *start = p + 1; 267 return 1; 268 } 269 const char *q; 270 for (q = p - 1; *q == ' ' || *q == '\t'; q--) 271 ; 272 if (*q == '\n') { 273 if (start) 274 *start = p + 1; 275 break; 276 } 277 p = q; 278 } 279 p--; 280 } 281 return 1; 282} 283 284file_buffer::file_buffer() 285: buffer(0), bufend(0) 286{ 287} 288 289file_buffer::~file_buffer() 290{ 291 a_delete buffer; 292} 293 294const char *file_buffer::get_start() const 295{ 296 return buffer ? buffer + 4 : 0; 297} 298 299const char *file_buffer::get_end() const 300{ 301 return bufend; 302} 303 304int file_buffer::load(int fd, const char *filename) 305{ 306 struct stat sb; 307 if (fstat(fd, &sb) < 0) 308 error("can't fstat `%1': %2", filename, strerror(errno)); 309 else if (!S_ISREG(sb.st_mode)) 310 error("`%1' is not a regular file", filename); 311 else { 312 // We need one character extra at the beginning for an additional newline 313 // used as a sentinel. We get 4 instead so that the read buffer will be 314 // word-aligned. This seems to make the read slightly faster. We also 315 // need one character at the end also for an additional newline used as a 316 // sentinel. 317 int size = int(sb.st_size); 318 buffer = new char[size + 4 + 1]; 319 int nread = read(fd, buffer + 4, size); 320 if (nread < 0) 321 error("error reading `%1': %2", filename, strerror(errno)); 322 else if (nread != size) 323 error("size of `%1' decreased", filename); 324 else { 325 char c; 326 nread = read(fd, &c, 1); 327 if (nread != 0) 328 error("size of `%1' increased", filename); 329 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0) 330 error("database `%1' is a binary file", filename); 331 else { 332 close(fd); 333 buffer[3] = '\n'; 334 int sidx = 4, didx = 4; 335 for ( ; sidx < size + 4; sidx++, didx++) 336 { 337 if (buffer[sidx] == '\r') 338 { 339 if (buffer[++sidx] != '\n') 340 buffer[didx++] = '\r'; 341 else 342 size--; 343 } 344 if (sidx != didx) 345 buffer[didx] = buffer[sidx]; 346 } 347 bufend = buffer + 4 + size; 348 if (bufend[-1] != '\n') 349 *bufend++ = '\n'; 350 return 1; 351 } 352 } 353 a_delete buffer; 354 buffer = 0; 355 } 356 close(fd); 357 return 0; 358} 359 360linear_searcher::linear_searcher(const char *query, int query_len, 361 const char *ign, int trunc) 362: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0) 363{ 364 const char *query_end = query + query_len; 365 int nk = 0; 366 const char *p; 367 for (p = query; p < query_end; p++) 368 if (map[uchar(*p)] != '\0' 369 && (p[1] == '\0' || map[uchar(p[1])] == '\0')) 370 nk++; 371 if (nk == 0) 372 return; 373 keys = new bmpattern*[nk]; 374 p = query; 375 for (;;) { 376 while (p < query_end && map[uchar(*p)] == '\0') 377 p++; 378 if (p == query_end) 379 break; 380 const char *start = p; 381 while (p < query_end && map[uchar(*p)] != '\0') 382 p++; 383 keys[nkeys++] = new bmpattern(start, p - start); 384 } 385 assert(nkeys <= nk); 386 if (nkeys == 0) { 387 a_delete keys; 388 keys = 0; 389 } 390} 391 392linear_searcher::~linear_searcher() 393{ 394 for (int i = 0; i < nkeys; i++) 395 delete keys[i]; 396 a_delete keys; 397} 398 399int linear_searcher::search(const char *buffer, const char *bufend, 400 const char **startp, int *lengthp) const 401{ 402 assert(bufend - buffer > 0); 403 assert(buffer[-1] == '\n'); 404 assert(bufend[-1] == '\n'); 405 if (nkeys == 0) 406 return 0; 407 for (;;) { 408 const char *refstart; 409 const char *found = search_and_check(keys[0], buffer, bufend, &refstart); 410 if (!found) 411 break; 412 const char *refend = find_end(bufend, found + keys[0]->length()); 413 int i; 414 for (i = 1; i < nkeys; i++) 415 if (!search_and_check(keys[i], refstart, refend)) 416 break; 417 if (i >= nkeys) { 418 *startp = refstart; 419 *lengthp = refend - refstart; 420 return 1; 421 } 422 buffer = refend; 423 } 424 return 0; 425} 426 427class linear_search_item : public search_item { 428 file_buffer fbuf; 429public: 430 linear_search_item(const char *filename, int fid); 431 ~linear_search_item(); 432 int load(int fd); 433 search_item_iterator *make_search_item_iterator(const char *); 434 friend class linear_search_item_iterator; 435}; 436 437class linear_search_item_iterator : public search_item_iterator { 438 linear_search_item *lsi; 439 int pos; 440public: 441 linear_search_item_iterator(linear_search_item *, const char *query); 442 ~linear_search_item_iterator(); 443 int next(const linear_searcher &, const char **ptr, int *lenp, 444 reference_id *ridp); 445}; 446 447search_item *make_linear_search_item(int fd, const char *filename, int fid) 448{ 449 linear_search_item *item = new linear_search_item(filename, fid); 450 if (!item->load(fd)) { 451 delete item; 452 return 0; 453 } 454 else 455 return item; 456} 457 458linear_search_item::linear_search_item(const char *filename, int fid) 459: search_item(filename, fid) 460{ 461} 462 463linear_search_item::~linear_search_item() 464{ 465} 466 467int linear_search_item::load(int fd) 468{ 469 return fbuf.load(fd, name); 470} 471 472search_item_iterator *linear_search_item::make_search_item_iterator( 473 const char *query) 474{ 475 return new linear_search_item_iterator(this, query); 476} 477 478linear_search_item_iterator::linear_search_item_iterator( 479 linear_search_item *p, const char *) 480: lsi(p), pos(0) 481{ 482} 483 484linear_search_item_iterator::~linear_search_item_iterator() 485{ 486} 487 488int linear_search_item_iterator::next(const linear_searcher &searcher, 489 const char **startp, int *lengthp, 490 reference_id *ridp) 491{ 492 const char *bufstart = lsi->fbuf.get_start(); 493 const char *bufend = lsi->fbuf.get_end(); 494 const char *ptr = bufstart + pos; 495 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) { 496 pos = *startp + *lengthp - bufstart; 497 if (ridp) 498 *ridp = reference_id(lsi->filename_id, *startp - bufstart); 499 return 1; 500 } 501 else 502 return 0; 503} 504