linear.cpp revision 151497
1114402Sru// -*- C++ -*- 2114402Sru/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001 3114402Sru Free Software Foundation, Inc. 4114402Sru Written by James Clark (jjc@jclark.com) 5114402Sru 6114402SruThis file is part of groff. 7114402Sru 8114402Srugroff is free software; you can redistribute it and/or modify it under 9114402Sruthe terms of the GNU General Public License as published by the Free 10114402SruSoftware Foundation; either version 2, or (at your option) any later 11114402Sruversion. 12114402Sru 13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY 14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or 15114402SruFITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16114402Srufor more details. 17114402Sru 18114402SruYou should have received a copy of the GNU General Public License along 19114402Sruwith groff; see the file COPYING. If not, write to the Free Software 20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21114402Sru 22114402Sru#include "lib.h" 23114402Sru 24114402Sru#include <stdlib.h> 25114402Sru#include <assert.h> 26114402Sru#include <errno.h> 27114402Sru 28114402Sru#include "posix.h" 29114402Sru#include "errarg.h" 30114402Sru#include "error.h" 31114402Sru#include "cset.h" 32114402Sru#include "cmap.h" 33114402Sru#include "nonposix.h" 34114402Sru 35114402Sru#include "refid.h" 36114402Sru#include "search.h" 37114402Sru 38114402Sruclass file_buffer { 39114402Sru char *buffer; 40114402Sru char *bufend; 41114402Srupublic: 42114402Sru file_buffer(); 43114402Sru ~file_buffer(); 44114402Sru int load(int fd, const char *filename); 45114402Sru const char *get_start() const; 46114402Sru const char *get_end() const; 47114402Sru}; 48114402Sru 49114402Srutypedef unsigned char uchar; 50114402Sru 51114402Srustatic uchar map[256]; 52114402Srustatic uchar inv_map[256][3]; 53114402Sru 54114402Srustruct map_init { 55114402Sru map_init(); 56114402Sru}; 57114402Sru 58114402Srustatic map_init the_map_init; 59114402Sru 60114402Srumap_init::map_init() 61114402Sru{ 62114402Sru int i; 63114402Sru for (i = 0; i < 256; i++) 64114402Sru map[i] = csalnum(i) ? cmlower(i) : '\0'; 65114402Sru for (i = 0; i < 256; i++) { 66114402Sru if (cslower(i)) { 67114402Sru inv_map[i][0] = i; 68114402Sru inv_map[i][1] = cmupper(i); 69114402Sru inv_map[i][2] = '\0'; 70114402Sru } 71114402Sru else if (csdigit(i)) { 72114402Sru inv_map[i][0] = i; 73114402Sru inv_map[i][1] = 0; 74114402Sru } 75114402Sru else 76114402Sru inv_map[i][0] = '\0'; 77114402Sru } 78114402Sru} 79114402Sru 80114402Sru 81114402Sruclass bmpattern { 82114402Sru char *pat; 83114402Sru int len; 84114402Sru int delta[256]; 85114402Srupublic: 86114402Sru bmpattern(const char *pattern, int pattern_length); 87114402Sru ~bmpattern(); 88114402Sru const char *search(const char *p, const char *end) const; 89114402Sru int length() const; 90114402Sru}; 91114402Sru 92114402Srubmpattern::bmpattern(const char *pattern, int pattern_length) 93114402Sru: len(pattern_length) 94114402Sru{ 95114402Sru pat = new char[len]; 96114402Sru int i; 97114402Sru for (i = 0; i < len; i++) 98114402Sru pat[i] = map[uchar(pattern[i])]; 99114402Sru for (i = 0; i < 256; i++) 100114402Sru delta[i] = len; 101114402Sru for (i = 0; i < len; i++) 102114402Sru for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++) 103114402Sru delta[*inv] = len - i - 1; 104114402Sru} 105114402Sru 106114402Sruconst char *bmpattern::search(const char *buf, const char *end) const 107114402Sru{ 108114402Sru int buflen = end - buf; 109114402Sru if (len > buflen) 110114402Sru return 0; 111114402Sru const char *strend; 112114402Sru if (buflen > len*4) 113114402Sru strend = end - len*4; 114114402Sru else 115114402Sru strend = buf; 116114402Sru const char *k = buf + len - 1; 117114402Sru const int *del = delta; 118114402Sru const char *pattern = pat; 119114402Sru for (;;) { 120114402Sru while (k < strend) { 121114402Sru int t = del[uchar(*k)]; 122114402Sru if (!t) 123114402Sru break; 124114402Sru k += t; 125114402Sru k += del[uchar(*k)]; 126114402Sru k += del[uchar(*k)]; 127114402Sru } 128114402Sru while (k < end && del[uchar(*k)] != 0) 129114402Sru k++; 130114402Sru if (k == end) 131114402Sru break; 132114402Sru int j = len - 1; 133114402Sru const char *s = k; 134114402Sru for (;;) { 135114402Sru if (j == 0) 136114402Sru return s; 137114402Sru if (map[uchar(*--s)] != uchar(pattern[--j])) 138114402Sru break; 139114402Sru } 140114402Sru k++; 141114402Sru } 142114402Sru return 0; 143114402Sru} 144114402Sru 145114402Srubmpattern::~bmpattern() 146114402Sru{ 147114402Sru a_delete pat; 148114402Sru} 149114402Sru 150114402Sruinline int bmpattern::length() const 151114402Sru{ 152114402Sru return len; 153114402Sru} 154114402Sru 155114402Sru 156114402Srustatic const char *find_end(const char *bufend, const char *p); 157114402Sru 158114402Sruconst char *linear_searcher::search_and_check(const bmpattern *key, 159114402Sru const char *buf, const char *bufend, const char **start) const 160114402Sru{ 161114402Sru assert(buf[-1] == '\n'); 162114402Sru assert(bufend[-1] == '\n'); 163114402Sru const char *ptr = buf; 164114402Sru for (;;) { 165114402Sru const char *found = key->search(ptr, bufend); 166114402Sru if (!found) 167114402Sru break; 168114402Sru if (check_match(buf, bufend, found, key->length(), &ptr, start)) 169114402Sru return found; 170114402Sru } 171114402Sru return 0; 172114402Sru} 173114402Sru 174114402Srustatic const char *skip_field(const char *end, const char *p) 175114402Sru{ 176114402Sru for (;;) 177114402Sru if (*p++ == '\n') { 178114402Sru if (p == end || *p == '%') 179114402Sru break; 180114402Sru const char *q; 181114402Sru for (q = p; *q == ' ' || *q == '\t'; q++) 182114402Sru ; 183114402Sru if (*q == '\n') 184114402Sru break; 185114402Sru p = q + 1; 186114402Sru } 187114402Sru return p; 188114402Sru} 189114402Sru 190114402Srustatic const char *find_end(const char *bufend, const char *p) 191114402Sru{ 192114402Sru for (;;) 193114402Sru if (*p++ == '\n') { 194114402Sru if (p == bufend) 195114402Sru break; 196114402Sru const char *q; 197114402Sru for (q = p; *q == ' ' || *q == '\t'; q++) 198114402Sru ; 199114402Sru if (*q == '\n') 200114402Sru break; 201114402Sru p = q + 1; 202114402Sru } 203114402Sru return p; 204114402Sru} 205114402Sru 206114402Sru 207114402Sruint linear_searcher::check_match(const char *buf, const char *bufend, 208114402Sru const char *match, int matchlen, 209114402Sru const char **cont, const char **start) const 210114402Sru{ 211114402Sru *cont = match + 1; 212114402Sru // The user is required to supply only the first truncate_len characters 213114402Sru // of the key. If truncate_len <= 0, he must supply all the key. 214114402Sru if ((truncate_len <= 0 || matchlen < truncate_len) 215114402Sru && map[uchar(match[matchlen])] != '\0') 216114402Sru return 0; 217114402Sru 218114402Sru // The character before the match must not be an alphanumeric 219114402Sru // character (unless the alphanumeric character follows one or two 220114402Sru // percent characters at the beginning of the line), nor must it be 221114402Sru // a percent character at the beginning of a line, nor a percent 222114402Sru // character following a percent character at the beginning of a 223114402Sru // line. 224114402Sru 225114402Sru switch (match - buf) { 226114402Sru case 0: 227114402Sru break; 228114402Sru case 1: 229114402Sru if (match[-1] == '%' || map[uchar(match[-1])] != '\0') 230114402Sru return 0; 231114402Sru break; 232114402Sru case 2: 233114402Sru if (map[uchar(match[-1])] != '\0' && match[-2] != '%') 234114402Sru return 0; 235114402Sru if (match[-1] == '%' 236114402Sru && (match[-2] == '\n' || match[-2] == '%')) 237114402Sru return 0; 238114402Sru break; 239114402Sru default: 240114402Sru if (map[uchar(match[-1])] != '\0' 241114402Sru && !(match[-2] == '%' 242114402Sru && (match[-3] == '\n' 243114402Sru || (match[-3] == '%' && match[-4] == '\n')))) 244114402Sru return 0; 245114402Sru if (match[-1] == '%' 246114402Sru && (match[-2] == '\n' 247114402Sru || (match[-2] == '%' && match[-3] == '\n'))) 248114402Sru return 0; 249114402Sru } 250114402Sru 251114402Sru const char *p = match; 252114402Sru int had_percent = 0; 253114402Sru for (;;) { 254114402Sru if (*p == '\n') { 255114402Sru if (!had_percent && p[1] == '%') { 256114402Sru if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) { 257114402Sru *cont = skip_field(bufend, match + matchlen); 258114402Sru return 0; 259114402Sru } 260114402Sru if (!start) 261114402Sru break; 262114402Sru had_percent = 1; 263114402Sru } 264114402Sru if (p <= buf) { 265114402Sru if (start) 266114402Sru *start = p + 1; 267114402Sru return 1; 268114402Sru } 269114402Sru const char *q; 270114402Sru for (q = p - 1; *q == ' ' || *q == '\t'; q--) 271114402Sru ; 272114402Sru if (*q == '\n') { 273114402Sru if (start) 274114402Sru *start = p + 1; 275114402Sru break; 276114402Sru } 277114402Sru p = q; 278114402Sru } 279114402Sru p--; 280114402Sru } 281114402Sru return 1; 282114402Sru} 283114402Sru 284114402Srufile_buffer::file_buffer() 285114402Sru: buffer(0), bufend(0) 286114402Sru{ 287114402Sru} 288114402Sru 289114402Srufile_buffer::~file_buffer() 290114402Sru{ 291114402Sru a_delete buffer; 292114402Sru} 293114402Sru 294114402Sruconst char *file_buffer::get_start() const 295114402Sru{ 296114402Sru return buffer ? buffer + 4 : 0; 297114402Sru} 298114402Sru 299114402Sruconst char *file_buffer::get_end() const 300114402Sru{ 301114402Sru return bufend; 302114402Sru} 303114402Sru 304114402Sruint file_buffer::load(int fd, const char *filename) 305114402Sru{ 306114402Sru struct stat sb; 307114402Sru if (fstat(fd, &sb) < 0) 308114402Sru error("can't fstat `%1': %2", filename, strerror(errno)); 309114402Sru else if (!S_ISREG(sb.st_mode)) 310114402Sru error("`%1' is not a regular file", filename); 311114402Sru else { 312114402Sru // We need one character extra at the beginning for an additional newline 313114402Sru // used as a sentinel. We get 4 instead so that the read buffer will be 314114402Sru // word-aligned. This seems to make the read slightly faster. We also 315114402Sru // need one character at the end also for an additional newline used as a 316114402Sru // sentinel. 317114402Sru int size = int(sb.st_size); 318114402Sru buffer = new char[size + 4 + 1]; 319114402Sru int nread = read(fd, buffer + 4, size); 320114402Sru if (nread < 0) 321114402Sru error("error reading `%1': %2", filename, strerror(errno)); 322114402Sru else if (nread != size) 323114402Sru error("size of `%1' decreased", filename); 324114402Sru else { 325114402Sru char c; 326114402Sru nread = read(fd, &c, 1); 327114402Sru if (nread != 0) 328114402Sru error("size of `%1' increased", filename); 329114402Sru else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0) 330114402Sru error("database `%1' is a binary file", filename); 331114402Sru else { 332114402Sru close(fd); 333114402Sru buffer[3] = '\n'; 334114402Sru int sidx = 4, didx = 4; 335114402Sru for ( ; sidx < size + 4; sidx++, didx++) 336114402Sru { 337114402Sru if (buffer[sidx] == '\r') 338114402Sru { 339114402Sru if (buffer[++sidx] != '\n') 340114402Sru buffer[didx++] = '\r'; 341114402Sru else 342114402Sru size--; 343114402Sru } 344114402Sru if (sidx != didx) 345114402Sru buffer[didx] = buffer[sidx]; 346114402Sru } 347114402Sru bufend = buffer + 4 + size; 348114402Sru if (bufend[-1] != '\n') 349114402Sru *bufend++ = '\n'; 350114402Sru return 1; 351114402Sru } 352114402Sru } 353114402Sru a_delete buffer; 354114402Sru buffer = 0; 355114402Sru } 356114402Sru close(fd); 357114402Sru return 0; 358114402Sru} 359114402Sru 360114402Srulinear_searcher::linear_searcher(const char *query, int query_len, 361114402Sru const char *ign, int trunc) 362114402Sru: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0) 363114402Sru{ 364114402Sru const char *query_end = query + query_len; 365114402Sru int nk = 0; 366114402Sru const char *p; 367114402Sru for (p = query; p < query_end; p++) 368114402Sru if (map[uchar(*p)] != '\0' 369114402Sru && (p[1] == '\0' || map[uchar(p[1])] == '\0')) 370114402Sru nk++; 371114402Sru if (nk == 0) 372114402Sru return; 373114402Sru keys = new bmpattern*[nk]; 374114402Sru p = query; 375114402Sru for (;;) { 376114402Sru while (p < query_end && map[uchar(*p)] == '\0') 377114402Sru p++; 378114402Sru if (p == query_end) 379114402Sru break; 380114402Sru const char *start = p; 381114402Sru while (p < query_end && map[uchar(*p)] != '\0') 382114402Sru p++; 383114402Sru keys[nkeys++] = new bmpattern(start, p - start); 384114402Sru } 385114402Sru assert(nkeys <= nk); 386114402Sru if (nkeys == 0) { 387114402Sru a_delete keys; 388114402Sru keys = 0; 389114402Sru } 390114402Sru} 391114402Sru 392114402Srulinear_searcher::~linear_searcher() 393114402Sru{ 394114402Sru for (int i = 0; i < nkeys; i++) 395114402Sru delete keys[i]; 396114402Sru a_delete keys; 397114402Sru} 398114402Sru 399114402Sruint linear_searcher::search(const char *buffer, const char *bufend, 400114402Sru const char **startp, int *lengthp) const 401114402Sru{ 402114402Sru assert(bufend - buffer > 0); 403114402Sru assert(buffer[-1] == '\n'); 404114402Sru assert(bufend[-1] == '\n'); 405114402Sru if (nkeys == 0) 406114402Sru return 0; 407114402Sru for (;;) { 408114402Sru const char *refstart; 409114402Sru const char *found = search_and_check(keys[0], buffer, bufend, &refstart); 410114402Sru if (!found) 411114402Sru break; 412114402Sru const char *refend = find_end(bufend, found + keys[0]->length()); 413114402Sru int i; 414114402Sru for (i = 1; i < nkeys; i++) 415114402Sru if (!search_and_check(keys[i], refstart, refend)) 416114402Sru break; 417114402Sru if (i >= nkeys) { 418114402Sru *startp = refstart; 419114402Sru *lengthp = refend - refstart; 420114402Sru return 1; 421114402Sru } 422114402Sru buffer = refend; 423114402Sru } 424114402Sru return 0; 425114402Sru} 426114402Sru 427114402Sruclass linear_search_item : public search_item { 428114402Sru file_buffer fbuf; 429114402Srupublic: 430114402Sru linear_search_item(const char *filename, int fid); 431114402Sru ~linear_search_item(); 432114402Sru int load(int fd); 433114402Sru search_item_iterator *make_search_item_iterator(const char *); 434114402Sru friend class linear_search_item_iterator; 435114402Sru}; 436114402Sru 437114402Sruclass linear_search_item_iterator : public search_item_iterator { 438114402Sru linear_search_item *lsi; 439114402Sru int pos; 440114402Srupublic: 441114402Sru linear_search_item_iterator(linear_search_item *, const char *query); 442114402Sru ~linear_search_item_iterator(); 443114402Sru int next(const linear_searcher &, const char **ptr, int *lenp, 444114402Sru reference_id *ridp); 445114402Sru}; 446114402Sru 447114402Srusearch_item *make_linear_search_item(int fd, const char *filename, int fid) 448114402Sru{ 449114402Sru linear_search_item *item = new linear_search_item(filename, fid); 450114402Sru if (!item->load(fd)) { 451114402Sru delete item; 452114402Sru return 0; 453114402Sru } 454114402Sru else 455114402Sru return item; 456114402Sru} 457114402Sru 458114402Srulinear_search_item::linear_search_item(const char *filename, int fid) 459114402Sru: search_item(filename, fid) 460114402Sru{ 461114402Sru} 462114402Sru 463114402Srulinear_search_item::~linear_search_item() 464114402Sru{ 465114402Sru} 466114402Sru 467114402Sruint linear_search_item::load(int fd) 468114402Sru{ 469114402Sru return fbuf.load(fd, name); 470114402Sru} 471114402Sru 472114402Srusearch_item_iterator *linear_search_item::make_search_item_iterator( 473114402Sru const char *query) 474114402Sru{ 475114402Sru return new linear_search_item_iterator(this, query); 476114402Sru} 477114402Sru 478114402Srulinear_search_item_iterator::linear_search_item_iterator( 479114402Sru linear_search_item *p, const char *) 480114402Sru: lsi(p), pos(0) 481114402Sru{ 482114402Sru} 483114402Sru 484114402Srulinear_search_item_iterator::~linear_search_item_iterator() 485114402Sru{ 486114402Sru} 487114402Sru 488114402Sruint linear_search_item_iterator::next(const linear_searcher &searcher, 489114402Sru const char **startp, int *lengthp, 490114402Sru reference_id *ridp) 491114402Sru{ 492114402Sru const char *bufstart = lsi->fbuf.get_start(); 493114402Sru const char *bufend = lsi->fbuf.get_end(); 494114402Sru const char *ptr = bufstart + pos; 495114402Sru if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) { 496114402Sru pos = *startp + *lengthp - bufstart; 497114402Sru if (ridp) 498114402Sru *ridp = reference_id(lsi->filename_id, *startp - bufstart); 499114402Sru return 1; 500114402Sru } 501114402Sru else 502114402Sru return 0; 503114402Sru} 504