1114402Sru// -*- C++ -*-
2114402Sru/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001
3114402Sru   Free Software Foundation, Inc.
4114402Sru     Written by James Clark (jjc@jclark.com)
5114402Sru
6114402SruThis file is part of groff.
7114402Sru
8114402Srugroff is free software; you can redistribute it and/or modify it under
9114402Sruthe terms of the GNU General Public License as published by the Free
10114402SruSoftware Foundation; either version 2, or (at your option) any later
11114402Sruversion.
12114402Sru
13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY
14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or
15114402SruFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16114402Srufor more details.
17114402Sru
18114402SruYou should have received a copy of the GNU General Public License along
19114402Sruwith groff; see the file COPYING.  If not, write to the Free Software
20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21114402Sru
22114402Sru#include "lib.h"
23114402Sru
24114402Sru#include <stdlib.h>
25114402Sru#include <assert.h>
26114402Sru#include <errno.h>
27114402Sru
28114402Sru#include "posix.h"
29114402Sru#include "errarg.h"
30114402Sru#include "error.h"
31114402Sru#include "cset.h"
32114402Sru#include "cmap.h"
33114402Sru#include "nonposix.h"
34114402Sru
35114402Sru#include "refid.h"
36114402Sru#include "search.h"
37114402Sru
38114402Sruclass file_buffer {
39114402Sru  char *buffer;
40114402Sru  char *bufend;
41114402Srupublic:
42114402Sru  file_buffer();
43114402Sru  ~file_buffer();
44114402Sru  int load(int fd, const char *filename);
45114402Sru  const char *get_start() const;
46114402Sru  const char *get_end() const;
47114402Sru};
48114402Sru
49114402Srutypedef unsigned char uchar;
50114402Sru
51114402Srustatic uchar map[256];
52114402Srustatic uchar inv_map[256][3];
53114402Sru
54114402Srustruct map_init {
55114402Sru  map_init();
56114402Sru};
57114402Sru
58114402Srustatic map_init the_map_init;
59114402Sru
60114402Srumap_init::map_init()
61114402Sru{
62114402Sru  int i;
63114402Sru  for (i = 0; i < 256; i++)
64114402Sru    map[i] = csalnum(i) ? cmlower(i) : '\0';
65114402Sru  for (i = 0; i < 256; i++) {
66114402Sru    if (cslower(i)) {
67114402Sru      inv_map[i][0] = i;
68114402Sru      inv_map[i][1] = cmupper(i);
69114402Sru      inv_map[i][2] = '\0';
70114402Sru    }
71114402Sru    else if (csdigit(i)) {
72114402Sru      inv_map[i][0] = i;
73114402Sru      inv_map[i][1] = 0;
74114402Sru    }
75114402Sru    else
76114402Sru      inv_map[i][0] = '\0';
77114402Sru  }
78114402Sru}
79114402Sru
80114402Sru
81114402Sruclass bmpattern {
82114402Sru  char *pat;
83114402Sru  int len;
84114402Sru  int delta[256];
85114402Srupublic:
86114402Sru  bmpattern(const char *pattern, int pattern_length);
87114402Sru  ~bmpattern();
88114402Sru  const char *search(const char *p, const char *end) const;
89114402Sru  int length() const;
90114402Sru};
91114402Sru
92114402Srubmpattern::bmpattern(const char *pattern, int pattern_length)
93114402Sru: len(pattern_length)
94114402Sru{
95114402Sru  pat = new char[len];
96114402Sru  int i;
97114402Sru  for (i = 0; i < len; i++)
98114402Sru    pat[i] = map[uchar(pattern[i])];
99114402Sru  for (i = 0; i < 256; i++)
100114402Sru    delta[i] = len;
101114402Sru  for (i = 0; i < len; i++)
102114402Sru    for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
103114402Sru      delta[*inv] = len - i - 1;
104114402Sru}
105114402Sru
106114402Sruconst char *bmpattern::search(const char *buf, const char *end) const
107114402Sru{
108114402Sru  int buflen = end - buf;
109114402Sru  if (len > buflen)
110114402Sru    return 0;
111114402Sru  const char *strend;
112114402Sru  if (buflen > len*4)
113114402Sru    strend = end - len*4;
114114402Sru  else
115114402Sru    strend = buf;
116114402Sru  const char *k = buf + len - 1;
117114402Sru  const int *del = delta;
118114402Sru  const char *pattern = pat;
119114402Sru  for (;;) {
120114402Sru    while (k < strend) {
121114402Sru      int t = del[uchar(*k)];
122114402Sru      if (!t)
123114402Sru	break;
124114402Sru      k += t;
125114402Sru      k += del[uchar(*k)];
126114402Sru      k += del[uchar(*k)];
127114402Sru    }
128114402Sru    while (k < end && del[uchar(*k)] != 0)
129114402Sru      k++;
130114402Sru    if (k == end)
131114402Sru      break;
132114402Sru    int j = len - 1;
133114402Sru    const char *s = k;
134114402Sru    for (;;) {
135114402Sru      if (j == 0)
136114402Sru	return s;
137114402Sru      if (map[uchar(*--s)] != uchar(pattern[--j]))
138114402Sru	break;
139114402Sru    }
140114402Sru    k++;
141114402Sru  }
142114402Sru  return 0;
143114402Sru}
144114402Sru
145114402Srubmpattern::~bmpattern()
146114402Sru{
147114402Sru  a_delete pat;
148114402Sru}
149114402Sru
150114402Sruinline int bmpattern::length() const
151114402Sru{
152114402Sru  return len;
153114402Sru}
154114402Sru
155114402Sru
156114402Srustatic const char *find_end(const char *bufend, const char *p);
157114402Sru
158114402Sruconst char *linear_searcher::search_and_check(const bmpattern *key,
159114402Sru  const char *buf, const char *bufend, const char **start) const
160114402Sru{
161114402Sru  assert(buf[-1] == '\n');
162114402Sru  assert(bufend[-1] == '\n');
163114402Sru  const char *ptr = buf;
164114402Sru  for (;;) {
165114402Sru    const char *found = key->search(ptr, bufend);
166114402Sru    if (!found)
167114402Sru      break;
168114402Sru    if (check_match(buf, bufend, found, key->length(), &ptr, start))
169114402Sru      return found;
170114402Sru  }
171114402Sru  return 0;
172114402Sru}
173114402Sru
174114402Srustatic const char *skip_field(const char *end, const char *p)
175114402Sru{
176114402Sru  for (;;)
177114402Sru    if (*p++ == '\n') {
178114402Sru      if (p == end || *p == '%')
179114402Sru	break;
180114402Sru      const char *q;
181114402Sru      for (q = p; *q == ' ' || *q == '\t'; q++)
182114402Sru	;
183114402Sru      if (*q == '\n')
184114402Sru	break;
185114402Sru      p = q + 1;
186114402Sru    }
187114402Sru  return p;
188114402Sru}
189114402Sru
190114402Srustatic const char *find_end(const char *bufend, const char *p)
191114402Sru{
192114402Sru  for (;;)
193114402Sru    if (*p++ == '\n') {
194114402Sru      if (p == bufend)
195114402Sru	break;
196114402Sru      const char *q;
197114402Sru      for (q = p; *q == ' ' || *q == '\t'; q++)
198114402Sru	;
199114402Sru      if (*q == '\n')
200114402Sru	break;
201114402Sru      p = q + 1;
202114402Sru    }
203114402Sru  return p;
204114402Sru}
205114402Sru
206114402Sru
207114402Sruint linear_searcher::check_match(const char *buf, const char *bufend,
208114402Sru				 const char *match, int matchlen,
209114402Sru				 const char **cont, const char **start) const
210114402Sru{
211114402Sru  *cont = match + 1;
212114402Sru  // The user is required to supply only the first truncate_len characters
213114402Sru  // of the key.  If truncate_len <= 0, he must supply all the key.
214114402Sru  if ((truncate_len <= 0 || matchlen < truncate_len)
215114402Sru      && map[uchar(match[matchlen])] != '\0')
216114402Sru    return 0;
217114402Sru
218114402Sru  // The character before the match must not be an alphanumeric
219114402Sru  // character (unless the alphanumeric character follows one or two
220114402Sru  // percent characters at the beginning of the line), nor must it be
221114402Sru  // a percent character at the beginning of a line, nor a percent
222114402Sru  // character following a percent character at the beginning of a
223114402Sru  // line.
224114402Sru
225114402Sru  switch (match - buf) {
226114402Sru  case 0:
227114402Sru    break;
228114402Sru  case 1:
229114402Sru    if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
230114402Sru      return 0;
231114402Sru    break;
232114402Sru  case 2:
233114402Sru    if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
234114402Sru      return 0;
235114402Sru    if (match[-1] == '%'
236114402Sru	&& (match[-2] == '\n' || match[-2] == '%'))
237114402Sru      return 0;
238114402Sru    break;
239114402Sru  default:
240114402Sru    if (map[uchar(match[-1])] != '\0'
241114402Sru	&& !(match[-2] == '%'
242114402Sru	     && (match[-3] == '\n'
243114402Sru		 || (match[-3] == '%' && match[-4] == '\n'))))
244114402Sru      return 0;
245114402Sru    if (match[-1] == '%'
246114402Sru	&& (match[-2] == '\n'
247114402Sru	    || (match[-2] == '%' && match[-3] == '\n')))
248114402Sru      return 0;
249114402Sru  }
250114402Sru
251114402Sru  const char *p = match;
252114402Sru  int had_percent = 0;
253114402Sru  for (;;) {
254114402Sru    if (*p == '\n') {
255114402Sru      if (!had_percent && p[1] == '%') {
256114402Sru	if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
257114402Sru	  *cont = skip_field(bufend, match + matchlen);
258114402Sru	  return 0;
259114402Sru	}
260114402Sru	if (!start)
261114402Sru	  break;
262114402Sru	had_percent = 1;
263114402Sru      }
264114402Sru      if (p <= buf) {
265114402Sru	if (start)
266114402Sru	  *start = p + 1;
267114402Sru	return 1;
268114402Sru      }
269114402Sru      const char *q;
270114402Sru      for (q = p - 1; *q == ' ' || *q == '\t'; q--)
271114402Sru	;
272114402Sru      if (*q == '\n') {
273114402Sru	if (start)
274114402Sru	  *start = p + 1;
275114402Sru	break;
276114402Sru      }
277114402Sru      p = q;
278114402Sru    }
279114402Sru    p--;
280114402Sru  }
281114402Sru  return 1;
282114402Sru}
283114402Sru
284114402Srufile_buffer::file_buffer()
285114402Sru: buffer(0), bufend(0)
286114402Sru{
287114402Sru}
288114402Sru
289114402Srufile_buffer::~file_buffer()
290114402Sru{
291114402Sru  a_delete buffer;
292114402Sru}
293114402Sru
294114402Sruconst char *file_buffer::get_start() const
295114402Sru{
296114402Sru  return buffer ? buffer + 4 : 0;
297114402Sru}
298114402Sru
299114402Sruconst char *file_buffer::get_end() const
300114402Sru{
301114402Sru  return bufend;
302114402Sru}
303114402Sru
304114402Sruint file_buffer::load(int fd, const char *filename)
305114402Sru{
306114402Sru  struct stat sb;
307114402Sru  if (fstat(fd, &sb) < 0)
308114402Sru    error("can't fstat `%1': %2", filename, strerror(errno));
309114402Sru  else if (!S_ISREG(sb.st_mode))
310114402Sru    error("`%1' is not a regular file", filename);
311114402Sru  else {
312114402Sru    // We need one character extra at the beginning for an additional newline
313114402Sru    // used as a sentinel.  We get 4 instead so that the read buffer will be
314114402Sru    // word-aligned.  This seems to make the read slightly faster.  We also
315114402Sru    // need one character at the end also for an additional newline used as a
316114402Sru    // sentinel.
317114402Sru    int size = int(sb.st_size);
318114402Sru    buffer = new char[size + 4 + 1];
319114402Sru    int nread = read(fd, buffer + 4, size);
320114402Sru    if (nread < 0)
321114402Sru      error("error reading `%1': %2", filename, strerror(errno));
322114402Sru    else if (nread != size)
323114402Sru      error("size of `%1' decreased", filename);
324114402Sru    else {
325114402Sru      char c;
326114402Sru      nread = read(fd, &c, 1);
327114402Sru      if (nread != 0)
328114402Sru	error("size of `%1' increased", filename);
329114402Sru      else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
330114402Sru	error("database `%1' is a binary file", filename);
331114402Sru      else {
332114402Sru	close(fd);
333114402Sru	buffer[3] = '\n';
334114402Sru	int sidx = 4, didx = 4;
335114402Sru	for ( ; sidx < size + 4; sidx++, didx++)
336114402Sru	  {
337114402Sru	    if (buffer[sidx] == '\r')
338114402Sru	      {
339114402Sru		if (buffer[++sidx] != '\n')
340114402Sru		  buffer[didx++] = '\r';
341114402Sru		else
342114402Sru		  size--;
343114402Sru	      }
344114402Sru	    if (sidx != didx)
345114402Sru	      buffer[didx] = buffer[sidx];
346114402Sru	  }
347114402Sru	bufend = buffer + 4 + size;
348114402Sru	if (bufend[-1] != '\n')
349114402Sru	  *bufend++ = '\n';
350114402Sru	return 1;
351114402Sru      }
352114402Sru    }
353114402Sru    a_delete buffer;
354114402Sru    buffer = 0;
355114402Sru  }
356114402Sru  close(fd);
357114402Sru  return 0;
358114402Sru}
359114402Sru
360114402Srulinear_searcher::linear_searcher(const char *query, int query_len,
361114402Sru				 const char *ign, int trunc)
362114402Sru: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
363114402Sru{
364114402Sru  const char *query_end = query + query_len;
365114402Sru  int nk = 0;
366114402Sru  const char *p;
367114402Sru  for (p = query; p < query_end; p++)
368114402Sru    if (map[uchar(*p)] != '\0'
369114402Sru	&& (p[1] == '\0' || map[uchar(p[1])] == '\0'))
370114402Sru      nk++;
371114402Sru  if (nk == 0)
372114402Sru    return;
373114402Sru  keys = new bmpattern*[nk];
374114402Sru  p = query;
375114402Sru  for (;;) {
376114402Sru    while (p < query_end && map[uchar(*p)] == '\0')
377114402Sru      p++;
378114402Sru    if (p == query_end)
379114402Sru      break;
380114402Sru    const char *start = p;
381114402Sru    while (p < query_end && map[uchar(*p)] != '\0')
382114402Sru      p++;
383114402Sru    keys[nkeys++] = new bmpattern(start, p - start);
384114402Sru  }
385114402Sru  assert(nkeys <= nk);
386114402Sru  if (nkeys == 0) {
387114402Sru    a_delete keys;
388114402Sru    keys = 0;
389114402Sru  }
390114402Sru}
391114402Sru
392114402Srulinear_searcher::~linear_searcher()
393114402Sru{
394114402Sru  for (int i = 0; i < nkeys; i++)
395114402Sru    delete keys[i];
396114402Sru  a_delete keys;
397114402Sru}
398114402Sru
399114402Sruint linear_searcher::search(const char *buffer, const char *bufend,
400114402Sru			    const char **startp, int *lengthp) const
401114402Sru{
402114402Sru  assert(bufend - buffer > 0);
403114402Sru  assert(buffer[-1] == '\n');
404114402Sru  assert(bufend[-1] == '\n');
405114402Sru  if (nkeys == 0)
406114402Sru    return 0;
407114402Sru  for (;;) {
408114402Sru    const char *refstart;
409114402Sru    const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
410114402Sru    if (!found)
411114402Sru      break;
412114402Sru    const char *refend = find_end(bufend, found + keys[0]->length());
413114402Sru    int i;
414114402Sru    for (i = 1; i < nkeys; i++)
415114402Sru      if (!search_and_check(keys[i], refstart, refend))
416114402Sru	break;
417114402Sru    if (i >= nkeys) {
418114402Sru      *startp = refstart;
419114402Sru      *lengthp = refend - refstart;
420114402Sru      return 1;
421114402Sru    }
422114402Sru    buffer = refend;
423114402Sru  }
424114402Sru  return 0;
425114402Sru}
426114402Sru
427114402Sruclass linear_search_item : public search_item {
428114402Sru  file_buffer fbuf;
429114402Srupublic:
430114402Sru  linear_search_item(const char *filename, int fid);
431114402Sru  ~linear_search_item();
432114402Sru  int load(int fd);
433114402Sru  search_item_iterator *make_search_item_iterator(const char *);
434114402Sru  friend class linear_search_item_iterator;
435114402Sru};
436114402Sru
437114402Sruclass linear_search_item_iterator : public search_item_iterator {
438114402Sru  linear_search_item *lsi;
439114402Sru  int pos;
440114402Srupublic:
441114402Sru  linear_search_item_iterator(linear_search_item *, const char *query);
442114402Sru  ~linear_search_item_iterator();
443114402Sru  int next(const linear_searcher &, const char **ptr, int *lenp,
444114402Sru	   reference_id *ridp);
445114402Sru};
446114402Sru
447114402Srusearch_item *make_linear_search_item(int fd, const char *filename, int fid)
448114402Sru{
449114402Sru  linear_search_item *item = new linear_search_item(filename, fid);
450114402Sru  if (!item->load(fd)) {
451114402Sru    delete item;
452114402Sru    return 0;
453114402Sru  }
454114402Sru  else
455114402Sru    return item;
456114402Sru}
457114402Sru
458114402Srulinear_search_item::linear_search_item(const char *filename, int fid)
459114402Sru: search_item(filename, fid)
460114402Sru{
461114402Sru}
462114402Sru
463114402Srulinear_search_item::~linear_search_item()
464114402Sru{
465114402Sru}
466114402Sru
467114402Sruint linear_search_item::load(int fd)
468114402Sru{
469114402Sru  return fbuf.load(fd, name);
470114402Sru}
471114402Sru
472114402Srusearch_item_iterator *linear_search_item::make_search_item_iterator(
473114402Sru  const char *query)
474114402Sru{
475114402Sru  return new linear_search_item_iterator(this, query);
476114402Sru}
477114402Sru
478114402Srulinear_search_item_iterator::linear_search_item_iterator(
479114402Sru  linear_search_item *p, const char *)
480114402Sru: lsi(p), pos(0)
481114402Sru{
482114402Sru}
483114402Sru
484114402Srulinear_search_item_iterator::~linear_search_item_iterator()
485114402Sru{
486114402Sru}
487114402Sru
488114402Sruint linear_search_item_iterator::next(const linear_searcher &searcher,
489114402Sru				      const char **startp, int *lengthp,
490114402Sru				      reference_id *ridp)
491114402Sru{
492114402Sru  const char *bufstart = lsi->fbuf.get_start();
493114402Sru  const char *bufend = lsi->fbuf.get_end();
494114402Sru  const char *ptr = bufstart + pos;
495114402Sru  if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
496114402Sru    pos = *startp + *lengthp - bufstart;
497114402Sru    if (ridp)
498114402Sru      *ridp = reference_id(lsi->filename_id, *startp - bufstart);
499114402Sru    return 1;
500114402Sru  }
501114402Sru  else
502114402Sru    return 0;
503114402Sru}
504