1// -*- C++ -*-
2/* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001
3   Free Software Foundation, Inc.
4     Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License along
19with groff; see the file COPYING.  If not, write to the Free Software
20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22#include "lib.h"
23
24#include <stdlib.h>
25#include <assert.h>
26#include <errno.h>
27
28#include "posix.h"
29#include "errarg.h"
30#include "error.h"
31#include "cset.h"
32#include "cmap.h"
33#include "nonposix.h"
34
35#include "refid.h"
36#include "search.h"
37
38class file_buffer {
39  char *buffer;
40  char *bufend;
41public:
42  file_buffer();
43  ~file_buffer();
44  int load(int fd, const char *filename);
45  const char *get_start() const;
46  const char *get_end() const;
47};
48
49typedef unsigned char uchar;
50
51static uchar map[256];
52static uchar inv_map[256][3];
53
54struct map_init {
55  map_init();
56};
57
58static map_init the_map_init;
59
60map_init::map_init()
61{
62  int i;
63  for (i = 0; i < 256; i++)
64    map[i] = csalnum(i) ? cmlower(i) : '\0';
65  for (i = 0; i < 256; i++) {
66    if (cslower(i)) {
67      inv_map[i][0] = i;
68      inv_map[i][1] = cmupper(i);
69      inv_map[i][2] = '\0';
70    }
71    else if (csdigit(i)) {
72      inv_map[i][0] = i;
73      inv_map[i][1] = 0;
74    }
75    else
76      inv_map[i][0] = '\0';
77  }
78}
79
80
81class bmpattern {
82  char *pat;
83  int len;
84  int delta[256];
85public:
86  bmpattern(const char *pattern, int pattern_length);
87  ~bmpattern();
88  const char *search(const char *p, const char *end) const;
89  int length() const;
90};
91
92bmpattern::bmpattern(const char *pattern, int pattern_length)
93: len(pattern_length)
94{
95  pat = new char[len];
96  int i;
97  for (i = 0; i < len; i++)
98    pat[i] = map[uchar(pattern[i])];
99  for (i = 0; i < 256; i++)
100    delta[i] = len;
101  for (i = 0; i < len; i++)
102    for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
103      delta[*inv] = len - i - 1;
104}
105
106const char *bmpattern::search(const char *buf, const char *end) const
107{
108  int buflen = end - buf;
109  if (len > buflen)
110    return 0;
111  const char *strend;
112  if (buflen > len*4)
113    strend = end - len*4;
114  else
115    strend = buf;
116  const char *k = buf + len - 1;
117  const int *del = delta;
118  const char *pattern = pat;
119  for (;;) {
120    while (k < strend) {
121      int t = del[uchar(*k)];
122      if (!t)
123	break;
124      k += t;
125      k += del[uchar(*k)];
126      k += del[uchar(*k)];
127    }
128    while (k < end && del[uchar(*k)] != 0)
129      k++;
130    if (k == end)
131      break;
132    int j = len - 1;
133    const char *s = k;
134    for (;;) {
135      if (j == 0)
136	return s;
137      if (map[uchar(*--s)] != uchar(pattern[--j]))
138	break;
139    }
140    k++;
141  }
142  return 0;
143}
144
145bmpattern::~bmpattern()
146{
147  a_delete pat;
148}
149
150inline int bmpattern::length() const
151{
152  return len;
153}
154
155
156static const char *find_end(const char *bufend, const char *p);
157
158const char *linear_searcher::search_and_check(const bmpattern *key,
159  const char *buf, const char *bufend, const char **start) const
160{
161  assert(buf[-1] == '\n');
162  assert(bufend[-1] == '\n');
163  const char *ptr = buf;
164  for (;;) {
165    const char *found = key->search(ptr, bufend);
166    if (!found)
167      break;
168    if (check_match(buf, bufend, found, key->length(), &ptr, start))
169      return found;
170  }
171  return 0;
172}
173
174static const char *skip_field(const char *end, const char *p)
175{
176  for (;;)
177    if (*p++ == '\n') {
178      if (p == end || *p == '%')
179	break;
180      const char *q;
181      for (q = p; *q == ' ' || *q == '\t'; q++)
182	;
183      if (*q == '\n')
184	break;
185      p = q + 1;
186    }
187  return p;
188}
189
190static const char *find_end(const char *bufend, const char *p)
191{
192  for (;;)
193    if (*p++ == '\n') {
194      if (p == bufend)
195	break;
196      const char *q;
197      for (q = p; *q == ' ' || *q == '\t'; q++)
198	;
199      if (*q == '\n')
200	break;
201      p = q + 1;
202    }
203  return p;
204}
205
206
207int linear_searcher::check_match(const char *buf, const char *bufend,
208				 const char *match, int matchlen,
209				 const char **cont, const char **start) const
210{
211  *cont = match + 1;
212  // The user is required to supply only the first truncate_len characters
213  // of the key.  If truncate_len <= 0, he must supply all the key.
214  if ((truncate_len <= 0 || matchlen < truncate_len)
215      && map[uchar(match[matchlen])] != '\0')
216    return 0;
217
218  // The character before the match must not be an alphanumeric
219  // character (unless the alphanumeric character follows one or two
220  // percent characters at the beginning of the line), nor must it be
221  // a percent character at the beginning of a line, nor a percent
222  // character following a percent character at the beginning of a
223  // line.
224
225  switch (match - buf) {
226  case 0:
227    break;
228  case 1:
229    if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
230      return 0;
231    break;
232  case 2:
233    if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
234      return 0;
235    if (match[-1] == '%'
236	&& (match[-2] == '\n' || match[-2] == '%'))
237      return 0;
238    break;
239  default:
240    if (map[uchar(match[-1])] != '\0'
241	&& !(match[-2] == '%'
242	     && (match[-3] == '\n'
243		 || (match[-3] == '%' && match[-4] == '\n'))))
244      return 0;
245    if (match[-1] == '%'
246	&& (match[-2] == '\n'
247	    || (match[-2] == '%' && match[-3] == '\n')))
248      return 0;
249  }
250
251  const char *p = match;
252  int had_percent = 0;
253  for (;;) {
254    if (*p == '\n') {
255      if (!had_percent && p[1] == '%') {
256	if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
257	  *cont = skip_field(bufend, match + matchlen);
258	  return 0;
259	}
260	if (!start)
261	  break;
262	had_percent = 1;
263      }
264      if (p <= buf) {
265	if (start)
266	  *start = p + 1;
267	return 1;
268      }
269      const char *q;
270      for (q = p - 1; *q == ' ' || *q == '\t'; q--)
271	;
272      if (*q == '\n') {
273	if (start)
274	  *start = p + 1;
275	break;
276      }
277      p = q;
278    }
279    p--;
280  }
281  return 1;
282}
283
284file_buffer::file_buffer()
285: buffer(0), bufend(0)
286{
287}
288
289file_buffer::~file_buffer()
290{
291  a_delete buffer;
292}
293
294const char *file_buffer::get_start() const
295{
296  return buffer ? buffer + 4 : 0;
297}
298
299const char *file_buffer::get_end() const
300{
301  return bufend;
302}
303
304int file_buffer::load(int fd, const char *filename)
305{
306  struct stat sb;
307  if (fstat(fd, &sb) < 0)
308    error("can't fstat `%1': %2", filename, strerror(errno));
309  else if (!S_ISREG(sb.st_mode))
310    error("`%1' is not a regular file", filename);
311  else {
312    // We need one character extra at the beginning for an additional newline
313    // used as a sentinel.  We get 4 instead so that the read buffer will be
314    // word-aligned.  This seems to make the read slightly faster.  We also
315    // need one character at the end also for an additional newline used as a
316    // sentinel.
317    int size = int(sb.st_size);
318    buffer = new char[size + 4 + 1];
319    int nread = read(fd, buffer + 4, size);
320    if (nread < 0)
321      error("error reading `%1': %2", filename, strerror(errno));
322    else if (nread != size)
323      error("size of `%1' decreased", filename);
324    else {
325      char c;
326      nread = read(fd, &c, 1);
327      if (nread != 0)
328	error("size of `%1' increased", filename);
329      else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
330	error("database `%1' is a binary file", filename);
331      else {
332	close(fd);
333	buffer[3] = '\n';
334	int sidx = 4, didx = 4;
335	for ( ; sidx < size + 4; sidx++, didx++)
336	  {
337	    if (buffer[sidx] == '\r')
338	      {
339		if (buffer[++sidx] != '\n')
340		  buffer[didx++] = '\r';
341		else
342		  size--;
343	      }
344	    if (sidx != didx)
345	      buffer[didx] = buffer[sidx];
346	  }
347	bufend = buffer + 4 + size;
348	if (bufend[-1] != '\n')
349	  *bufend++ = '\n';
350	return 1;
351      }
352    }
353    a_delete buffer;
354    buffer = 0;
355  }
356  close(fd);
357  return 0;
358}
359
360linear_searcher::linear_searcher(const char *query, int query_len,
361				 const char *ign, int trunc)
362: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
363{
364  const char *query_end = query + query_len;
365  int nk = 0;
366  const char *p;
367  for (p = query; p < query_end; p++)
368    if (map[uchar(*p)] != '\0'
369	&& (p[1] == '\0' || map[uchar(p[1])] == '\0'))
370      nk++;
371  if (nk == 0)
372    return;
373  keys = new bmpattern*[nk];
374  p = query;
375  for (;;) {
376    while (p < query_end && map[uchar(*p)] == '\0')
377      p++;
378    if (p == query_end)
379      break;
380    const char *start = p;
381    while (p < query_end && map[uchar(*p)] != '\0')
382      p++;
383    keys[nkeys++] = new bmpattern(start, p - start);
384  }
385  assert(nkeys <= nk);
386  if (nkeys == 0) {
387    a_delete keys;
388    keys = 0;
389  }
390}
391
392linear_searcher::~linear_searcher()
393{
394  for (int i = 0; i < nkeys; i++)
395    delete keys[i];
396  a_delete keys;
397}
398
399int linear_searcher::search(const char *buffer, const char *bufend,
400			    const char **startp, int *lengthp) const
401{
402  assert(bufend - buffer > 0);
403  assert(buffer[-1] == '\n');
404  assert(bufend[-1] == '\n');
405  if (nkeys == 0)
406    return 0;
407  for (;;) {
408    const char *refstart;
409    const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
410    if (!found)
411      break;
412    const char *refend = find_end(bufend, found + keys[0]->length());
413    int i;
414    for (i = 1; i < nkeys; i++)
415      if (!search_and_check(keys[i], refstart, refend))
416	break;
417    if (i >= nkeys) {
418      *startp = refstart;
419      *lengthp = refend - refstart;
420      return 1;
421    }
422    buffer = refend;
423  }
424  return 0;
425}
426
427class linear_search_item : public search_item {
428  file_buffer fbuf;
429public:
430  linear_search_item(const char *filename, int fid);
431  ~linear_search_item();
432  int load(int fd);
433  search_item_iterator *make_search_item_iterator(const char *);
434  friend class linear_search_item_iterator;
435};
436
437class linear_search_item_iterator : public search_item_iterator {
438  linear_search_item *lsi;
439  int pos;
440public:
441  linear_search_item_iterator(linear_search_item *, const char *query);
442  ~linear_search_item_iterator();
443  int next(const linear_searcher &, const char **ptr, int *lenp,
444	   reference_id *ridp);
445};
446
447search_item *make_linear_search_item(int fd, const char *filename, int fid)
448{
449  linear_search_item *item = new linear_search_item(filename, fid);
450  if (!item->load(fd)) {
451    delete item;
452    return 0;
453  }
454  else
455    return item;
456}
457
458linear_search_item::linear_search_item(const char *filename, int fid)
459: search_item(filename, fid)
460{
461}
462
463linear_search_item::~linear_search_item()
464{
465}
466
467int linear_search_item::load(int fd)
468{
469  return fbuf.load(fd, name);
470}
471
472search_item_iterator *linear_search_item::make_search_item_iterator(
473  const char *query)
474{
475  return new linear_search_item_iterator(this, query);
476}
477
478linear_search_item_iterator::linear_search_item_iterator(
479  linear_search_item *p, const char *)
480: lsi(p), pos(0)
481{
482}
483
484linear_search_item_iterator::~linear_search_item_iterator()
485{
486}
487
488int linear_search_item_iterator::next(const linear_searcher &searcher,
489				      const char **startp, int *lengthp,
490				      reference_id *ridp)
491{
492  const char *bufstart = lsi->fbuf.get_start();
493  const char *bufend = lsi->fbuf.get_end();
494  const char *ptr = bufstart + pos;
495  if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
496    pos = *startp + *lengthp - bufstart;
497    if (ridp)
498      *ridp = reference_id(lsi->filename_id, *startp - bufstart);
499    return 1;
500  }
501  else
502    return 0;
503}
504