1/* kwset.c - search for any of a set of keywords.
2   Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software Foundation,
16   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18/* Written August 1989 by Mike Haertel.
19   The author may be reached (Email) at the address mike@ai.mit.edu,
20   or (US mail) as Mike Haertel c/o Free Software Foundation. */
21
22/* The algorithm implemented by these routines bears a startling resemblence
23   to one discovered by Beate Commentz-Walter, although it is not identical.
24   See "A String Matching Algorithm Fast on the Average," Technical Report,
25   IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900
26   Heidelberg, Germany.  See also Aho, A.V., and M. Corasick, "Efficient
27   String Matching:  An Aid to Bibliographic Search," CACM June 1975,
28   Vol. 18, No. 6, which describes the failure function used below. */
29
30#ifdef HAVE_CONFIG_H
31# include <config.h>
32#endif
33#include <sys/types.h>
34#include "kwset.h"
35#include <limits.h>
36#include <stdlib.h>
37#include "obstack.h"
38#include "gettext.h"
39#define _(str) gettext (str)
40
41#ifdef GREP
42extern char *xmalloc();
43# undef malloc
44# define malloc xmalloc
45#endif
46
47#define NCHAR (UCHAR_MAX + 1)
48#define obstack_chunk_alloc malloc
49#define obstack_chunk_free free
50
51/* Balanced tree of edges and labels leaving a given trie node. */
52struct tree
53{
54  struct tree *llink;		/* Left link; MUST be first field. */
55  struct tree *rlink;		/* Right link (to larger labels). */
56  struct trie *trie;		/* Trie node pointed to by this edge. */
57  unsigned char label;		/* Label on this edge. */
58  char balance;			/* Difference in depths of subtrees. */
59};
60
61/* Node of a trie representing a set of reversed keywords. */
62struct trie
63{
64  unsigned int accepting;	/* Word index of accepted word, or zero. */
65  struct tree *links;		/* Tree of edges leaving this node. */
66  struct trie *parent;		/* Parent of this node. */
67  struct trie *next;		/* List of all trie nodes in level order. */
68  struct trie *fail;		/* Aho-Corasick failure function. */
69  int depth;			/* Depth of this node from the root. */
70  int shift;			/* Shift function for search failures. */
71  int maxshift;			/* Max shift of self and descendents. */
72};
73
74/* Structure returned opaquely to the caller, containing everything. */
75struct kwset
76{
77  struct obstack obstack;	/* Obstack for node allocation. */
78  int words;			/* Number of words in the trie. */
79  struct trie *trie;		/* The trie itself. */
80  int mind;			/* Minimum depth of an accepting node. */
81  int maxd;			/* Maximum depth of any node. */
82  unsigned char delta[NCHAR];	/* Delta table for rapid search. */
83  struct trie *next[NCHAR];	/* Table of children of the root. */
84  char *target;			/* Target string if there's only one. */
85  int mind2;			/* Used in Boyer-Moore search for one string. */
86  char const *trans;		/* Character translation table. */
87};
88
89/* Allocate and initialize a keyword set object, returning an opaque
90   pointer to it.  Return NULL if memory is not available. */
91kwset_t
92kwsalloc (char const *trans)
93{
94  struct kwset *kwset;
95
96  kwset = (struct kwset *) malloc(sizeof (struct kwset));
97  if (!kwset)
98    return 0;
99
100  obstack_init(&kwset->obstack);
101  kwset->words = 0;
102  kwset->trie
103    = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));
104  if (!kwset->trie)
105    {
106      kwsfree((kwset_t) kwset);
107      return 0;
108    }
109  kwset->trie->accepting = 0;
110  kwset->trie->links = 0;
111  kwset->trie->parent = 0;
112  kwset->trie->next = 0;
113  kwset->trie->fail = 0;
114  kwset->trie->depth = 0;
115  kwset->trie->shift = 0;
116  kwset->mind = INT_MAX;
117  kwset->maxd = -1;
118  kwset->target = 0;
119  kwset->trans = trans;
120
121  return (kwset_t) kwset;
122}
123
124/* Add the given string to the contents of the keyword set.  Return NULL
125   for success, an error message otherwise. */
126const char *
127kwsincr (kwset_t kws, char const *text, size_t len)
128{
129  struct kwset *kwset;
130  register struct trie *trie;
131  register unsigned char label;
132  register struct tree *link;
133  register int depth;
134  struct tree *links[12];
135  enum { L, R } dirs[12];
136  struct tree *t, *r, *l, *rl, *lr;
137
138  kwset = (struct kwset *) kws;
139  trie = kwset->trie;
140  text += len;
141
142  /* Descend the trie (built of reversed keywords) character-by-character,
143     installing new nodes when necessary. */
144  while (len--)
145    {
146      label = kwset->trans ? kwset->trans[(unsigned char) *--text] : *--text;
147
148      /* Descend the tree of outgoing links for this trie node,
149	 looking for the current character and keeping track
150	 of the path followed. */
151      link = trie->links;
152      links[0] = (struct tree *) &trie->links;
153      dirs[0] = L;
154      depth = 1;
155
156      while (link && label != link->label)
157	{
158	  links[depth] = link;
159	  if (label < link->label)
160	    dirs[depth++] = L, link = link->llink;
161	  else
162	    dirs[depth++] = R, link = link->rlink;
163	}
164
165      /* The current character doesn't have an outgoing link at
166	 this trie node, so build a new trie node and install
167	 a link in the current trie node's tree. */
168      if (!link)
169	{
170	  link = (struct tree *) obstack_alloc(&kwset->obstack,
171					       sizeof (struct tree));
172	  if (!link)
173	    return _("memory exhausted");
174	  link->llink = 0;
175	  link->rlink = 0;
176	  link->trie = (struct trie *) obstack_alloc(&kwset->obstack,
177						     sizeof (struct trie));
178	  if (!link->trie)
179	    return _("memory exhausted");
180	  link->trie->accepting = 0;
181	  link->trie->links = 0;
182	  link->trie->parent = trie;
183	  link->trie->next = 0;
184	  link->trie->fail = 0;
185	  link->trie->depth = trie->depth + 1;
186	  link->trie->shift = 0;
187	  link->label = label;
188	  link->balance = 0;
189
190	  /* Install the new tree node in its parent. */
191	  if (dirs[--depth] == L)
192	    links[depth]->llink = link;
193	  else
194	    links[depth]->rlink = link;
195
196	  /* Back up the tree fixing the balance flags. */
197	  while (depth && !links[depth]->balance)
198	    {
199	      if (dirs[depth] == L)
200		--links[depth]->balance;
201	      else
202		++links[depth]->balance;
203	      --depth;
204	    }
205
206	  /* Rebalance the tree by pointer rotations if necessary. */
207	  if (depth && ((dirs[depth] == L && --links[depth]->balance)
208			|| (dirs[depth] == R && ++links[depth]->balance)))
209	    {
210	      switch (links[depth]->balance)
211		{
212		case (char) -2:
213		  switch (dirs[depth + 1])
214		    {
215		    case L:
216		      r = links[depth], t = r->llink, rl = t->rlink;
217		      t->rlink = r, r->llink = rl;
218		      t->balance = r->balance = 0;
219		      break;
220		    case R:
221		      r = links[depth], l = r->llink, t = l->rlink;
222		      rl = t->rlink, lr = t->llink;
223		      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
224		      l->balance = t->balance != 1 ? 0 : -1;
225		      r->balance = t->balance != (char) -1 ? 0 : 1;
226		      t->balance = 0;
227		      break;
228		    default:
229		      abort ();
230		    }
231		  break;
232		case 2:
233		  switch (dirs[depth + 1])
234		    {
235		    case R:
236		      l = links[depth], t = l->rlink, lr = t->llink;
237		      t->llink = l, l->rlink = lr;
238		      t->balance = l->balance = 0;
239		      break;
240		    case L:
241		      l = links[depth], r = l->rlink, t = r->llink;
242		      lr = t->llink, rl = t->rlink;
243		      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
244		      l->balance = t->balance != 1 ? 0 : -1;
245		      r->balance = t->balance != (char) -1 ? 0 : 1;
246		      t->balance = 0;
247		      break;
248		    default:
249		      abort ();
250		    }
251		  break;
252		default:
253		  abort ();
254		}
255
256	      if (dirs[depth - 1] == L)
257		links[depth - 1]->llink = t;
258	      else
259		links[depth - 1]->rlink = t;
260	    }
261	}
262
263      trie = link->trie;
264    }
265
266  /* Mark the node we finally reached as accepting, encoding the
267     index number of this word in the keyword set so far. */
268  if (!trie->accepting)
269    trie->accepting = 1 + 2 * kwset->words;
270  ++kwset->words;
271
272  /* Keep track of the longest and shortest string of the keyword set. */
273  if (trie->depth < kwset->mind)
274    kwset->mind = trie->depth;
275  if (trie->depth > kwset->maxd)
276    kwset->maxd = trie->depth;
277
278  return 0;
279}
280
281/* Enqueue the trie nodes referenced from the given tree in the
282   given queue. */
283static void
284enqueue (struct tree *tree, struct trie **last)
285{
286  if (!tree)
287    return;
288  enqueue(tree->llink, last);
289  enqueue(tree->rlink, last);
290  (*last) = (*last)->next = tree->trie;
291}
292
293/* Compute the Aho-Corasick failure function for the trie nodes referenced
294   from the given tree, given the failure function for their parent as
295   well as a last resort failure node. */
296static void
297treefails (register struct tree const *tree, struct trie const *fail,
298	   struct trie *recourse)
299{
300  register struct tree *link;
301
302  if (!tree)
303    return;
304
305  treefails(tree->llink, fail, recourse);
306  treefails(tree->rlink, fail, recourse);
307
308  /* Find, in the chain of fails going back to the root, the first
309     node that has a descendent on the current label. */
310  while (fail)
311    {
312      link = fail->links;
313      while (link && tree->label != link->label)
314	if (tree->label < link->label)
315	  link = link->llink;
316	else
317	  link = link->rlink;
318      if (link)
319	{
320	  tree->trie->fail = link->trie;
321	  return;
322	}
323      fail = fail->fail;
324    }
325
326  tree->trie->fail = recourse;
327}
328
329/* Set delta entries for the links of the given tree such that
330   the preexisting delta value is larger than the current depth. */
331static void
332treedelta (register struct tree const *tree,
333	   register unsigned int depth,
334	   unsigned char delta[])
335{
336  if (!tree)
337    return;
338  treedelta(tree->llink, depth, delta);
339  treedelta(tree->rlink, depth, delta);
340  if (depth < delta[tree->label])
341    delta[tree->label] = depth;
342}
343
344/* Return true if A has every label in B. */
345static int
346hasevery (register struct tree const *a, register struct tree const *b)
347{
348  if (!b)
349    return 1;
350  if (!hasevery(a, b->llink))
351    return 0;
352  if (!hasevery(a, b->rlink))
353    return 0;
354  while (a && b->label != a->label)
355    if (b->label < a->label)
356      a = a->llink;
357    else
358      a = a->rlink;
359  return !!a;
360}
361
362/* Compute a vector, indexed by character code, of the trie nodes
363   referenced from the given tree. */
364static void
365treenext (struct tree const *tree, struct trie *next[])
366{
367  if (!tree)
368    return;
369  treenext(tree->llink, next);
370  treenext(tree->rlink, next);
371  next[tree->label] = tree->trie;
372}
373
374/* Compute the shift for each trie node, as well as the delta
375   table and next cache for the given keyword set. */
376const char *
377kwsprep (kwset_t kws)
378{
379  register struct kwset *kwset;
380  register int i;
381  register struct trie *curr, *fail;
382  register char const *trans;
383  unsigned char delta[NCHAR];
384  struct trie *last, *next[NCHAR];
385
386  kwset = (struct kwset *) kws;
387
388  /* Initial values for the delta table; will be changed later.  The
389     delta entry for a given character is the smallest depth of any
390     node at which an outgoing edge is labeled by that character. */
391  if (kwset->mind < 256)
392    for (i = 0; i < NCHAR; ++i)
393      delta[i] = kwset->mind;
394  else
395    for (i = 0; i < NCHAR; ++i)
396      delta[i] = 255;
397
398  /* Check if we can use the simple boyer-moore algorithm, instead
399     of the hairy commentz-walter algorithm. */
400  if (kwset->words == 1 && kwset->trans == 0)
401    {
402      /* Looking for just one string.  Extract it from the trie. */
403      kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);
404      for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
405	{
406	  kwset->target[i] = curr->links->label;
407	  curr = curr->links->trie;
408	}
409      /* Build the Boyer Moore delta.  Boy that's easy compared to CW. */
410      for (i = 0; i < kwset->mind; ++i)
411	delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1);
412      kwset->mind2 = kwset->mind;
413      /* Find the minimal delta2 shift that we might make after
414	 a backwards match has failed. */
415      for (i = 0; i < kwset->mind - 1; ++i)
416	if (kwset->target[i] == kwset->target[kwset->mind - 1])
417	  kwset->mind2 = kwset->mind - (i + 1);
418    }
419  else
420    {
421      /* Traverse the nodes of the trie in level order, simultaneously
422	 computing the delta table, failure function, and shift function. */
423      for (curr = last = kwset->trie; curr; curr = curr->next)
424	{
425	  /* Enqueue the immediate descendents in the level order queue. */
426	  enqueue(curr->links, &last);
427
428	  curr->shift = kwset->mind;
429	  curr->maxshift = kwset->mind;
430
431	  /* Update the delta table for the descendents of this node. */
432	  treedelta(curr->links, curr->depth, delta);
433
434	  /* Compute the failure function for the decendents of this node. */
435	  treefails(curr->links, curr->fail, kwset->trie);
436
437	  /* Update the shifts at each node in the current node's chain
438	     of fails back to the root. */
439	  for (fail = curr->fail; fail; fail = fail->fail)
440	    {
441	      /* If the current node has some outgoing edge that the fail
442		 doesn't, then the shift at the fail should be no larger
443		 than the difference of their depths. */
444	      if (!hasevery(fail->links, curr->links))
445		if (curr->depth - fail->depth < fail->shift)
446		  fail->shift = curr->depth - fail->depth;
447
448	      /* If the current node is accepting then the shift at the
449		 fail and its descendents should be no larger than the
450		 difference of their depths. */
451	      if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
452		fail->maxshift = curr->depth - fail->depth;
453	    }
454	}
455
456      /* Traverse the trie in level order again, fixing up all nodes whose
457	 shift exceeds their inherited maxshift. */
458      for (curr = kwset->trie->next; curr; curr = curr->next)
459	{
460	  if (curr->maxshift > curr->parent->maxshift)
461	    curr->maxshift = curr->parent->maxshift;
462	  if (curr->shift > curr->maxshift)
463	    curr->shift = curr->maxshift;
464	}
465
466      /* Create a vector, indexed by character code, of the outgoing links
467	 from the root node. */
468      for (i = 0; i < NCHAR; ++i)
469	next[i] = 0;
470      treenext(kwset->trie->links, next);
471
472      if ((trans = kwset->trans) != 0)
473	for (i = 0; i < NCHAR; ++i)
474	  kwset->next[i] = next[(unsigned char) trans[i]];
475      else
476	for (i = 0; i < NCHAR; ++i)
477	  kwset->next[i] = next[i];
478    }
479
480  /* Fix things up for any translation table. */
481  if ((trans = kwset->trans) != 0)
482    for (i = 0; i < NCHAR; ++i)
483      kwset->delta[i] = delta[(unsigned char) trans[i]];
484  else
485    for (i = 0; i < NCHAR; ++i)
486      kwset->delta[i] = delta[i];
487
488  return 0;
489}
490
491#define U(C) ((unsigned char) (C))
492
493/* Fast boyer-moore search. */
494static size_t
495bmexec (kwset_t kws, char const *text, size_t size)
496{
497  struct kwset const *kwset;
498  register unsigned char const *d1;
499  register char const *ep, *sp, *tp;
500  register int d, gc, i, len, md2;
501
502  kwset = (struct kwset const *) kws;
503  len = kwset->mind;
504
505  if (len == 0)
506    return 0;
507  if (len > size)
508    return -1;
509  if (len == 1)
510    {
511      tp = memchr (text, kwset->target[0], size);
512      return tp ? tp - text : -1;
513    }
514
515  d1 = kwset->delta;
516  sp = kwset->target + len;
517  gc = U(sp[-2]);
518  md2 = kwset->mind2;
519  tp = text + len;
520
521  /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
522  if (size > 12 * len)
523    /* 11 is not a bug, the initial offset happens only once. */
524    for (ep = text + size - 11 * len;;)
525      {
526	while (tp <= ep)
527	  {
528	    d = d1[U(tp[-1])], tp += d;
529	    d = d1[U(tp[-1])], tp += d;
530	    if (d == 0)
531	      goto found;
532	    d = d1[U(tp[-1])], tp += d;
533	    d = d1[U(tp[-1])], tp += d;
534	    d = d1[U(tp[-1])], tp += d;
535	    if (d == 0)
536	      goto found;
537	    d = d1[U(tp[-1])], tp += d;
538	    d = d1[U(tp[-1])], tp += d;
539	    d = d1[U(tp[-1])], tp += d;
540	    if (d == 0)
541	      goto found;
542	    d = d1[U(tp[-1])], tp += d;
543	    d = d1[U(tp[-1])], tp += d;
544	  }
545	break;
546      found:
547	if (U(tp[-2]) == gc)
548	  {
549	    for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
550	      ;
551	    if (i > len)
552	      return tp - len - text;
553	  }
554	tp += md2;
555      }
556
557  /* Now we have only a few characters left to search.  We
558     carefully avoid ever producing an out-of-bounds pointer. */
559  ep = text + size;
560  d = d1[U(tp[-1])];
561  while (d <= ep - tp)
562    {
563      d = d1[U((tp += d)[-1])];
564      if (d != 0)
565	continue;
566      if (U(tp[-2]) == gc)
567	{
568	  for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
569	    ;
570	  if (i > len)
571	    return tp - len - text;
572	}
573      d = md2;
574    }
575
576  return -1;
577}
578
579/* Hairy multiple string search. */
580static size_t
581cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch)
582{
583  struct kwset const *kwset;
584  struct trie * const *next;
585  struct trie const *trie;
586  struct trie const *accept;
587  char const *beg, *lim, *mch, *lmch;
588  register unsigned char c;
589  register unsigned char const *delta;
590  register int d;
591  register char const *end, *qlim;
592  register struct tree const *tree;
593  register char const *trans;
594
595  accept = NULL;
596
597  /* Initialize register copies and look for easy ways out. */
598  kwset = (struct kwset *) kws;
599  if (len < kwset->mind)
600    return -1;
601  next = kwset->next;
602  delta = kwset->delta;
603  trans = kwset->trans;
604  lim = text + len;
605  end = text;
606  if ((d = kwset->mind) != 0)
607    mch = 0;
608  else
609    {
610      mch = text, accept = kwset->trie;
611      goto match;
612    }
613
614  if (len >= 4 * kwset->mind)
615    qlim = lim - 4 * kwset->mind;
616  else
617    qlim = 0;
618
619  while (lim - end >= d)
620    {
621      if (qlim && end <= qlim)
622	{
623	  end += d - 1;
624	  while ((d = delta[c = *end]) && end < qlim)
625	    {
626	      end += d;
627	      end += delta[(unsigned char) *end];
628	      end += delta[(unsigned char) *end];
629	    }
630	  ++end;
631	}
632      else
633	d = delta[c = (end += d)[-1]];
634      if (d)
635	continue;
636      beg = end - 1;
637      trie = next[c];
638      if (trie->accepting)
639	{
640	  mch = beg;
641	  accept = trie;
642	}
643      d = trie->shift;
644      while (beg > text)
645	{
646	  c = trans ? trans[(unsigned char) *--beg] : *--beg;
647	  tree = trie->links;
648	  while (tree && c != tree->label)
649	    if (c < tree->label)
650	      tree = tree->llink;
651	    else
652	      tree = tree->rlink;
653	  if (tree)
654	    {
655	      trie = tree->trie;
656	      if (trie->accepting)
657		{
658		  mch = beg;
659		  accept = trie;
660		}
661	    }
662	  else
663	    break;
664	  d = trie->shift;
665	}
666      if (mch)
667	goto match;
668    }
669  return -1;
670
671 match:
672  /* Given a known match, find the longest possible match anchored
673     at or before its starting point.  This is nearly a verbatim
674     copy of the preceding main search loops. */
675  if (lim - mch > kwset->maxd)
676    lim = mch + kwset->maxd;
677  lmch = 0;
678  d = 1;
679  while (lim - end >= d)
680    {
681      if ((d = delta[c = (end += d)[-1]]) != 0)
682	continue;
683      beg = end - 1;
684      if (!(trie = next[c]))
685	{
686	  d = 1;
687	  continue;
688	}
689      if (trie->accepting && beg <= mch)
690	{
691	  lmch = beg;
692	  accept = trie;
693	}
694      d = trie->shift;
695      while (beg > text)
696	{
697	  c = trans ? trans[(unsigned char) *--beg] : *--beg;
698	  tree = trie->links;
699	  while (tree && c != tree->label)
700	    if (c < tree->label)
701	      tree = tree->llink;
702	    else
703	      tree = tree->rlink;
704	  if (tree)
705	    {
706	      trie = tree->trie;
707	      if (trie->accepting && beg <= mch)
708		{
709		  lmch = beg;
710		  accept = trie;
711		}
712	    }
713	  else
714	    break;
715	  d = trie->shift;
716	}
717      if (lmch)
718	{
719	  mch = lmch;
720	  goto match;
721	}
722      if (!d)
723	d = 1;
724    }
725
726  if (kwsmatch)
727    {
728      kwsmatch->index = accept->accepting / 2;
729      kwsmatch->offset[0] = mch - text;
730      kwsmatch->size[0] = accept->depth;
731    }
732  return mch - text;
733}
734
735/* Search through the given text for a match of any member of the
736   given keyword set.  Return a pointer to the first character of
737   the matching substring, or NULL if no match is found.  If FOUNDLEN
738   is non-NULL store in the referenced location the length of the
739   matching substring.  Similarly, if FOUNDIDX is non-NULL, store
740   in the referenced location the index number of the particular
741   keyword matched. */
742size_t
743kwsexec (kwset_t kws, char const *text, size_t size,
744	 struct kwsmatch *kwsmatch)
745{
746  struct kwset const *kwset = (struct kwset *) kws;
747  if (kwset->words == 1 && kwset->trans == 0)
748    {
749      size_t ret = bmexec (kws, text, size);
750      if (kwsmatch != 0 && ret != (size_t) -1)
751	{
752	  kwsmatch->index = 0;
753	  kwsmatch->offset[0] = ret;
754	  kwsmatch->size[0] = kwset->mind;
755	}
756      return ret;
757    }
758  else
759    return cwexec(kws, text, size, kwsmatch);
760}
761
762/* Free the components of the given keyword set. */
763void
764kwsfree (kwset_t kws)
765{
766  struct kwset *kwset;
767
768  kwset = (struct kwset *) kws;
769  obstack_free(&kwset->obstack, 0);
770  free(kws);
771}
772