1/* kwset.c - search for any of a set of keywords. 2 Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software Foundation, 16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 17 18/* Written August 1989 by Mike Haertel. 19 The author may be reached (Email) at the address mike@ai.mit.edu, 20 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 21 22/* The algorithm implemented by these routines bears a startling resemblence 23 to one discovered by Beate Commentz-Walter, although it is not identical. 24 See "A String Matching Algorithm Fast on the Average," Technical Report, 25 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 26 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 27 String Matching: An Aid to Bibliographic Search," CACM June 1975, 28 Vol. 18, No. 6, which describes the failure function used below. */ 29 30#ifdef HAVE_CONFIG_H 31# include <config.h> 32#endif 33#include <sys/types.h> 34#include "kwset.h" 35#include <limits.h> 36#include <stdlib.h> 37#include "obstack.h" 38#include "gettext.h" 39#define _(str) gettext (str) 40 41#ifdef GREP 42extern char *xmalloc(); 43# undef malloc 44# define malloc xmalloc 45#endif 46 47#define NCHAR (UCHAR_MAX + 1) 48#define obstack_chunk_alloc malloc 49#define obstack_chunk_free free 50 51/* Balanced tree of edges and labels leaving a given trie node. */ 52struct tree 53{ 54 struct tree *llink; /* Left link; MUST be first field. */ 55 struct tree *rlink; /* Right link (to larger labels). */ 56 struct trie *trie; /* Trie node pointed to by this edge. */ 57 unsigned char label; /* Label on this edge. */ 58 char balance; /* Difference in depths of subtrees. */ 59}; 60 61/* Node of a trie representing a set of reversed keywords. */ 62struct trie 63{ 64 unsigned int accepting; /* Word index of accepted word, or zero. */ 65 struct tree *links; /* Tree of edges leaving this node. */ 66 struct trie *parent; /* Parent of this node. */ 67 struct trie *next; /* List of all trie nodes in level order. */ 68 struct trie *fail; /* Aho-Corasick failure function. */ 69 int depth; /* Depth of this node from the root. */ 70 int shift; /* Shift function for search failures. */ 71 int maxshift; /* Max shift of self and descendents. */ 72}; 73 74/* Structure returned opaquely to the caller, containing everything. */ 75struct kwset 76{ 77 struct obstack obstack; /* Obstack for node allocation. */ 78 int words; /* Number of words in the trie. */ 79 struct trie *trie; /* The trie itself. */ 80 int mind; /* Minimum depth of an accepting node. */ 81 int maxd; /* Maximum depth of any node. */ 82 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 83 struct trie *next[NCHAR]; /* Table of children of the root. */ 84 char *target; /* Target string if there's only one. */ 85 int mind2; /* Used in Boyer-Moore search for one string. */ 86 char const *trans; /* Character translation table. */ 87}; 88 89/* Allocate and initialize a keyword set object, returning an opaque 90 pointer to it. Return NULL if memory is not available. */ 91kwset_t 92kwsalloc (char const *trans) 93{ 94 struct kwset *kwset; 95 96 kwset = (struct kwset *) malloc(sizeof (struct kwset)); 97 if (!kwset) 98 return 0; 99 100 obstack_init(&kwset->obstack); 101 kwset->words = 0; 102 kwset->trie 103 = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); 104 if (!kwset->trie) 105 { 106 kwsfree((kwset_t) kwset); 107 return 0; 108 } 109 kwset->trie->accepting = 0; 110 kwset->trie->links = 0; 111 kwset->trie->parent = 0; 112 kwset->trie->next = 0; 113 kwset->trie->fail = 0; 114 kwset->trie->depth = 0; 115 kwset->trie->shift = 0; 116 kwset->mind = INT_MAX; 117 kwset->maxd = -1; 118 kwset->target = 0; 119 kwset->trans = trans; 120 121 return (kwset_t) kwset; 122} 123 124/* Add the given string to the contents of the keyword set. Return NULL 125 for success, an error message otherwise. */ 126const char * 127kwsincr (kwset_t kws, char const *text, size_t len) 128{ 129 struct kwset *kwset; 130 register struct trie *trie; 131 register unsigned char label; 132 register struct tree *link; 133 register int depth; 134 struct tree *links[12]; 135 enum { L, R } dirs[12]; 136 struct tree *t, *r, *l, *rl, *lr; 137 138 kwset = (struct kwset *) kws; 139 trie = kwset->trie; 140 text += len; 141 142 /* Descend the trie (built of reversed keywords) character-by-character, 143 installing new nodes when necessary. */ 144 while (len--) 145 { 146 label = kwset->trans ? kwset->trans[(unsigned char) *--text] : *--text; 147 148 /* Descend the tree of outgoing links for this trie node, 149 looking for the current character and keeping track 150 of the path followed. */ 151 link = trie->links; 152 links[0] = (struct tree *) &trie->links; 153 dirs[0] = L; 154 depth = 1; 155 156 while (link && label != link->label) 157 { 158 links[depth] = link; 159 if (label < link->label) 160 dirs[depth++] = L, link = link->llink; 161 else 162 dirs[depth++] = R, link = link->rlink; 163 } 164 165 /* The current character doesn't have an outgoing link at 166 this trie node, so build a new trie node and install 167 a link in the current trie node's tree. */ 168 if (!link) 169 { 170 link = (struct tree *) obstack_alloc(&kwset->obstack, 171 sizeof (struct tree)); 172 if (!link) 173 return _("memory exhausted"); 174 link->llink = 0; 175 link->rlink = 0; 176 link->trie = (struct trie *) obstack_alloc(&kwset->obstack, 177 sizeof (struct trie)); 178 if (!link->trie) 179 return _("memory exhausted"); 180 link->trie->accepting = 0; 181 link->trie->links = 0; 182 link->trie->parent = trie; 183 link->trie->next = 0; 184 link->trie->fail = 0; 185 link->trie->depth = trie->depth + 1; 186 link->trie->shift = 0; 187 link->label = label; 188 link->balance = 0; 189 190 /* Install the new tree node in its parent. */ 191 if (dirs[--depth] == L) 192 links[depth]->llink = link; 193 else 194 links[depth]->rlink = link; 195 196 /* Back up the tree fixing the balance flags. */ 197 while (depth && !links[depth]->balance) 198 { 199 if (dirs[depth] == L) 200 --links[depth]->balance; 201 else 202 ++links[depth]->balance; 203 --depth; 204 } 205 206 /* Rebalance the tree by pointer rotations if necessary. */ 207 if (depth && ((dirs[depth] == L && --links[depth]->balance) 208 || (dirs[depth] == R && ++links[depth]->balance))) 209 { 210 switch (links[depth]->balance) 211 { 212 case (char) -2: 213 switch (dirs[depth + 1]) 214 { 215 case L: 216 r = links[depth], t = r->llink, rl = t->rlink; 217 t->rlink = r, r->llink = rl; 218 t->balance = r->balance = 0; 219 break; 220 case R: 221 r = links[depth], l = r->llink, t = l->rlink; 222 rl = t->rlink, lr = t->llink; 223 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 224 l->balance = t->balance != 1 ? 0 : -1; 225 r->balance = t->balance != (char) -1 ? 0 : 1; 226 t->balance = 0; 227 break; 228 default: 229 abort (); 230 } 231 break; 232 case 2: 233 switch (dirs[depth + 1]) 234 { 235 case R: 236 l = links[depth], t = l->rlink, lr = t->llink; 237 t->llink = l, l->rlink = lr; 238 t->balance = l->balance = 0; 239 break; 240 case L: 241 l = links[depth], r = l->rlink, t = r->llink; 242 lr = t->llink, rl = t->rlink; 243 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 244 l->balance = t->balance != 1 ? 0 : -1; 245 r->balance = t->balance != (char) -1 ? 0 : 1; 246 t->balance = 0; 247 break; 248 default: 249 abort (); 250 } 251 break; 252 default: 253 abort (); 254 } 255 256 if (dirs[depth - 1] == L) 257 links[depth - 1]->llink = t; 258 else 259 links[depth - 1]->rlink = t; 260 } 261 } 262 263 trie = link->trie; 264 } 265 266 /* Mark the node we finally reached as accepting, encoding the 267 index number of this word in the keyword set so far. */ 268 if (!trie->accepting) 269 trie->accepting = 1 + 2 * kwset->words; 270 ++kwset->words; 271 272 /* Keep track of the longest and shortest string of the keyword set. */ 273 if (trie->depth < kwset->mind) 274 kwset->mind = trie->depth; 275 if (trie->depth > kwset->maxd) 276 kwset->maxd = trie->depth; 277 278 return 0; 279} 280 281/* Enqueue the trie nodes referenced from the given tree in the 282 given queue. */ 283static void 284enqueue (struct tree *tree, struct trie **last) 285{ 286 if (!tree) 287 return; 288 enqueue(tree->llink, last); 289 enqueue(tree->rlink, last); 290 (*last) = (*last)->next = tree->trie; 291} 292 293/* Compute the Aho-Corasick failure function for the trie nodes referenced 294 from the given tree, given the failure function for their parent as 295 well as a last resort failure node. */ 296static void 297treefails (register struct tree const *tree, struct trie const *fail, 298 struct trie *recourse) 299{ 300 register struct tree *link; 301 302 if (!tree) 303 return; 304 305 treefails(tree->llink, fail, recourse); 306 treefails(tree->rlink, fail, recourse); 307 308 /* Find, in the chain of fails going back to the root, the first 309 node that has a descendent on the current label. */ 310 while (fail) 311 { 312 link = fail->links; 313 while (link && tree->label != link->label) 314 if (tree->label < link->label) 315 link = link->llink; 316 else 317 link = link->rlink; 318 if (link) 319 { 320 tree->trie->fail = link->trie; 321 return; 322 } 323 fail = fail->fail; 324 } 325 326 tree->trie->fail = recourse; 327} 328 329/* Set delta entries for the links of the given tree such that 330 the preexisting delta value is larger than the current depth. */ 331static void 332treedelta (register struct tree const *tree, 333 register unsigned int depth, 334 unsigned char delta[]) 335{ 336 if (!tree) 337 return; 338 treedelta(tree->llink, depth, delta); 339 treedelta(tree->rlink, depth, delta); 340 if (depth < delta[tree->label]) 341 delta[tree->label] = depth; 342} 343 344/* Return true if A has every label in B. */ 345static int 346hasevery (register struct tree const *a, register struct tree const *b) 347{ 348 if (!b) 349 return 1; 350 if (!hasevery(a, b->llink)) 351 return 0; 352 if (!hasevery(a, b->rlink)) 353 return 0; 354 while (a && b->label != a->label) 355 if (b->label < a->label) 356 a = a->llink; 357 else 358 a = a->rlink; 359 return !!a; 360} 361 362/* Compute a vector, indexed by character code, of the trie nodes 363 referenced from the given tree. */ 364static void 365treenext (struct tree const *tree, struct trie *next[]) 366{ 367 if (!tree) 368 return; 369 treenext(tree->llink, next); 370 treenext(tree->rlink, next); 371 next[tree->label] = tree->trie; 372} 373 374/* Compute the shift for each trie node, as well as the delta 375 table and next cache for the given keyword set. */ 376const char * 377kwsprep (kwset_t kws) 378{ 379 register struct kwset *kwset; 380 register int i; 381 register struct trie *curr, *fail; 382 register char const *trans; 383 unsigned char delta[NCHAR]; 384 struct trie *last, *next[NCHAR]; 385 386 kwset = (struct kwset *) kws; 387 388 /* Initial values for the delta table; will be changed later. The 389 delta entry for a given character is the smallest depth of any 390 node at which an outgoing edge is labeled by that character. */ 391 if (kwset->mind < 256) 392 for (i = 0; i < NCHAR; ++i) 393 delta[i] = kwset->mind; 394 else 395 for (i = 0; i < NCHAR; ++i) 396 delta[i] = 255; 397 398 /* Check if we can use the simple boyer-moore algorithm, instead 399 of the hairy commentz-walter algorithm. */ 400 if (kwset->words == 1 && kwset->trans == 0) 401 { 402 /* Looking for just one string. Extract it from the trie. */ 403 kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); 404 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 405 { 406 kwset->target[i] = curr->links->label; 407 curr = curr->links->trie; 408 } 409 /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 410 for (i = 0; i < kwset->mind; ++i) 411 delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1); 412 kwset->mind2 = kwset->mind; 413 /* Find the minimal delta2 shift that we might make after 414 a backwards match has failed. */ 415 for (i = 0; i < kwset->mind - 1; ++i) 416 if (kwset->target[i] == kwset->target[kwset->mind - 1]) 417 kwset->mind2 = kwset->mind - (i + 1); 418 } 419 else 420 { 421 /* Traverse the nodes of the trie in level order, simultaneously 422 computing the delta table, failure function, and shift function. */ 423 for (curr = last = kwset->trie; curr; curr = curr->next) 424 { 425 /* Enqueue the immediate descendents in the level order queue. */ 426 enqueue(curr->links, &last); 427 428 curr->shift = kwset->mind; 429 curr->maxshift = kwset->mind; 430 431 /* Update the delta table for the descendents of this node. */ 432 treedelta(curr->links, curr->depth, delta); 433 434 /* Compute the failure function for the decendents of this node. */ 435 treefails(curr->links, curr->fail, kwset->trie); 436 437 /* Update the shifts at each node in the current node's chain 438 of fails back to the root. */ 439 for (fail = curr->fail; fail; fail = fail->fail) 440 { 441 /* If the current node has some outgoing edge that the fail 442 doesn't, then the shift at the fail should be no larger 443 than the difference of their depths. */ 444 if (!hasevery(fail->links, curr->links)) 445 if (curr->depth - fail->depth < fail->shift) 446 fail->shift = curr->depth - fail->depth; 447 448 /* If the current node is accepting then the shift at the 449 fail and its descendents should be no larger than the 450 difference of their depths. */ 451 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 452 fail->maxshift = curr->depth - fail->depth; 453 } 454 } 455 456 /* Traverse the trie in level order again, fixing up all nodes whose 457 shift exceeds their inherited maxshift. */ 458 for (curr = kwset->trie->next; curr; curr = curr->next) 459 { 460 if (curr->maxshift > curr->parent->maxshift) 461 curr->maxshift = curr->parent->maxshift; 462 if (curr->shift > curr->maxshift) 463 curr->shift = curr->maxshift; 464 } 465 466 /* Create a vector, indexed by character code, of the outgoing links 467 from the root node. */ 468 for (i = 0; i < NCHAR; ++i) 469 next[i] = 0; 470 treenext(kwset->trie->links, next); 471 472 if ((trans = kwset->trans) != 0) 473 for (i = 0; i < NCHAR; ++i) 474 kwset->next[i] = next[(unsigned char) trans[i]]; 475 else 476 for (i = 0; i < NCHAR; ++i) 477 kwset->next[i] = next[i]; 478 } 479 480 /* Fix things up for any translation table. */ 481 if ((trans = kwset->trans) != 0) 482 for (i = 0; i < NCHAR; ++i) 483 kwset->delta[i] = delta[(unsigned char) trans[i]]; 484 else 485 for (i = 0; i < NCHAR; ++i) 486 kwset->delta[i] = delta[i]; 487 488 return 0; 489} 490 491#define U(C) ((unsigned char) (C)) 492 493/* Fast boyer-moore search. */ 494static size_t 495bmexec (kwset_t kws, char const *text, size_t size) 496{ 497 struct kwset const *kwset; 498 register unsigned char const *d1; 499 register char const *ep, *sp, *tp; 500 register int d, gc, i, len, md2; 501 502 kwset = (struct kwset const *) kws; 503 len = kwset->mind; 504 505 if (len == 0) 506 return 0; 507 if (len > size) 508 return -1; 509 if (len == 1) 510 { 511 tp = memchr (text, kwset->target[0], size); 512 return tp ? tp - text : -1; 513 } 514 515 d1 = kwset->delta; 516 sp = kwset->target + len; 517 gc = U(sp[-2]); 518 md2 = kwset->mind2; 519 tp = text + len; 520 521 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 522 if (size > 12 * len) 523 /* 11 is not a bug, the initial offset happens only once. */ 524 for (ep = text + size - 11 * len;;) 525 { 526 while (tp <= ep) 527 { 528 d = d1[U(tp[-1])], tp += d; 529 d = d1[U(tp[-1])], tp += d; 530 if (d == 0) 531 goto found; 532 d = d1[U(tp[-1])], tp += d; 533 d = d1[U(tp[-1])], tp += d; 534 d = d1[U(tp[-1])], tp += d; 535 if (d == 0) 536 goto found; 537 d = d1[U(tp[-1])], tp += d; 538 d = d1[U(tp[-1])], tp += d; 539 d = d1[U(tp[-1])], tp += d; 540 if (d == 0) 541 goto found; 542 d = d1[U(tp[-1])], tp += d; 543 d = d1[U(tp[-1])], tp += d; 544 } 545 break; 546 found: 547 if (U(tp[-2]) == gc) 548 { 549 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 550 ; 551 if (i > len) 552 return tp - len - text; 553 } 554 tp += md2; 555 } 556 557 /* Now we have only a few characters left to search. We 558 carefully avoid ever producing an out-of-bounds pointer. */ 559 ep = text + size; 560 d = d1[U(tp[-1])]; 561 while (d <= ep - tp) 562 { 563 d = d1[U((tp += d)[-1])]; 564 if (d != 0) 565 continue; 566 if (U(tp[-2]) == gc) 567 { 568 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 569 ; 570 if (i > len) 571 return tp - len - text; 572 } 573 d = md2; 574 } 575 576 return -1; 577} 578 579/* Hairy multiple string search. */ 580static size_t 581cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) 582{ 583 struct kwset const *kwset; 584 struct trie * const *next; 585 struct trie const *trie; 586 struct trie const *accept; 587 char const *beg, *lim, *mch, *lmch; 588 register unsigned char c; 589 register unsigned char const *delta; 590 register int d; 591 register char const *end, *qlim; 592 register struct tree const *tree; 593 register char const *trans; 594 595 accept = NULL; 596 597 /* Initialize register copies and look for easy ways out. */ 598 kwset = (struct kwset *) kws; 599 if (len < kwset->mind) 600 return -1; 601 next = kwset->next; 602 delta = kwset->delta; 603 trans = kwset->trans; 604 lim = text + len; 605 end = text; 606 if ((d = kwset->mind) != 0) 607 mch = 0; 608 else 609 { 610 mch = text, accept = kwset->trie; 611 goto match; 612 } 613 614 if (len >= 4 * kwset->mind) 615 qlim = lim - 4 * kwset->mind; 616 else 617 qlim = 0; 618 619 while (lim - end >= d) 620 { 621 if (qlim && end <= qlim) 622 { 623 end += d - 1; 624 while ((d = delta[c = *end]) && end < qlim) 625 { 626 end += d; 627 end += delta[(unsigned char) *end]; 628 end += delta[(unsigned char) *end]; 629 } 630 ++end; 631 } 632 else 633 d = delta[c = (end += d)[-1]]; 634 if (d) 635 continue; 636 beg = end - 1; 637 trie = next[c]; 638 if (trie->accepting) 639 { 640 mch = beg; 641 accept = trie; 642 } 643 d = trie->shift; 644 while (beg > text) 645 { 646 c = trans ? trans[(unsigned char) *--beg] : *--beg; 647 tree = trie->links; 648 while (tree && c != tree->label) 649 if (c < tree->label) 650 tree = tree->llink; 651 else 652 tree = tree->rlink; 653 if (tree) 654 { 655 trie = tree->trie; 656 if (trie->accepting) 657 { 658 mch = beg; 659 accept = trie; 660 } 661 } 662 else 663 break; 664 d = trie->shift; 665 } 666 if (mch) 667 goto match; 668 } 669 return -1; 670 671 match: 672 /* Given a known match, find the longest possible match anchored 673 at or before its starting point. This is nearly a verbatim 674 copy of the preceding main search loops. */ 675 if (lim - mch > kwset->maxd) 676 lim = mch + kwset->maxd; 677 lmch = 0; 678 d = 1; 679 while (lim - end >= d) 680 { 681 if ((d = delta[c = (end += d)[-1]]) != 0) 682 continue; 683 beg = end - 1; 684 if (!(trie = next[c])) 685 { 686 d = 1; 687 continue; 688 } 689 if (trie->accepting && beg <= mch) 690 { 691 lmch = beg; 692 accept = trie; 693 } 694 d = trie->shift; 695 while (beg > text) 696 { 697 c = trans ? trans[(unsigned char) *--beg] : *--beg; 698 tree = trie->links; 699 while (tree && c != tree->label) 700 if (c < tree->label) 701 tree = tree->llink; 702 else 703 tree = tree->rlink; 704 if (tree) 705 { 706 trie = tree->trie; 707 if (trie->accepting && beg <= mch) 708 { 709 lmch = beg; 710 accept = trie; 711 } 712 } 713 else 714 break; 715 d = trie->shift; 716 } 717 if (lmch) 718 { 719 mch = lmch; 720 goto match; 721 } 722 if (!d) 723 d = 1; 724 } 725 726 if (kwsmatch) 727 { 728 kwsmatch->index = accept->accepting / 2; 729 kwsmatch->offset[0] = mch - text; 730 kwsmatch->size[0] = accept->depth; 731 } 732 return mch - text; 733} 734 735/* Search through the given text for a match of any member of the 736 given keyword set. Return a pointer to the first character of 737 the matching substring, or NULL if no match is found. If FOUNDLEN 738 is non-NULL store in the referenced location the length of the 739 matching substring. Similarly, if FOUNDIDX is non-NULL, store 740 in the referenced location the index number of the particular 741 keyword matched. */ 742size_t 743kwsexec (kwset_t kws, char const *text, size_t size, 744 struct kwsmatch *kwsmatch) 745{ 746 struct kwset const *kwset = (struct kwset *) kws; 747 if (kwset->words == 1 && kwset->trans == 0) 748 { 749 size_t ret = bmexec (kws, text, size); 750 if (kwsmatch != 0 && ret != (size_t) -1) 751 { 752 kwsmatch->index = 0; 753 kwsmatch->offset[0] = ret; 754 kwsmatch->size[0] = kwset->mind; 755 } 756 return ret; 757 } 758 else 759 return cwexec(kws, text, size, kwsmatch); 760} 761 762/* Free the components of the given keyword set. */ 763void 764kwsfree (kwset_t kws) 765{ 766 struct kwset *kwset; 767 768 kwset = (struct kwset *) kws; 769 obstack_free(&kwset->obstack, 0); 770 free(kws); 771} 772