kwset.c revision 126435
1/* kwset.c - search for any of a set of keywords. 2 Copyright 1989, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 17 02111-1307, USA. */ 18 19/* $FreeBSD: head/gnu/usr.bin/grep/kwset.c 126435 2004-03-01 08:37:20Z ache $ */ 20 21/* Written August 1989 by Mike Haertel. 22 The author may be reached (Email) at the address mike@ai.mit.edu, 23 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 24 25/* The algorithm implemented by these routines bears a startling resemblence 26 to one discovered by Beate Commentz-Walter, although it is not identical. 27 See "A String Matching Algorithm Fast on the Average," Technical Report, 28 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 29 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 30 String Matching: An Aid to Bibliographic Search," CACM June 1975, 31 Vol. 18, No. 6, which describes the failure function used below. */ 32 33#ifdef HAVE_CONFIG_H 34# include <config.h> 35#endif 36#include <sys/types.h> 37#include "system.h" 38#include "kwset.h" 39#include "obstack.h" 40 41#ifdef GREP 42extern char *xmalloc(); 43# undef malloc 44# define malloc xmalloc 45#endif 46 47#define NCHAR (UCHAR_MAX + 1) 48#define obstack_chunk_alloc malloc 49#define obstack_chunk_free free 50 51/* Balanced tree of edges and labels leaving a given trie node. */ 52struct tree 53{ 54 struct tree *llink; /* Left link; MUST be first field. */ 55 struct tree *rlink; /* Right link (to larger labels). */ 56 struct trie *trie; /* Trie node pointed to by this edge. */ 57 unsigned char label; /* Label on this edge. */ 58 char balance; /* Difference in depths of subtrees. */ 59}; 60 61/* Node of a trie representing a set of reversed keywords. */ 62struct trie 63{ 64 unsigned int accepting; /* Word index of accepted word, or zero. */ 65 struct tree *links; /* Tree of edges leaving this node. */ 66 struct trie *parent; /* Parent of this node. */ 67 struct trie *next; /* List of all trie nodes in level order. */ 68 struct trie *fail; /* Aho-Corasick failure function. */ 69 int depth; /* Depth of this node from the root. */ 70 int shift; /* Shift function for search failures. */ 71 int maxshift; /* Max shift of self and descendents. */ 72}; 73 74/* Structure returned opaquely to the caller, containing everything. */ 75struct kwset 76{ 77 struct obstack obstack; /* Obstack for node allocation. */ 78 int words; /* Number of words in the trie. */ 79 struct trie *trie; /* The trie itself. */ 80 int mind; /* Minimum depth of an accepting node. */ 81 int maxd; /* Maximum depth of any node. */ 82 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 83 struct trie *next[NCHAR]; /* Table of children of the root. */ 84 char *target; /* Target string if there's only one. */ 85 int mind2; /* Used in Boyer-Moore search for one string. */ 86 char *trans; /* Character translation table. */ 87}; 88 89/* prototypes */ 90static void enqueue PARAMS((struct tree *, struct trie **)); 91static void treefails PARAMS((register struct tree *, struct trie *, struct trie *)); 92static void treedelta PARAMS((register struct tree *,register unsigned int, unsigned char *)); 93static int hasevery PARAMS((register struct tree *, register struct tree *)); 94static void treenext PARAMS((struct tree *, struct trie **)); 95static char * bmexec PARAMS((kwset_t, char *, size_t)); 96static char * cwexec PARAMS((kwset_t, char *, size_t, struct kwsmatch *)); 97 98/* Allocate and initialize a keyword set object, returning an opaque 99 pointer to it. Return NULL if memory is not available. */ 100kwset_t 101kwsalloc (char *trans) 102{ 103 struct kwset *kwset; 104 105 kwset = (struct kwset *) malloc(sizeof (struct kwset)); 106 if (!kwset) 107 return 0; 108 109 obstack_init(&kwset->obstack); 110 kwset->words = 0; 111 kwset->trie 112 = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); 113 if (!kwset->trie) 114 { 115 kwsfree((kwset_t) kwset); 116 return 0; 117 } 118 kwset->trie->accepting = 0; 119 kwset->trie->links = 0; 120 kwset->trie->parent = 0; 121 kwset->trie->next = 0; 122 kwset->trie->fail = 0; 123 kwset->trie->depth = 0; 124 kwset->trie->shift = 0; 125 kwset->mind = INT_MAX; 126 kwset->maxd = -1; 127 kwset->target = 0; 128 kwset->trans = trans; 129 130 return (kwset_t) kwset; 131} 132 133/* Add the given string to the contents of the keyword set. Return NULL 134 for success, an error message otherwise. */ 135char * 136kwsincr (kwset_t kws, char *text, size_t len) 137{ 138 struct kwset *kwset; 139 register struct trie *trie; 140 register unsigned char label; 141 register struct tree *link; 142 register int depth; 143 struct tree *links[12]; 144 enum { L, R } dirs[12]; 145 struct tree *t, *r, *l, *rl, *lr; 146 147 kwset = (struct kwset *) kws; 148 trie = kwset->trie; 149 text += len; 150 151 /* Descend the trie (built of reversed keywords) character-by-character, 152 installing new nodes when necessary. */ 153 while (len--) 154 { 155 label = kwset->trans ? kwset->trans[(unsigned char) *--text] : *--text; 156 157 /* Descend the tree of outgoing links for this trie node, 158 looking for the current character and keeping track 159 of the path followed. */ 160 link = trie->links; 161 links[0] = (struct tree *) &trie->links; 162 dirs[0] = L; 163 depth = 1; 164 165 while (link && label != link->label) 166 { 167 links[depth] = link; 168 if (label < link->label) 169 dirs[depth++] = L, link = link->llink; 170 else 171 dirs[depth++] = R, link = link->rlink; 172 } 173 174 /* The current character doesn't have an outgoing link at 175 this trie node, so build a new trie node and install 176 a link in the current trie node's tree. */ 177 if (!link) 178 { 179 link = (struct tree *) obstack_alloc(&kwset->obstack, 180 sizeof (struct tree)); 181 if (!link) 182 return _("memory exhausted"); 183 link->llink = 0; 184 link->rlink = 0; 185 link->trie = (struct trie *) obstack_alloc(&kwset->obstack, 186 sizeof (struct trie)); 187 if (!link->trie) 188 return _("memory exhausted"); 189 link->trie->accepting = 0; 190 link->trie->links = 0; 191 link->trie->parent = trie; 192 link->trie->next = 0; 193 link->trie->fail = 0; 194 link->trie->depth = trie->depth + 1; 195 link->trie->shift = 0; 196 link->label = label; 197 link->balance = 0; 198 199 /* Install the new tree node in its parent. */ 200 if (dirs[--depth] == L) 201 links[depth]->llink = link; 202 else 203 links[depth]->rlink = link; 204 205 /* Back up the tree fixing the balance flags. */ 206 while (depth && !links[depth]->balance) 207 { 208 if (dirs[depth] == L) 209 --links[depth]->balance; 210 else 211 ++links[depth]->balance; 212 --depth; 213 } 214 215 /* Rebalance the tree by pointer rotations if necessary. */ 216 if (depth && ((dirs[depth] == L && --links[depth]->balance) 217 || (dirs[depth] == R && ++links[depth]->balance))) 218 { 219 switch (links[depth]->balance) 220 { 221 case (char) -2: 222 switch (dirs[depth + 1]) 223 { 224 case L: 225 r = links[depth], t = r->llink, rl = t->rlink; 226 t->rlink = r, r->llink = rl; 227 t->balance = r->balance = 0; 228 break; 229 case R: 230 r = links[depth], l = r->llink, t = l->rlink; 231 rl = t->rlink, lr = t->llink; 232 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 233 l->balance = t->balance != 1 ? 0 : -1; 234 r->balance = t->balance != (char) -1 ? 0 : 1; 235 t->balance = 0; 236 break; 237 default: 238 abort (); 239 } 240 break; 241 case 2: 242 switch (dirs[depth + 1]) 243 { 244 case R: 245 l = links[depth], t = l->rlink, lr = t->llink; 246 t->llink = l, l->rlink = lr; 247 t->balance = l->balance = 0; 248 break; 249 case L: 250 l = links[depth], r = l->rlink, t = r->llink; 251 lr = t->llink, rl = t->rlink; 252 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 253 l->balance = t->balance != 1 ? 0 : -1; 254 r->balance = t->balance != (char) -1 ? 0 : 1; 255 t->balance = 0; 256 break; 257 default: 258 abort (); 259 } 260 break; 261 default: 262 abort (); 263 } 264 265 if (dirs[depth - 1] == L) 266 links[depth - 1]->llink = t; 267 else 268 links[depth - 1]->rlink = t; 269 } 270 } 271 272 trie = link->trie; 273 } 274 275 /* Mark the node we finally reached as accepting, encoding the 276 index number of this word in the keyword set so far. */ 277 if (!trie->accepting) 278 trie->accepting = 1 + 2 * kwset->words; 279 ++kwset->words; 280 281 /* Keep track of the longest and shortest string of the keyword set. */ 282 if (trie->depth < kwset->mind) 283 kwset->mind = trie->depth; 284 if (trie->depth > kwset->maxd) 285 kwset->maxd = trie->depth; 286 287 return 0; 288} 289 290/* Enqueue the trie nodes referenced from the given tree in the 291 given queue. */ 292static void 293enqueue (struct tree *tree, struct trie **last) 294{ 295 if (!tree) 296 return; 297 enqueue(tree->llink, last); 298 enqueue(tree->rlink, last); 299 (*last) = (*last)->next = tree->trie; 300} 301 302/* Compute the Aho-Corasick failure function for the trie nodes referenced 303 from the given tree, given the failure function for their parent as 304 well as a last resort failure node. */ 305static void 306treefails (register struct tree *tree, struct trie *fail, struct trie *recourse) 307{ 308 register struct tree *link; 309 310 if (!tree) 311 return; 312 313 treefails(tree->llink, fail, recourse); 314 treefails(tree->rlink, fail, recourse); 315 316 /* Find, in the chain of fails going back to the root, the first 317 node that has a descendent on the current label. */ 318 while (fail) 319 { 320 link = fail->links; 321 while (link && tree->label != link->label) 322 if (tree->label < link->label) 323 link = link->llink; 324 else 325 link = link->rlink; 326 if (link) 327 { 328 tree->trie->fail = link->trie; 329 return; 330 } 331 fail = fail->fail; 332 } 333 334 tree->trie->fail = recourse; 335} 336 337/* Set delta entries for the links of the given tree such that 338 the preexisting delta value is larger than the current depth. */ 339static void 340treedelta (register struct tree *tree, 341 register unsigned int depth, 342 unsigned char delta[]) 343{ 344 if (!tree) 345 return; 346 treedelta(tree->llink, depth, delta); 347 treedelta(tree->rlink, depth, delta); 348 if (depth < delta[tree->label]) 349 delta[tree->label] = depth; 350} 351 352/* Return true if A has every label in B. */ 353static int 354hasevery (register struct tree *a, register struct tree *b) 355{ 356 if (!b) 357 return 1; 358 if (!hasevery(a, b->llink)) 359 return 0; 360 if (!hasevery(a, b->rlink)) 361 return 0; 362 while (a && b->label != a->label) 363 if (b->label < a->label) 364 a = a->llink; 365 else 366 a = a->rlink; 367 return !!a; 368} 369 370/* Compute a vector, indexed by character code, of the trie nodes 371 referenced from the given tree. */ 372static void 373treenext (struct tree *tree, struct trie *next[]) 374{ 375 if (!tree) 376 return; 377 treenext(tree->llink, next); 378 treenext(tree->rlink, next); 379 next[tree->label] = tree->trie; 380} 381 382/* Compute the shift for each trie node, as well as the delta 383 table and next cache for the given keyword set. */ 384char * 385kwsprep (kwset_t kws) 386{ 387 register struct kwset *kwset; 388 register int i; 389 register struct trie *curr, *fail; 390 register char *trans; 391 unsigned char delta[NCHAR]; 392 struct trie *last, *next[NCHAR]; 393 394 kwset = (struct kwset *) kws; 395 396 /* Initial values for the delta table; will be changed later. The 397 delta entry for a given character is the smallest depth of any 398 node at which an outgoing edge is labeled by that character. */ 399 if (kwset->mind < 256) 400 for (i = 0; i < NCHAR; ++i) 401 delta[i] = kwset->mind; 402 else 403 for (i = 0; i < NCHAR; ++i) 404 delta[i] = 255; 405 406 /* Check if we can use the simple boyer-moore algorithm, instead 407 of the hairy commentz-walter algorithm. */ 408 if (kwset->words == 1 && kwset->trans == 0) 409 { 410 /* Looking for just one string. Extract it from the trie. */ 411 kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); 412 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 413 { 414 kwset->target[i] = curr->links->label; 415 curr = curr->links->trie; 416 } 417 /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 418 for (i = 0; i < kwset->mind; ++i) 419 delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1); 420 kwset->mind2 = kwset->mind; 421 /* Find the minimal delta2 shift that we might make after 422 a backwards match has failed. */ 423 for (i = 0; i < kwset->mind - 1; ++i) 424 if (kwset->target[i] == kwset->target[kwset->mind - 1]) 425 kwset->mind2 = kwset->mind - (i + 1); 426 } 427 else 428 { 429 /* Traverse the nodes of the trie in level order, simultaneously 430 computing the delta table, failure function, and shift function. */ 431 for (curr = last = kwset->trie; curr; curr = curr->next) 432 { 433 /* Enqueue the immediate descendents in the level order queue. */ 434 enqueue(curr->links, &last); 435 436 curr->shift = kwset->mind; 437 curr->maxshift = kwset->mind; 438 439 /* Update the delta table for the descendents of this node. */ 440 treedelta(curr->links, curr->depth, delta); 441 442 /* Compute the failure function for the decendents of this node. */ 443 treefails(curr->links, curr->fail, kwset->trie); 444 445 /* Update the shifts at each node in the current node's chain 446 of fails back to the root. */ 447 for (fail = curr->fail; fail; fail = fail->fail) 448 { 449 /* If the current node has some outgoing edge that the fail 450 doesn't, then the shift at the fail should be no larger 451 than the difference of their depths. */ 452 if (!hasevery(fail->links, curr->links)) 453 if (curr->depth - fail->depth < fail->shift) 454 fail->shift = curr->depth - fail->depth; 455 456 /* If the current node is accepting then the shift at the 457 fail and its descendents should be no larger than the 458 difference of their depths. */ 459 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 460 fail->maxshift = curr->depth - fail->depth; 461 } 462 } 463 464 /* Traverse the trie in level order again, fixing up all nodes whose 465 shift exceeds their inherited maxshift. */ 466 for (curr = kwset->trie->next; curr; curr = curr->next) 467 { 468 if (curr->maxshift > curr->parent->maxshift) 469 curr->maxshift = curr->parent->maxshift; 470 if (curr->shift > curr->maxshift) 471 curr->shift = curr->maxshift; 472 } 473 474 /* Create a vector, indexed by character code, of the outgoing links 475 from the root node. */ 476 for (i = 0; i < NCHAR; ++i) 477 next[i] = 0; 478 treenext(kwset->trie->links, next); 479 480 if ((trans = kwset->trans) != 0) 481 for (i = 0; i < NCHAR; ++i) 482 kwset->next[i] = next[(unsigned char) trans[i]]; 483 else 484 for (i = 0; i < NCHAR; ++i) 485 kwset->next[i] = next[i]; 486 } 487 488 /* Fix things up for any translation table. */ 489 if ((trans = kwset->trans) != 0) 490 for (i = 0; i < NCHAR; ++i) 491 kwset->delta[i] = delta[(unsigned char) trans[i]]; 492 else 493 for (i = 0; i < NCHAR; ++i) 494 kwset->delta[i] = delta[i]; 495 496 return 0; 497} 498 499#define U(C) ((unsigned char) (C)) 500 501/* Fast boyer-moore search. */ 502static char * 503bmexec (kwset_t kws, char *text, size_t size) 504{ 505 struct kwset *kwset; 506 register unsigned char *d1; 507 register char *ep, *sp, *tp; 508 register int d, gc, i, len, md2; 509 510 kwset = (struct kwset *) kws; 511 len = kwset->mind; 512 513 if (len == 0) 514 return text; 515 if (len > size) 516 return 0; 517 if (len == 1) 518 return memchr(text, kwset->target[0], size); 519 520 d1 = kwset->delta; 521 sp = kwset->target + len; 522 gc = U(sp[-2]); 523 md2 = kwset->mind2; 524 tp = text + len; 525 526 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 527 if (size > 12 * len) 528 /* 11 is not a bug, the initial offset happens only once. */ 529 for (ep = text + size - 11 * len;;) 530 { 531 while (tp <= ep) 532 { 533 d = d1[U(tp[-1])], tp += d; 534 d = d1[U(tp[-1])], tp += d; 535 if (d == 0) 536 goto found; 537 d = d1[U(tp[-1])], tp += d; 538 d = d1[U(tp[-1])], tp += d; 539 d = d1[U(tp[-1])], tp += d; 540 if (d == 0) 541 goto found; 542 d = d1[U(tp[-1])], tp += d; 543 d = d1[U(tp[-1])], tp += d; 544 d = d1[U(tp[-1])], tp += d; 545 if (d == 0) 546 goto found; 547 d = d1[U(tp[-1])], tp += d; 548 d = d1[U(tp[-1])], tp += d; 549 } 550 break; 551 found: 552 if (U(tp[-2]) == gc) 553 { 554 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 555 ; 556 if (i > len) 557 return tp - len; 558 } 559 tp += md2; 560 } 561 562 /* Now we have only a few characters left to search. We 563 carefully avoid ever producing an out-of-bounds pointer. */ 564 ep = text + size; 565 d = d1[U(tp[-1])]; 566 while (d <= ep - tp) 567 { 568 d = d1[U((tp += d)[-1])]; 569 if (d != 0) 570 continue; 571 if (U(tp[-2]) == gc) 572 { 573 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 574 ; 575 if (i > len) 576 return tp - len; 577 } 578 d = md2; 579 } 580 581 return 0; 582} 583 584/* Hairy multiple string search. */ 585static char * 586cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) 587{ 588 struct kwset *kwset; 589 struct trie **next, *trie, *accept; 590 char *beg, *lim, *mch, *lmch; 591 register unsigned char c, *delta; 592 register int d; 593 register char *end, *qlim; 594 register struct tree *tree; 595 register char *trans; 596 597#ifdef lint 598 accept = NULL; 599#endif 600 601 /* Initialize register copies and look for easy ways out. */ 602 kwset = (struct kwset *) kws; 603 if (len < kwset->mind) 604 return 0; 605 next = kwset->next; 606 delta = kwset->delta; 607 trans = kwset->trans; 608 lim = text + len; 609 end = text; 610 if ((d = kwset->mind) != 0) 611 mch = 0; 612 else 613 { 614 mch = text, accept = kwset->trie; 615 goto match; 616 } 617 618 if (len >= 4 * kwset->mind) 619 qlim = lim - 4 * kwset->mind; 620 else 621 qlim = 0; 622 623 while (lim - end >= d) 624 { 625 if (qlim && end <= qlim) 626 { 627 end += d - 1; 628 while ((d = delta[c = *end]) && end < qlim) 629 { 630 end += d; 631 end += delta[(unsigned char) *end]; 632 end += delta[(unsigned char) *end]; 633 } 634 ++end; 635 } 636 else 637 d = delta[c = (end += d)[-1]]; 638 if (d) 639 continue; 640 beg = end - 1; 641 trie = next[c]; 642 if (trie->accepting) 643 { 644 mch = beg; 645 accept = trie; 646 } 647 d = trie->shift; 648 while (beg > text) 649 { 650 c = trans ? trans[(unsigned char) *--beg] : *--beg; 651 tree = trie->links; 652 while (tree && c != tree->label) 653 if (c < tree->label) 654 tree = tree->llink; 655 else 656 tree = tree->rlink; 657 if (tree) 658 { 659 trie = tree->trie; 660 if (trie->accepting) 661 { 662 mch = beg; 663 accept = trie; 664 } 665 } 666 else 667 break; 668 d = trie->shift; 669 } 670 if (mch) 671 goto match; 672 } 673 return 0; 674 675 match: 676 /* Given a known match, find the longest possible match anchored 677 at or before its starting point. This is nearly a verbatim 678 copy of the preceding main search loops. */ 679 if (lim - mch > kwset->maxd) 680 lim = mch + kwset->maxd; 681 lmch = 0; 682 d = 1; 683 while (lim - end >= d) 684 { 685 if ((d = delta[c = (end += d)[-1]]) != 0) 686 continue; 687 beg = end - 1; 688 if (!(trie = next[c])) 689 { 690 d = 1; 691 continue; 692 } 693 if (trie->accepting && beg <= mch) 694 { 695 lmch = beg; 696 accept = trie; 697 } 698 d = trie->shift; 699 while (beg > text) 700 { 701 c = trans ? trans[(unsigned char) *--beg] : *--beg; 702 tree = trie->links; 703 while (tree && c != tree->label) 704 if (c < tree->label) 705 tree = tree->llink; 706 else 707 tree = tree->rlink; 708 if (tree) 709 { 710 trie = tree->trie; 711 if (trie->accepting && beg <= mch) 712 { 713 lmch = beg; 714 accept = trie; 715 } 716 } 717 else 718 break; 719 d = trie->shift; 720 } 721 if (lmch) 722 { 723 mch = lmch; 724 goto match; 725 } 726 if (!d) 727 d = 1; 728 } 729 730 if (kwsmatch) 731 { 732 kwsmatch->index = accept->accepting / 2; 733 kwsmatch->beg[0] = mch; 734 kwsmatch->size[0] = accept->depth; 735 } 736 return mch; 737} 738 739/* Search through the given text for a match of any member of the 740 given keyword set. Return a pointer to the first character of 741 the matching substring, or NULL if no match is found. If FOUNDLEN 742 is non-NULL store in the referenced location the length of the 743 matching substring. Similarly, if FOUNDIDX is non-NULL, store 744 in the referenced location the index number of the particular 745 keyword matched. */ 746char * 747kwsexec (kwset_t kws, char *text, size_t size, struct kwsmatch *kwsmatch) 748{ 749 struct kwset *kwset; 750 char *ret; 751 752 kwset = (struct kwset *) kws; 753 if (kwset->words == 1 && kwset->trans == 0) 754 { 755 ret = bmexec(kws, text, size); 756 if (kwsmatch != 0 && ret != 0) 757 { 758 kwsmatch->index = 0; 759 kwsmatch->beg[0] = ret; 760 kwsmatch->size[0] = kwset->mind; 761 } 762 return ret; 763 } 764 else 765 return cwexec(kws, text, size, kwsmatch); 766} 767 768/* Free the components of the given keyword set. */ 769void 770kwsfree (kwset_t kws) 771{ 772 struct kwset *kwset; 773 774 kwset = (struct kwset *) kws; 775 obstack_free(&kwset->obstack, 0); 776 free(kws); 777} 778