dfa.c revision 55379
1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright (C) 1988, 1998 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 55379 2000-01-04 03:25:40Z obrien $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#undef index 42#define index strchr 43#else 44#include <strings.h> 45#endif 46 47#ifndef DEBUG /* use the same approach as regex.c */ 48#undef assert 49#define assert(e) 50#endif /* DEBUG */ 51 52#ifndef isgraph 53#define isgraph(C) (isprint(C) && !isspace(C)) 54#endif 55 56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 57#define ISALPHA(C) isalpha(C) 58#define ISUPPER(C) isupper(C) 59#define ISLOWER(C) islower(C) 60#define ISDIGIT(C) isdigit(C) 61#define ISXDIGIT(C) isxdigit(C) 62#define ISSPACE(C) isspace(C) 63#define ISPUNCT(C) ispunct(C) 64#define ISALNUM(C) isalnum(C) 65#define ISPRINT(C) isprint(C) 66#define ISGRAPH(C) isgraph(C) 67#define ISCNTRL(C) iscntrl(C) 68#else 69#define ISALPHA(C) (isascii(C) && isalpha(C)) 70#define ISUPPER(C) (isascii(C) && isupper(C)) 71#define ISLOWER(C) (isascii(C) && islower(C)) 72#define ISDIGIT(C) (isascii(C) && isdigit(C)) 73#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 74#define ISSPACE(C) (isascii(C) && isspace(C)) 75#define ISPUNCT(C) (isascii(C) && ispunct(C)) 76#define ISALNUM(C) (isascii(C) && isalnum(C)) 77#define ISPRINT(C) (isascii(C) && isprint(C)) 78#define ISGRAPH(C) (isascii(C) && isgraph(C)) 79#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 80#endif 81 82/* If we (don't) have I18N. */ 83/* glibc defines _ */ 84#ifndef _ 85# ifdef HAVE_LIBINTL_H 86# include <libintl.h> 87# ifndef _ 88# define _(Str) gettext (Str) 89# endif 90# else 91# define _(Str) (Str) 92# endif 93#endif 94 95#ifdef __FreeBSD__ 96#include <gnuregex.h> 97#else 98#include "regex.h" 99#endif 100#include "dfa.h" 101 102/* HPUX, define those as macros in sys/param.h */ 103#ifdef setbit 104# undef setbit 105#endif 106#ifdef clrbit 107# undef clrbit 108#endif 109 110static void dfamust PARAMS ((struct dfa *dfa)); 111 112static ptr_t xcalloc PARAMS ((size_t n, size_t s)); 113static ptr_t xmalloc PARAMS ((size_t n)); 114static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); 115#ifdef DEBUG 116static void prtok PARAMS ((token t)); 117#endif 118static int tstbit PARAMS ((int b, charclass c)); 119static void setbit PARAMS ((int b, charclass c)); 120static void clrbit PARAMS ((int b, charclass c)); 121static void copyset PARAMS ((charclass src, charclass dst)); 122static void zeroset PARAMS ((charclass s)); 123static void notset PARAMS ((charclass s)); 124static int equal PARAMS ((charclass s1, charclass s2)); 125static int charclass_index PARAMS ((charclass s)); 126static int looking_at PARAMS ((const char *s)); 127static token lex PARAMS ((void)); 128static void addtok PARAMS ((token t)); 129static void atom PARAMS ((void)); 130static int nsubtoks PARAMS ((int tindex)); 131static void copytoks PARAMS ((int tindex, int ntokens)); 132static void closure PARAMS ((void)); 133static void branch PARAMS ((void)); 134static void regexp PARAMS ((int toplevel)); 135static void copy PARAMS ((position_set *src, position_set *dst)); 136static void insert PARAMS ((position p, position_set *s)); 137static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); 138static void delete PARAMS ((position p, position_set *s)); 139static int state_index PARAMS ((struct dfa *d, position_set *s, 140 int newline, int letter)); 141static void build_state PARAMS ((int s, struct dfa *d)); 142static void build_state_zero PARAMS ((struct dfa *d)); 143static char *icatalloc PARAMS ((char *old, char *new)); 144static char *icpyalloc PARAMS ((char *string)); 145static char *istrstr PARAMS ((char *lookin, char *lookfor)); 146static void ifree PARAMS ((char *cp)); 147static void freelist PARAMS ((char **cpp)); 148static char **enlist PARAMS ((char **cpp, char *new, size_t len)); 149static char **comsubs PARAMS ((char *left, char *right)); 150static char **addlists PARAMS ((char **old, char **new)); 151static char **inboth PARAMS ((char **left, char **right)); 152 153#ifdef __FreeBSD__ 154static int 155collate_range_cmp(c1, c2) 156 int c1, c2; 157{ 158 static char s1[2], s2[2]; 159 int r; 160 161 if (c1 == c2) 162 return 0; 163 s1[0] = c1; 164 s2[0] = c2; 165 if ((r = strcoll(s1, s2)) == 0) 166 r = c1 - c2; 167 168 return r; 169} 170#endif 171 172static ptr_t 173xcalloc(n, s) 174 size_t n; 175 size_t s; 176{ 177 ptr_t r = calloc(n, s); 178 179 if (!r) 180 dfaerror(_("Memory exhausted")); 181 return r; 182} 183 184static ptr_t 185xmalloc(n) 186 size_t n; 187{ 188 ptr_t r = malloc(n); 189 190 assert(n != 0); 191 if (!r) 192 dfaerror(_("Memory exhausted")); 193 return r; 194} 195 196static ptr_t 197xrealloc(p, n) 198 ptr_t p; 199 size_t n; 200{ 201 ptr_t r = realloc(p, n); 202 203 assert(n != 0); 204 if (!r) 205 dfaerror(_("Memory exhausted")); 206 return r; 207} 208 209#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 210#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 211#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 212 213/* Reallocate an array of type t if nalloc is too small for index. */ 214#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 215 if ((index) >= (nalloc)) \ 216 { \ 217 while ((index) >= (nalloc)) \ 218 (nalloc) *= 2; \ 219 REALLOC(p, t, nalloc); \ 220 } 221 222#ifdef DEBUG 223 224static void 225prtok(t) 226 token t; 227{ 228 char *s; 229 230 if (t < 0) 231 fprintf(stderr, "END"); 232 else if (t < NOTCHAR) 233 fprintf(stderr, "%c", t); 234 else 235 { 236 switch (t) 237 { 238 case EMPTY: s = "EMPTY"; break; 239 case BACKREF: s = "BACKREF"; break; 240 case BEGLINE: s = "BEGLINE"; break; 241 case ENDLINE: s = "ENDLINE"; break; 242 case BEGWORD: s = "BEGWORD"; break; 243 case ENDWORD: s = "ENDWORD"; break; 244 case LIMWORD: s = "LIMWORD"; break; 245 case NOTLIMWORD: s = "NOTLIMWORD"; break; 246 case QMARK: s = "QMARK"; break; 247 case STAR: s = "STAR"; break; 248 case PLUS: s = "PLUS"; break; 249 case CAT: s = "CAT"; break; 250 case OR: s = "OR"; break; 251 case ORTOP: s = "ORTOP"; break; 252 case LPAREN: s = "LPAREN"; break; 253 case RPAREN: s = "RPAREN"; break; 254 default: s = "CSET"; break; 255 } 256 fprintf(stderr, "%s", s); 257 } 258} 259#endif /* DEBUG */ 260 261/* Stuff pertaining to charclasses. */ 262 263static int 264tstbit(b, c) 265 int b; 266 charclass c; 267{ 268 return c[b / INTBITS] & 1 << b % INTBITS; 269} 270 271static void 272setbit(b, c) 273 int b; 274 charclass c; 275{ 276 c[b / INTBITS] |= 1 << b % INTBITS; 277} 278 279static void 280clrbit(b, c) 281 int b; 282 charclass c; 283{ 284 c[b / INTBITS] &= ~(1 << b % INTBITS); 285} 286 287static void 288copyset(src, dst) 289 charclass src; 290 charclass dst; 291{ 292 int i; 293 294 for (i = 0; i < CHARCLASS_INTS; ++i) 295 dst[i] = src[i]; 296} 297 298static void 299zeroset(s) 300 charclass s; 301{ 302 int i; 303 304 for (i = 0; i < CHARCLASS_INTS; ++i) 305 s[i] = 0; 306} 307 308static void 309notset(s) 310 charclass s; 311{ 312 int i; 313 314 for (i = 0; i < CHARCLASS_INTS; ++i) 315 s[i] = ~s[i]; 316} 317 318static int 319equal(s1, s2) 320 charclass s1; 321 charclass s2; 322{ 323 int i; 324 325 for (i = 0; i < CHARCLASS_INTS; ++i) 326 if (s1[i] != s2[i]) 327 return 0; 328 return 1; 329} 330 331/* A pointer to the current dfa is kept here during parsing. */ 332static struct dfa *dfa; 333 334/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 335static int 336charclass_index(s) 337 charclass s; 338{ 339 int i; 340 341 for (i = 0; i < dfa->cindex; ++i) 342 if (equal(s, dfa->charclasses[i])) 343 return i; 344 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 345 ++dfa->cindex; 346 copyset(s, dfa->charclasses[i]); 347 return i; 348} 349 350/* Syntax bits controlling the behavior of the lexical analyzer. */ 351static reg_syntax_t syntax_bits, syntax_bits_set; 352 353/* Flag for case-folding letters into sets. */ 354static int case_fold; 355 356/* End-of-line byte in data. */ 357static unsigned char eolbyte; 358 359/* Entry point to set syntax options. */ 360void 361dfasyntax(bits, fold, eol) 362 reg_syntax_t bits; 363 int fold; 364 int eol; 365{ 366 syntax_bits_set = 1; 367 syntax_bits = bits; 368 case_fold = fold; 369 eolbyte = eol; 370} 371 372/* Lexical analyzer. All the dross that deals with the obnoxious 373 GNU Regex syntax bits is located here. The poor, suffering 374 reader is referred to the GNU Regex documentation for the 375 meaning of the @#%!@#%^!@ syntax bits. */ 376 377static char *lexstart; /* Pointer to beginning of input string. */ 378static char *lexptr; /* Pointer to next input character. */ 379static int lexleft; /* Number of characters remaining. */ 380static token lasttok; /* Previous token returned; initially END. */ 381static int laststart; /* True if we're separated from beginning or (, | 382 only by zero-width characters. */ 383static int parens; /* Count of outstanding left parens. */ 384static int minrep, maxrep; /* Repeat counts for {m,n}. */ 385 386/* Note that characters become unsigned here. */ 387#define FETCH(c, eoferr) \ 388 { \ 389 if (! lexleft) \ 390 if (eoferr != 0) \ 391 dfaerror(eoferr); \ 392 else \ 393 return lasttok = END; \ 394 (c) = (unsigned char) *lexptr++; \ 395 --lexleft; \ 396 } 397 398#ifdef __STDC__ 399#define FUNC(F, P) static int F(int c) { return P(c); } 400#else 401#define FUNC(F, P) static int F(c) int c; { return P(c); } 402#endif 403 404FUNC(is_alpha, ISALPHA) 405FUNC(is_upper, ISUPPER) 406FUNC(is_lower, ISLOWER) 407FUNC(is_digit, ISDIGIT) 408FUNC(is_xdigit, ISXDIGIT) 409FUNC(is_space, ISSPACE) 410FUNC(is_punct, ISPUNCT) 411FUNC(is_alnum, ISALNUM) 412FUNC(is_print, ISPRINT) 413FUNC(is_graph, ISGRAPH) 414FUNC(is_cntrl, ISCNTRL) 415 416static int is_blank(c) 417int c; 418{ 419 return (c == ' ' || c == '\t'); 420} 421 422/* The following list maps the names of the Posix named character classes 423 to predicate functions that determine whether a given character is in 424 the class. The leading [ has already been eaten by the lexical analyzer. */ 425static struct { 426 const char *name; 427 int (*pred) PARAMS ((int)); 428} prednames[] = { 429 { ":alpha:]", is_alpha }, 430 { ":upper:]", is_upper }, 431 { ":lower:]", is_lower }, 432 { ":digit:]", is_digit }, 433 { ":xdigit:]", is_xdigit }, 434 { ":space:]", is_space }, 435 { ":punct:]", is_punct }, 436 { ":alnum:]", is_alnum }, 437 { ":print:]", is_print }, 438 { ":graph:]", is_graph }, 439 { ":cntrl:]", is_cntrl }, 440 { ":blank:]", is_blank }, 441 { 0 } 442}; 443 444/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 445#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 446 447static int 448looking_at(s) 449 const char *s; 450{ 451 size_t len; 452 453 len = strlen(s); 454 if (lexleft < len) 455 return 0; 456 return strncmp(s, lexptr, len) == 0; 457} 458 459static token 460lex() 461{ 462 token c, c1, c2; 463 int backslash = 0, invert; 464 charclass ccl; 465 int i; 466 467 /* Basic plan: We fetch a character. If it's a backslash, 468 we set the backslash flag and go through the loop again. 469 On the plus side, this avoids having a duplicate of the 470 main switch inside the backslash case. On the minus side, 471 it means that just about every case begins with 472 "if (backslash) ...". */ 473 for (i = 0; i < 2; ++i) 474 { 475 FETCH(c, 0); 476 switch (c) 477 { 478 case '\\': 479 if (backslash) 480 goto normal_char; 481 if (lexleft == 0) 482 dfaerror(_("Unfinished \\ escape")); 483 backslash = 1; 484 break; 485 486 case '^': 487 if (backslash) 488 goto normal_char; 489 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 490 || lasttok == END 491 || lasttok == LPAREN 492 || lasttok == OR) 493 return lasttok = BEGLINE; 494 goto normal_char; 495 496 case '$': 497 if (backslash) 498 goto normal_char; 499 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 500 || lexleft == 0 501 || (syntax_bits & RE_NO_BK_PARENS 502 ? lexleft > 0 && *lexptr == ')' 503 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 504 || (syntax_bits & RE_NO_BK_VBAR 505 ? lexleft > 0 && *lexptr == '|' 506 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 507 || ((syntax_bits & RE_NEWLINE_ALT) 508 && lexleft > 0 && *lexptr == '\n')) 509 return lasttok = ENDLINE; 510 goto normal_char; 511 512 case '1': 513 case '2': 514 case '3': 515 case '4': 516 case '5': 517 case '6': 518 case '7': 519 case '8': 520 case '9': 521 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 522 { 523 laststart = 0; 524 return lasttok = BACKREF; 525 } 526 goto normal_char; 527 528 case '`': 529 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 530 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 531 goto normal_char; 532 533 case '\'': 534 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 535 return lasttok = ENDLINE; /* FIXME: should be end of string */ 536 goto normal_char; 537 538 case '<': 539 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 540 return lasttok = BEGWORD; 541 goto normal_char; 542 543 case '>': 544 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 545 return lasttok = ENDWORD; 546 goto normal_char; 547 548 case 'b': 549 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 550 return lasttok = LIMWORD; 551 goto normal_char; 552 553 case 'B': 554 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 555 return lasttok = NOTLIMWORD; 556 goto normal_char; 557 558 case '?': 559 if (syntax_bits & RE_LIMITED_OPS) 560 goto normal_char; 561 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 562 goto normal_char; 563 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 564 goto normal_char; 565 return lasttok = QMARK; 566 567 case '*': 568 if (backslash) 569 goto normal_char; 570 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 571 goto normal_char; 572 return lasttok = STAR; 573 574 case '+': 575 if (syntax_bits & RE_LIMITED_OPS) 576 goto normal_char; 577 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 578 goto normal_char; 579 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 580 goto normal_char; 581 return lasttok = PLUS; 582 583 case '{': 584 if (!(syntax_bits & RE_INTERVALS)) 585 goto normal_char; 586 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 587 goto normal_char; 588 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 589 goto normal_char; 590 591 if (syntax_bits & RE_NO_BK_BRACES) 592 { 593 /* Scan ahead for a valid interval; if it's not valid, 594 treat it as a literal '{'. */ 595 int lo = -1, hi = -1; 596 char const *p = lexptr; 597 char const *lim = p + lexleft; 598 for (; p != lim && ISDIGIT (*p); p++) 599 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 600 if (p != lim && *p == ',') 601 while (++p != lim && ISDIGIT (*p)) 602 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 603 else 604 hi = lo; 605 if (p == lim || *p != '}' 606 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 607 goto normal_char; 608 } 609 610 minrep = 0; 611 /* Cases: 612 {M} - exact count 613 {M,} - minimum count, maximum is infinity 614 {M,N} - M through N */ 615 FETCH(c, _("unfinished repeat count")); 616 if (ISDIGIT(c)) 617 { 618 minrep = c - '0'; 619 for (;;) 620 { 621 FETCH(c, _("unfinished repeat count")); 622 if (!ISDIGIT(c)) 623 break; 624 minrep = 10 * minrep + c - '0'; 625 } 626 } 627 else 628 dfaerror(_("malformed repeat count")); 629 if (c == ',') 630 { 631 FETCH (c, _("unfinished repeat count")); 632 if (! ISDIGIT (c)) 633 maxrep = -1; 634 else 635 { 636 maxrep = c - '0'; 637 for (;;) 638 { 639 FETCH (c, _("unfinished repeat count")); 640 if (! ISDIGIT (c)) 641 break; 642 maxrep = 10 * maxrep + c - '0'; 643 } 644 if (0 <= maxrep && maxrep < minrep) 645 dfaerror (_("malformed repeat count")); 646 } 647 } 648 else 649 maxrep = minrep; 650 if (!(syntax_bits & RE_NO_BK_BRACES)) 651 { 652 if (c != '\\') 653 dfaerror(_("malformed repeat count")); 654 FETCH(c, _("unfinished repeat count")); 655 } 656 if (c != '}') 657 dfaerror(_("malformed repeat count")); 658 laststart = 0; 659 return lasttok = REPMN; 660 661 case '|': 662 if (syntax_bits & RE_LIMITED_OPS) 663 goto normal_char; 664 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 665 goto normal_char; 666 laststart = 1; 667 return lasttok = OR; 668 669 case '\n': 670 if (syntax_bits & RE_LIMITED_OPS 671 || backslash 672 || !(syntax_bits & RE_NEWLINE_ALT)) 673 goto normal_char; 674 laststart = 1; 675 return lasttok = OR; 676 677 case '(': 678 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 679 goto normal_char; 680 ++parens; 681 laststart = 1; 682 return lasttok = LPAREN; 683 684 case ')': 685 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 686 goto normal_char; 687 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 688 goto normal_char; 689 --parens; 690 laststart = 0; 691 return lasttok = RPAREN; 692 693 case '.': 694 if (backslash) 695 goto normal_char; 696 zeroset(ccl); 697 notset(ccl); 698 if (!(syntax_bits & RE_DOT_NEWLINE)) 699 clrbit(eolbyte, ccl); 700 if (syntax_bits & RE_DOT_NOT_NULL) 701 clrbit('\0', ccl); 702 laststart = 0; 703 return lasttok = CSET + charclass_index(ccl); 704 705 case 'w': 706 case 'W': 707 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 708 goto normal_char; 709 zeroset(ccl); 710 for (c2 = 0; c2 < NOTCHAR; ++c2) 711 if (IS_WORD_CONSTITUENT(c2)) 712 setbit(c2, ccl); 713 if (c == 'W') 714 notset(ccl); 715 laststart = 0; 716 return lasttok = CSET + charclass_index(ccl); 717 718 case '[': 719 if (backslash) 720 goto normal_char; 721 zeroset(ccl); 722 FETCH(c, _("Unbalanced [")); 723 if (c == '^') 724 { 725 FETCH(c, _("Unbalanced [")); 726 invert = 1; 727 } 728 else 729 invert = 0; 730 do 731 { 732 /* Nobody ever said this had to be fast. :-) 733 Note that if we're looking at some other [:...:] 734 construct, we just treat it as a bunch of ordinary 735 characters. We can do this because we assume 736 regex has checked for syntax errors before 737 dfa is ever called. */ 738 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 739 for (c1 = 0; prednames[c1].name; ++c1) 740 if (looking_at(prednames[c1].name)) 741 { 742 int (*pred)() = prednames[c1].pred; 743 if (case_fold 744 && (pred == is_upper || pred == is_lower)) 745 pred = is_alpha; 746 747 for (c2 = 0; c2 < NOTCHAR; ++c2) 748 if ((*pred)(c2)) 749 setbit(c2, ccl); 750 lexptr += strlen(prednames[c1].name); 751 lexleft -= strlen(prednames[c1].name); 752 FETCH(c1, _("Unbalanced [")); 753 goto skip; 754 } 755 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 756 FETCH(c, _("Unbalanced [")); 757 FETCH(c1, _("Unbalanced [")); 758 if (c1 == '-') 759 { 760 FETCH(c2, _("Unbalanced [")); 761 if (c2 == ']') 762 { 763 /* In the case [x-], the - is an ordinary hyphen, 764 which is left in c1, the lookahead character. */ 765 --lexptr; 766 ++lexleft; 767 c2 = c; 768 } 769 else 770 { 771 if (c2 == '\\' 772 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 773 FETCH(c2, _("Unbalanced [")); 774 FETCH(c1, _("Unbalanced [")); 775 } 776 } 777 else 778 c2 = c; 779#ifdef __FreeBSD__ 780 if (collate_range_cmp(c, c2) <= 0) 781 { 782 token c3; 783 784 for (c3 = 0; c3 < NOTCHAR; ++c3) { 785 if (collate_range_cmp(c, c3) <= 0 && 786 collate_range_cmp(c3, c2) <= 0) { 787 setbit(c3, ccl); 788 if (case_fold) 789 if (ISUPPER(c3)) 790 setbit(tolower(c3), ccl); 791 else if (ISLOWER(c)) 792 setbit(toupper(c3), ccl); 793 } 794 } 795 } 796#else 797 while (c <= c2) 798 { 799 setbit(c, ccl); 800 if (case_fold) 801 if (ISUPPER(c)) 802 setbit(tolower(c), ccl); 803 else if (ISLOWER(c)) 804 setbit(toupper(c), ccl); 805 ++c; 806 } 807#endif 808 skip: 809 ; 810 } 811 while ((c = c1) != ']'); 812 if (invert) 813 { 814 notset(ccl); 815 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 816 clrbit(eolbyte, ccl); 817 } 818 laststart = 0; 819 return lasttok = CSET + charclass_index(ccl); 820 821 default: 822 normal_char: 823 laststart = 0; 824 if (case_fold && ISALPHA(c)) 825 { 826 zeroset(ccl); 827 setbit(c, ccl); 828 if (isupper(c)) 829 setbit(tolower(c), ccl); 830 else 831 setbit(toupper(c), ccl); 832 return lasttok = CSET + charclass_index(ccl); 833 } 834 return c; 835 } 836 } 837 838 /* The above loop should consume at most a backslash 839 and some other character. */ 840 abort(); 841 return END; /* keeps pedantic compilers happy. */ 842} 843 844/* Recursive descent parser for regular expressions. */ 845 846static token tok; /* Lookahead token. */ 847static int depth; /* Current depth of a hypothetical stack 848 holding deferred productions. This is 849 used to determine the depth that will be 850 required of the real stack later on in 851 dfaanalyze(). */ 852 853/* Add the given token to the parse tree, maintaining the depth count and 854 updating the maximum depth if necessary. */ 855static void 856addtok(t) 857 token t; 858{ 859 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 860 dfa->tokens[dfa->tindex++] = t; 861 862 switch (t) 863 { 864 case QMARK: 865 case STAR: 866 case PLUS: 867 break; 868 869 case CAT: 870 case OR: 871 case ORTOP: 872 --depth; 873 break; 874 875 default: 876 ++dfa->nleaves; 877 case EMPTY: 878 ++depth; 879 break; 880 } 881 if (depth > dfa->depth) 882 dfa->depth = depth; 883} 884 885/* The grammar understood by the parser is as follows. 886 887 regexp: 888 regexp OR branch 889 branch 890 891 branch: 892 branch closure 893 closure 894 895 closure: 896 closure QMARK 897 closure STAR 898 closure PLUS 899 atom 900 901 atom: 902 <normal character> 903 CSET 904 BACKREF 905 BEGLINE 906 ENDLINE 907 BEGWORD 908 ENDWORD 909 LIMWORD 910 NOTLIMWORD 911 <empty> 912 913 The parser builds a parse tree in postfix form in an array of tokens. */ 914 915static void 916atom() 917{ 918 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 919 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 920 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 921 { 922 addtok(tok); 923 tok = lex(); 924 } 925 else if (tok == LPAREN) 926 { 927 tok = lex(); 928 regexp(0); 929 if (tok != RPAREN) 930 dfaerror(_("Unbalanced (")); 931 tok = lex(); 932 } 933 else 934 addtok(EMPTY); 935} 936 937/* Return the number of tokens in the given subexpression. */ 938static int 939nsubtoks(tindex) 940int tindex; 941{ 942 int ntoks1; 943 944 switch (dfa->tokens[tindex - 1]) 945 { 946 default: 947 return 1; 948 case QMARK: 949 case STAR: 950 case PLUS: 951 return 1 + nsubtoks(tindex - 1); 952 case CAT: 953 case OR: 954 case ORTOP: 955 ntoks1 = nsubtoks(tindex - 1); 956 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 957 } 958} 959 960/* Copy the given subexpression to the top of the tree. */ 961static void 962copytoks(tindex, ntokens) 963 int tindex, ntokens; 964{ 965 int i; 966 967 for (i = 0; i < ntokens; ++i) 968 addtok(dfa->tokens[tindex + i]); 969} 970 971static void 972closure() 973{ 974 int tindex, ntokens, i; 975 976 atom(); 977 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 978 if (tok == REPMN) 979 { 980 ntokens = nsubtoks(dfa->tindex); 981 tindex = dfa->tindex - ntokens; 982 if (maxrep < 0) 983 addtok(PLUS); 984 if (minrep == 0) 985 addtok(QMARK); 986 for (i = 1; i < minrep; ++i) 987 { 988 copytoks(tindex, ntokens); 989 addtok(CAT); 990 } 991 for (; i < maxrep; ++i) 992 { 993 copytoks(tindex, ntokens); 994 addtok(QMARK); 995 addtok(CAT); 996 } 997 tok = lex(); 998 } 999 else 1000 { 1001 addtok(tok); 1002 tok = lex(); 1003 } 1004} 1005 1006static void 1007branch() 1008{ 1009 closure(); 1010 while (tok != RPAREN && tok != OR && tok >= 0) 1011 { 1012 closure(); 1013 addtok(CAT); 1014 } 1015} 1016 1017static void 1018regexp(toplevel) 1019 int toplevel; 1020{ 1021 branch(); 1022 while (tok == OR) 1023 { 1024 tok = lex(); 1025 branch(); 1026 if (toplevel) 1027 addtok(ORTOP); 1028 else 1029 addtok(OR); 1030 } 1031} 1032 1033/* Main entry point for the parser. S is a string to be parsed, len is the 1034 length of the string, so s can include NUL characters. D is a pointer to 1035 the struct dfa to parse into. */ 1036void 1037dfaparse(s, len, d) 1038 char *s; 1039 size_t len; 1040 struct dfa *d; 1041 1042{ 1043 dfa = d; 1044 lexstart = lexptr = s; 1045 lexleft = len; 1046 lasttok = END; 1047 laststart = 1; 1048 parens = 0; 1049 1050 if (! syntax_bits_set) 1051 dfaerror(_("No syntax specified")); 1052 1053 tok = lex(); 1054 depth = d->depth; 1055 1056 regexp(1); 1057 1058 if (tok != END) 1059 dfaerror(_("Unbalanced )")); 1060 1061 addtok(END - d->nregexps); 1062 addtok(CAT); 1063 1064 if (d->nregexps) 1065 addtok(ORTOP); 1066 1067 ++d->nregexps; 1068} 1069 1070/* Some primitives for operating on sets of positions. */ 1071 1072/* Copy one set to another; the destination must be large enough. */ 1073static void 1074copy(src, dst) 1075 position_set *src; 1076 position_set *dst; 1077{ 1078 int i; 1079 1080 for (i = 0; i < src->nelem; ++i) 1081 dst->elems[i] = src->elems[i]; 1082 dst->nelem = src->nelem; 1083} 1084 1085/* Insert a position in a set. Position sets are maintained in sorted 1086 order according to index. If position already exists in the set with 1087 the same index then their constraints are logically or'd together. 1088 S->elems must point to an array large enough to hold the resulting set. */ 1089static void 1090insert(p, s) 1091 position p; 1092 position_set *s; 1093{ 1094 int i; 1095 position t1, t2; 1096 1097 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1098 continue; 1099 if (i < s->nelem && p.index == s->elems[i].index) 1100 s->elems[i].constraint |= p.constraint; 1101 else 1102 { 1103 t1 = p; 1104 ++s->nelem; 1105 while (i < s->nelem) 1106 { 1107 t2 = s->elems[i]; 1108 s->elems[i++] = t1; 1109 t1 = t2; 1110 } 1111 } 1112} 1113 1114/* Merge two sets of positions into a third. The result is exactly as if 1115 the positions of both sets were inserted into an initially empty set. */ 1116static void 1117merge(s1, s2, m) 1118 position_set *s1; 1119 position_set *s2; 1120 position_set *m; 1121{ 1122 int i = 0, j = 0; 1123 1124 m->nelem = 0; 1125 while (i < s1->nelem && j < s2->nelem) 1126 if (s1->elems[i].index > s2->elems[j].index) 1127 m->elems[m->nelem++] = s1->elems[i++]; 1128 else if (s1->elems[i].index < s2->elems[j].index) 1129 m->elems[m->nelem++] = s2->elems[j++]; 1130 else 1131 { 1132 m->elems[m->nelem] = s1->elems[i++]; 1133 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1134 } 1135 while (i < s1->nelem) 1136 m->elems[m->nelem++] = s1->elems[i++]; 1137 while (j < s2->nelem) 1138 m->elems[m->nelem++] = s2->elems[j++]; 1139} 1140 1141/* Delete a position from a set. */ 1142static void 1143delete(p, s) 1144 position p; 1145 position_set *s; 1146{ 1147 int i; 1148 1149 for (i = 0; i < s->nelem; ++i) 1150 if (p.index == s->elems[i].index) 1151 break; 1152 if (i < s->nelem) 1153 for (--s->nelem; i < s->nelem; ++i) 1154 s->elems[i] = s->elems[i + 1]; 1155} 1156 1157/* Find the index of the state corresponding to the given position set with 1158 the given preceding context, or create a new state if there is no such 1159 state. Newline and letter tell whether we got here on a newline or 1160 letter, respectively. */ 1161static int 1162state_index(d, s, newline, letter) 1163 struct dfa *d; 1164 position_set *s; 1165 int newline; 1166 int letter; 1167{ 1168 int hash = 0; 1169 int constraint; 1170 int i, j; 1171 1172 newline = newline ? 1 : 0; 1173 letter = letter ? 1 : 0; 1174 1175 for (i = 0; i < s->nelem; ++i) 1176 hash ^= s->elems[i].index + s->elems[i].constraint; 1177 1178 /* Try to find a state that exactly matches the proposed one. */ 1179 for (i = 0; i < d->sindex; ++i) 1180 { 1181 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1182 || newline != d->states[i].newline || letter != d->states[i].letter) 1183 continue; 1184 for (j = 0; j < s->nelem; ++j) 1185 if (s->elems[j].constraint 1186 != d->states[i].elems.elems[j].constraint 1187 || s->elems[j].index != d->states[i].elems.elems[j].index) 1188 break; 1189 if (j == s->nelem) 1190 return i; 1191 } 1192 1193 /* We'll have to create a new state. */ 1194 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1195 d->states[i].hash = hash; 1196 MALLOC(d->states[i].elems.elems, position, s->nelem); 1197 copy(s, &d->states[i].elems); 1198 d->states[i].newline = newline; 1199 d->states[i].letter = letter; 1200 d->states[i].backref = 0; 1201 d->states[i].constraint = 0; 1202 d->states[i].first_end = 0; 1203 for (j = 0; j < s->nelem; ++j) 1204 if (d->tokens[s->elems[j].index] < 0) 1205 { 1206 constraint = s->elems[j].constraint; 1207 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1208 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1209 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1210 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1211 d->states[i].constraint |= constraint; 1212 if (! d->states[i].first_end) 1213 d->states[i].first_end = d->tokens[s->elems[j].index]; 1214 } 1215 else if (d->tokens[s->elems[j].index] == BACKREF) 1216 { 1217 d->states[i].constraint = NO_CONSTRAINT; 1218 d->states[i].backref = 1; 1219 } 1220 1221 ++d->sindex; 1222 1223 return i; 1224} 1225 1226/* Find the epsilon closure of a set of positions. If any position of the set 1227 contains a symbol that matches the empty string in some context, replace 1228 that position with the elements of its follow labeled with an appropriate 1229 constraint. Repeat exhaustively until no funny positions are left. 1230 S->elems must be large enough to hold the result. */ 1231static void epsclosure PARAMS ((position_set *s, struct dfa *d)); 1232 1233static void 1234epsclosure(s, d) 1235 position_set *s; 1236 struct dfa *d; 1237{ 1238 int i, j; 1239 int *visited; 1240 position p, old; 1241 1242 MALLOC(visited, int, d->tindex); 1243 for (i = 0; i < d->tindex; ++i) 1244 visited[i] = 0; 1245 1246 for (i = 0; i < s->nelem; ++i) 1247 if (d->tokens[s->elems[i].index] >= NOTCHAR 1248 && d->tokens[s->elems[i].index] != BACKREF 1249 && d->tokens[s->elems[i].index] < CSET) 1250 { 1251 old = s->elems[i]; 1252 p.constraint = old.constraint; 1253 delete(s->elems[i], s); 1254 if (visited[old.index]) 1255 { 1256 --i; 1257 continue; 1258 } 1259 visited[old.index] = 1; 1260 switch (d->tokens[old.index]) 1261 { 1262 case BEGLINE: 1263 p.constraint &= BEGLINE_CONSTRAINT; 1264 break; 1265 case ENDLINE: 1266 p.constraint &= ENDLINE_CONSTRAINT; 1267 break; 1268 case BEGWORD: 1269 p.constraint &= BEGWORD_CONSTRAINT; 1270 break; 1271 case ENDWORD: 1272 p.constraint &= ENDWORD_CONSTRAINT; 1273 break; 1274 case LIMWORD: 1275 p.constraint &= LIMWORD_CONSTRAINT; 1276 break; 1277 case NOTLIMWORD: 1278 p.constraint &= NOTLIMWORD_CONSTRAINT; 1279 break; 1280 default: 1281 break; 1282 } 1283 for (j = 0; j < d->follows[old.index].nelem; ++j) 1284 { 1285 p.index = d->follows[old.index].elems[j].index; 1286 insert(p, s); 1287 } 1288 /* Force rescan to start at the beginning. */ 1289 i = -1; 1290 } 1291 1292 free(visited); 1293} 1294 1295/* Perform bottom-up analysis on the parse tree, computing various functions. 1296 Note that at this point, we're pretending constructs like \< are real 1297 characters rather than constraints on what can follow them. 1298 1299 Nullable: A node is nullable if it is at the root of a regexp that can 1300 match the empty string. 1301 * EMPTY leaves are nullable. 1302 * No other leaf is nullable. 1303 * A QMARK or STAR node is nullable. 1304 * A PLUS node is nullable if its argument is nullable. 1305 * A CAT node is nullable if both its arguments are nullable. 1306 * An OR node is nullable if either argument is nullable. 1307 1308 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1309 that could correspond to the first character of a string matching the 1310 regexp rooted at the given node. 1311 * EMPTY leaves have empty firstpos. 1312 * The firstpos of a nonempty leaf is that leaf itself. 1313 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1314 argument. 1315 * The firstpos of a CAT node is the firstpos of the left argument, union 1316 the firstpos of the right if the left argument is nullable. 1317 * The firstpos of an OR node is the union of firstpos of each argument. 1318 1319 Lastpos: The lastpos of a node is the set of positions that could 1320 correspond to the last character of a string matching the regexp at 1321 the given node. 1322 * EMPTY leaves have empty lastpos. 1323 * The lastpos of a nonempty leaf is that leaf itself. 1324 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1325 argument. 1326 * The lastpos of a CAT node is the lastpos of its right argument, union 1327 the lastpos of the left if the right argument is nullable. 1328 * The lastpos of an OR node is the union of the lastpos of each argument. 1329 1330 Follow: The follow of a position is the set of positions that could 1331 correspond to the character following a character matching the node in 1332 a string matching the regexp. At this point we consider special symbols 1333 that match the empty string in some context to be just normal characters. 1334 Later, if we find that a special symbol is in a follow set, we will 1335 replace it with the elements of its follow, labeled with an appropriate 1336 constraint. 1337 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1338 the follow of every node in the lastpos. 1339 * Every node in the firstpos of the second argument of a CAT node is in 1340 the follow of every node in the lastpos of the first argument. 1341 1342 Because of the postfix representation of the parse tree, the depth-first 1343 analysis is conveniently done by a linear scan with the aid of a stack. 1344 Sets are stored as arrays of the elements, obeying a stack-like allocation 1345 scheme; the number of elements in each set deeper in the stack can be 1346 used to determine the address of a particular set's array. */ 1347void 1348dfaanalyze(d, searchflag) 1349 struct dfa *d; 1350 int searchflag; 1351{ 1352 int *nullable; /* Nullable stack. */ 1353 int *nfirstpos; /* Element count stack for firstpos sets. */ 1354 position *firstpos; /* Array where firstpos elements are stored. */ 1355 int *nlastpos; /* Element count stack for lastpos sets. */ 1356 position *lastpos; /* Array where lastpos elements are stored. */ 1357 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1358 position_set tmp; /* Temporary set for merging sets. */ 1359 position_set merged; /* Result of merging sets. */ 1360 int wants_newline; /* True if some position wants newline info. */ 1361 int *o_nullable; 1362 int *o_nfirst, *o_nlast; 1363 position *o_firstpos, *o_lastpos; 1364 int i, j; 1365 position *pos; 1366 1367#ifdef DEBUG 1368 fprintf(stderr, "dfaanalyze:\n"); 1369 for (i = 0; i < d->tindex; ++i) 1370 { 1371 fprintf(stderr, " %d:", i); 1372 prtok(d->tokens[i]); 1373 } 1374 putc('\n', stderr); 1375#endif 1376 1377 d->searchflag = searchflag; 1378 1379 MALLOC(nullable, int, d->depth); 1380 o_nullable = nullable; 1381 MALLOC(nfirstpos, int, d->depth); 1382 o_nfirst = nfirstpos; 1383 MALLOC(firstpos, position, d->nleaves); 1384 o_firstpos = firstpos, firstpos += d->nleaves; 1385 MALLOC(nlastpos, int, d->depth); 1386 o_nlast = nlastpos; 1387 MALLOC(lastpos, position, d->nleaves); 1388 o_lastpos = lastpos, lastpos += d->nleaves; 1389 MALLOC(nalloc, int, d->tindex); 1390 for (i = 0; i < d->tindex; ++i) 1391 nalloc[i] = 0; 1392 MALLOC(merged.elems, position, d->nleaves); 1393 1394 CALLOC(d->follows, position_set, d->tindex); 1395 1396 for (i = 0; i < d->tindex; ++i) 1397#ifdef DEBUG 1398 { /* Nonsyntactic #ifdef goo... */ 1399#endif 1400 switch (d->tokens[i]) 1401 { 1402 case EMPTY: 1403 /* The empty set is nullable. */ 1404 *nullable++ = 1; 1405 1406 /* The firstpos and lastpos of the empty leaf are both empty. */ 1407 *nfirstpos++ = *nlastpos++ = 0; 1408 break; 1409 1410 case STAR: 1411 case PLUS: 1412 /* Every element in the firstpos of the argument is in the follow 1413 of every element in the lastpos. */ 1414 tmp.nelem = nfirstpos[-1]; 1415 tmp.elems = firstpos; 1416 pos = lastpos; 1417 for (j = 0; j < nlastpos[-1]; ++j) 1418 { 1419 merge(&tmp, &d->follows[pos[j].index], &merged); 1420 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1421 nalloc[pos[j].index], merged.nelem - 1); 1422 copy(&merged, &d->follows[pos[j].index]); 1423 } 1424 1425 case QMARK: 1426 /* A QMARK or STAR node is automatically nullable. */ 1427 if (d->tokens[i] != PLUS) 1428 nullable[-1] = 1; 1429 break; 1430 1431 case CAT: 1432 /* Every element in the firstpos of the second argument is in the 1433 follow of every element in the lastpos of the first argument. */ 1434 tmp.nelem = nfirstpos[-1]; 1435 tmp.elems = firstpos; 1436 pos = lastpos + nlastpos[-1]; 1437 for (j = 0; j < nlastpos[-2]; ++j) 1438 { 1439 merge(&tmp, &d->follows[pos[j].index], &merged); 1440 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1441 nalloc[pos[j].index], merged.nelem - 1); 1442 copy(&merged, &d->follows[pos[j].index]); 1443 } 1444 1445 /* The firstpos of a CAT node is the firstpos of the first argument, 1446 union that of the second argument if the first is nullable. */ 1447 if (nullable[-2]) 1448 nfirstpos[-2] += nfirstpos[-1]; 1449 else 1450 firstpos += nfirstpos[-1]; 1451 --nfirstpos; 1452 1453 /* The lastpos of a CAT node is the lastpos of the second argument, 1454 union that of the first argument if the second is nullable. */ 1455 if (nullable[-1]) 1456 nlastpos[-2] += nlastpos[-1]; 1457 else 1458 { 1459 pos = lastpos + nlastpos[-2]; 1460 for (j = nlastpos[-1] - 1; j >= 0; --j) 1461 pos[j] = lastpos[j]; 1462 lastpos += nlastpos[-2]; 1463 nlastpos[-2] = nlastpos[-1]; 1464 } 1465 --nlastpos; 1466 1467 /* A CAT node is nullable if both arguments are nullable. */ 1468 nullable[-2] = nullable[-1] && nullable[-2]; 1469 --nullable; 1470 break; 1471 1472 case OR: 1473 case ORTOP: 1474 /* The firstpos is the union of the firstpos of each argument. */ 1475 nfirstpos[-2] += nfirstpos[-1]; 1476 --nfirstpos; 1477 1478 /* The lastpos is the union of the lastpos of each argument. */ 1479 nlastpos[-2] += nlastpos[-1]; 1480 --nlastpos; 1481 1482 /* An OR node is nullable if either argument is nullable. */ 1483 nullable[-2] = nullable[-1] || nullable[-2]; 1484 --nullable; 1485 break; 1486 1487 default: 1488 /* Anything else is a nonempty position. (Note that special 1489 constructs like \< are treated as nonempty strings here; 1490 an "epsilon closure" effectively makes them nullable later. 1491 Backreferences have to get a real position so we can detect 1492 transitions on them later. But they are nullable. */ 1493 *nullable++ = d->tokens[i] == BACKREF; 1494 1495 /* This position is in its own firstpos and lastpos. */ 1496 *nfirstpos++ = *nlastpos++ = 1; 1497 --firstpos, --lastpos; 1498 firstpos->index = lastpos->index = i; 1499 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1500 1501 /* Allocate the follow set for this position. */ 1502 nalloc[i] = 1; 1503 MALLOC(d->follows[i].elems, position, nalloc[i]); 1504 break; 1505 } 1506#ifdef DEBUG 1507 /* ... balance the above nonsyntactic #ifdef goo... */ 1508 fprintf(stderr, "node %d:", i); 1509 prtok(d->tokens[i]); 1510 putc('\n', stderr); 1511 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1512 fprintf(stderr, " firstpos:"); 1513 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1514 { 1515 fprintf(stderr, " %d:", firstpos[j].index); 1516 prtok(d->tokens[firstpos[j].index]); 1517 } 1518 fprintf(stderr, "\n lastpos:"); 1519 for (j = nlastpos[-1] - 1; j >= 0; --j) 1520 { 1521 fprintf(stderr, " %d:", lastpos[j].index); 1522 prtok(d->tokens[lastpos[j].index]); 1523 } 1524 putc('\n', stderr); 1525 } 1526#endif 1527 1528 /* For each follow set that is the follow set of a real position, replace 1529 it with its epsilon closure. */ 1530 for (i = 0; i < d->tindex; ++i) 1531 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1532 || d->tokens[i] >= CSET) 1533 { 1534#ifdef DEBUG 1535 fprintf(stderr, "follows(%d:", i); 1536 prtok(d->tokens[i]); 1537 fprintf(stderr, "):"); 1538 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1539 { 1540 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1541 prtok(d->tokens[d->follows[i].elems[j].index]); 1542 } 1543 putc('\n', stderr); 1544#endif 1545 copy(&d->follows[i], &merged); 1546 epsclosure(&merged, d); 1547 if (d->follows[i].nelem < merged.nelem) 1548 REALLOC(d->follows[i].elems, position, merged.nelem); 1549 copy(&merged, &d->follows[i]); 1550 } 1551 1552 /* Get the epsilon closure of the firstpos of the regexp. The result will 1553 be the set of positions of state 0. */ 1554 merged.nelem = 0; 1555 for (i = 0; i < nfirstpos[-1]; ++i) 1556 insert(firstpos[i], &merged); 1557 epsclosure(&merged, d); 1558 1559 /* Check if any of the positions of state 0 will want newline context. */ 1560 wants_newline = 0; 1561 for (i = 0; i < merged.nelem; ++i) 1562 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1563 wants_newline = 1; 1564 1565 /* Build the initial state. */ 1566 d->salloc = 1; 1567 d->sindex = 0; 1568 MALLOC(d->states, dfa_state, d->salloc); 1569 state_index(d, &merged, wants_newline, 0); 1570 1571 free(o_nullable); 1572 free(o_nfirst); 1573 free(o_firstpos); 1574 free(o_nlast); 1575 free(o_lastpos); 1576 free(nalloc); 1577 free(merged.elems); 1578} 1579 1580/* Find, for each character, the transition out of state s of d, and store 1581 it in the appropriate slot of trans. 1582 1583 We divide the positions of s into groups (positions can appear in more 1584 than one group). Each group is labeled with a set of characters that 1585 every position in the group matches (taking into account, if necessary, 1586 preceding context information of s). For each group, find the union 1587 of the its elements' follows. This set is the set of positions of the 1588 new state. For each character in the group's label, set the transition 1589 on this character to be to a state corresponding to the set's positions, 1590 and its associated backward context information, if necessary. 1591 1592 If we are building a searching matcher, we include the positions of state 1593 0 in every state. 1594 1595 The collection of groups is constructed by building an equivalence-class 1596 partition of the positions of s. 1597 1598 For each position, find the set of characters C that it matches. Eliminate 1599 any characters from C that fail on grounds of backward context. 1600 1601 Search through the groups, looking for a group whose label L has nonempty 1602 intersection with C. If L - C is nonempty, create a new group labeled 1603 L - C and having the same positions as the current group, and set L to 1604 the intersection of L and C. Insert the position in this group, set 1605 C = C - L, and resume scanning. 1606 1607 If after comparing with every group there are characters remaining in C, 1608 create a new group labeled with the characters of C and insert this 1609 position in that group. */ 1610void 1611dfastate(s, d, trans) 1612 int s; 1613 struct dfa *d; 1614 int trans[]; 1615{ 1616 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1617 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1618 int ngrps = 0; /* Number of groups actually used. */ 1619 position pos; /* Current position being considered. */ 1620 charclass matches; /* Set of matching characters. */ 1621 int matchesf; /* True if matches is nonempty. */ 1622 charclass intersect; /* Intersection with some label set. */ 1623 int intersectf; /* True if intersect is nonempty. */ 1624 charclass leftovers; /* Stuff in the label that didn't match. */ 1625 int leftoversf; /* True if leftovers is nonempty. */ 1626 static charclass letters; /* Set of characters considered letters. */ 1627 static charclass newline; /* Set of characters that aren't newline. */ 1628 position_set follows; /* Union of the follows of some group. */ 1629 position_set tmp; /* Temporary space for merging sets. */ 1630 int state; /* New state. */ 1631 int wants_newline; /* New state wants to know newline context. */ 1632 int state_newline; /* New state on a newline transition. */ 1633 int wants_letter; /* New state wants to know letter context. */ 1634 int state_letter; /* New state on a letter transition. */ 1635 static int initialized; /* Flag for static initialization. */ 1636 int i, j, k; 1637 1638 /* Initialize the set of letters, if necessary. */ 1639 if (! initialized) 1640 { 1641 initialized = 1; 1642 for (i = 0; i < NOTCHAR; ++i) 1643 if (IS_WORD_CONSTITUENT(i)) 1644 setbit(i, letters); 1645 setbit(eolbyte, newline); 1646 } 1647 1648 zeroset(matches); 1649 1650 for (i = 0; i < d->states[s].elems.nelem; ++i) 1651 { 1652 pos = d->states[s].elems.elems[i]; 1653 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1654 setbit(d->tokens[pos.index], matches); 1655 else if (d->tokens[pos.index] >= CSET) 1656 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1657 else 1658 continue; 1659 1660 /* Some characters may need to be eliminated from matches because 1661 they fail in the current context. */ 1662 if (pos.constraint != 0xFF) 1663 { 1664 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1665 d->states[s].newline, 1)) 1666 clrbit(eolbyte, matches); 1667 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1668 d->states[s].newline, 0)) 1669 for (j = 0; j < CHARCLASS_INTS; ++j) 1670 matches[j] &= newline[j]; 1671 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1672 d->states[s].letter, 1)) 1673 for (j = 0; j < CHARCLASS_INTS; ++j) 1674 matches[j] &= ~letters[j]; 1675 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1676 d->states[s].letter, 0)) 1677 for (j = 0; j < CHARCLASS_INTS; ++j) 1678 matches[j] &= letters[j]; 1679 1680 /* If there are no characters left, there's no point in going on. */ 1681 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 1682 continue; 1683 if (j == CHARCLASS_INTS) 1684 continue; 1685 } 1686 1687 for (j = 0; j < ngrps; ++j) 1688 { 1689 /* If matches contains a single character only, and the current 1690 group's label doesn't contain that character, go on to the 1691 next group. */ 1692 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 1693 && !tstbit(d->tokens[pos.index], labels[j])) 1694 continue; 1695 1696 /* Check if this group's label has a nonempty intersection with 1697 matches. */ 1698 intersectf = 0; 1699 for (k = 0; k < CHARCLASS_INTS; ++k) 1700 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 1701 if (! intersectf) 1702 continue; 1703 1704 /* It does; now find the set differences both ways. */ 1705 leftoversf = matchesf = 0; 1706 for (k = 0; k < CHARCLASS_INTS; ++k) 1707 { 1708 /* Even an optimizing compiler can't know this for sure. */ 1709 int match = matches[k], label = labels[j][k]; 1710 1711 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 1712 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 1713 } 1714 1715 /* If there were leftovers, create a new group labeled with them. */ 1716 if (leftoversf) 1717 { 1718 copyset(leftovers, labels[ngrps]); 1719 copyset(intersect, labels[j]); 1720 MALLOC(grps[ngrps].elems, position, d->nleaves); 1721 copy(&grps[j], &grps[ngrps]); 1722 ++ngrps; 1723 } 1724 1725 /* Put the position in the current group. Note that there is no 1726 reason to call insert() here. */ 1727 grps[j].elems[grps[j].nelem++] = pos; 1728 1729 /* If every character matching the current position has been 1730 accounted for, we're done. */ 1731 if (! matchesf) 1732 break; 1733 } 1734 1735 /* If we've passed the last group, and there are still characters 1736 unaccounted for, then we'll have to create a new group. */ 1737 if (j == ngrps) 1738 { 1739 copyset(matches, labels[ngrps]); 1740 zeroset(matches); 1741 MALLOC(grps[ngrps].elems, position, d->nleaves); 1742 grps[ngrps].nelem = 1; 1743 grps[ngrps].elems[0] = pos; 1744 ++ngrps; 1745 } 1746 } 1747 1748 MALLOC(follows.elems, position, d->nleaves); 1749 MALLOC(tmp.elems, position, d->nleaves); 1750 1751 /* If we are a searching matcher, the default transition is to a state 1752 containing the positions of state 0, otherwise the default transition 1753 is to fail miserably. */ 1754 if (d->searchflag) 1755 { 1756 wants_newline = 0; 1757 wants_letter = 0; 1758 for (i = 0; i < d->states[0].elems.nelem; ++i) 1759 { 1760 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1761 wants_newline = 1; 1762 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1763 wants_letter = 1; 1764 } 1765 copy(&d->states[0].elems, &follows); 1766 state = state_index(d, &follows, 0, 0); 1767 if (wants_newline) 1768 state_newline = state_index(d, &follows, 1, 0); 1769 else 1770 state_newline = state; 1771 if (wants_letter) 1772 state_letter = state_index(d, &follows, 0, 1); 1773 else 1774 state_letter = state; 1775 for (i = 0; i < NOTCHAR; ++i) 1776 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 1777 trans[eolbyte] = state_newline; 1778 } 1779 else 1780 for (i = 0; i < NOTCHAR; ++i) 1781 trans[i] = -1; 1782 1783 for (i = 0; i < ngrps; ++i) 1784 { 1785 follows.nelem = 0; 1786 1787 /* Find the union of the follows of the positions of the group. 1788 This is a hideously inefficient loop. Fix it someday. */ 1789 for (j = 0; j < grps[i].nelem; ++j) 1790 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 1791 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 1792 1793 /* If we are building a searching matcher, throw in the positions 1794 of state 0 as well. */ 1795 if (d->searchflag) 1796 for (j = 0; j < d->states[0].elems.nelem; ++j) 1797 insert(d->states[0].elems.elems[j], &follows); 1798 1799 /* Find out if the new state will want any context information. */ 1800 wants_newline = 0; 1801 if (tstbit(eolbyte, labels[i])) 1802 for (j = 0; j < follows.nelem; ++j) 1803 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 1804 wants_newline = 1; 1805 1806 wants_letter = 0; 1807 for (j = 0; j < CHARCLASS_INTS; ++j) 1808 if (labels[i][j] & letters[j]) 1809 break; 1810 if (j < CHARCLASS_INTS) 1811 for (j = 0; j < follows.nelem; ++j) 1812 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 1813 wants_letter = 1; 1814 1815 /* Find the state(s) corresponding to the union of the follows. */ 1816 state = state_index(d, &follows, 0, 0); 1817 if (wants_newline) 1818 state_newline = state_index(d, &follows, 1, 0); 1819 else 1820 state_newline = state; 1821 if (wants_letter) 1822 state_letter = state_index(d, &follows, 0, 1); 1823 else 1824 state_letter = state; 1825 1826 /* Set the transitions for each character in the current label. */ 1827 for (j = 0; j < CHARCLASS_INTS; ++j) 1828 for (k = 0; k < INTBITS; ++k) 1829 if (labels[i][j] & 1 << k) 1830 { 1831 int c = j * INTBITS + k; 1832 1833 if (c == eolbyte) 1834 trans[c] = state_newline; 1835 else if (IS_WORD_CONSTITUENT(c)) 1836 trans[c] = state_letter; 1837 else if (c < NOTCHAR) 1838 trans[c] = state; 1839 } 1840 } 1841 1842 for (i = 0; i < ngrps; ++i) 1843 free(grps[i].elems); 1844 free(follows.elems); 1845 free(tmp.elems); 1846} 1847 1848/* Some routines for manipulating a compiled dfa's transition tables. 1849 Each state may or may not have a transition table; if it does, and it 1850 is a non-accepting state, then d->trans[state] points to its table. 1851 If it is an accepting state then d->fails[state] points to its table. 1852 If it has no table at all, then d->trans[state] is NULL. 1853 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 1854 1855static void 1856build_state(s, d) 1857 int s; 1858 struct dfa *d; 1859{ 1860 int *trans; /* The new transition table. */ 1861 int i; 1862 1863 /* Set an upper limit on the number of transition tables that will ever 1864 exist at once. 1024 is arbitrary. The idea is that the frequently 1865 used transition tables will be quickly rebuilt, whereas the ones that 1866 were only needed once or twice will be cleared away. */ 1867 if (d->trcount >= 1024) 1868 { 1869 for (i = 0; i < d->tralloc; ++i) 1870 if (d->trans[i]) 1871 { 1872 free((ptr_t) d->trans[i]); 1873 d->trans[i] = NULL; 1874 } 1875 else if (d->fails[i]) 1876 { 1877 free((ptr_t) d->fails[i]); 1878 d->fails[i] = NULL; 1879 } 1880 d->trcount = 0; 1881 } 1882 1883 ++d->trcount; 1884 1885 /* Set up the success bits for this state. */ 1886 d->success[s] = 0; 1887 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 1888 s, *d)) 1889 d->success[s] |= 4; 1890 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 1891 s, *d)) 1892 d->success[s] |= 2; 1893 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 1894 s, *d)) 1895 d->success[s] |= 1; 1896 1897 MALLOC(trans, int, NOTCHAR); 1898 dfastate(s, d, trans); 1899 1900 /* Now go through the new transition table, and make sure that the trans 1901 and fail arrays are allocated large enough to hold a pointer for the 1902 largest state mentioned in the table. */ 1903 for (i = 0; i < NOTCHAR; ++i) 1904 if (trans[i] >= d->tralloc) 1905 { 1906 int oldalloc = d->tralloc; 1907 1908 while (trans[i] >= d->tralloc) 1909 d->tralloc *= 2; 1910 REALLOC(d->realtrans, int *, d->tralloc + 1); 1911 d->trans = d->realtrans + 1; 1912 REALLOC(d->fails, int *, d->tralloc); 1913 REALLOC(d->success, int, d->tralloc); 1914 REALLOC(d->newlines, int, d->tralloc); 1915 while (oldalloc < d->tralloc) 1916 { 1917 d->trans[oldalloc] = NULL; 1918 d->fails[oldalloc++] = NULL; 1919 } 1920 } 1921 1922 /* Keep the newline transition in a special place so we can use it as 1923 a sentinel. */ 1924 d->newlines[s] = trans[eolbyte]; 1925 trans[eolbyte] = -1; 1926 1927 if (ACCEPTING(s, *d)) 1928 d->fails[s] = trans; 1929 else 1930 d->trans[s] = trans; 1931} 1932 1933static void 1934build_state_zero(d) 1935 struct dfa *d; 1936{ 1937 d->tralloc = 1; 1938 d->trcount = 0; 1939 CALLOC(d->realtrans, int *, d->tralloc + 1); 1940 d->trans = d->realtrans + 1; 1941 CALLOC(d->fails, int *, d->tralloc); 1942 MALLOC(d->success, int, d->tralloc); 1943 MALLOC(d->newlines, int, d->tralloc); 1944 build_state(0, d); 1945} 1946 1947/* Search through a buffer looking for a match to the given struct dfa. 1948 Find the first occurrence of a string matching the regexp in the buffer, 1949 and the shortest possible version thereof. Return a pointer to the first 1950 character after the match, or NULL if none is found. Begin points to 1951 the beginning of the buffer, and end points to the first character after 1952 its end. We store a newline in *end to act as a sentinel, so end had 1953 better point somewhere valid. Newline is a flag indicating whether to 1954 allow newlines to be in the matching string. If count is non- 1955 NULL it points to a place we're supposed to increment every time we 1956 see a newline. Finally, if backref is non-NULL it points to a place 1957 where we're supposed to store a 1 if backreferencing happened and the 1958 match needs to be verified by a backtracking matcher. Otherwise 1959 we store a 0 in *backref. */ 1960char * 1961dfaexec(d, begin, end, newline, count, backref) 1962 struct dfa *d; 1963 char *begin; 1964 char *end; 1965 int newline; 1966 int *count; 1967 int *backref; 1968{ 1969 register int s, s1, tmp; /* Current state. */ 1970 register unsigned char *p; /* Current input character. */ 1971 register int **trans, *t; /* Copy of d->trans so it can be optimized 1972 into a register. */ 1973 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 1974 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 1975 static int sbit_init; 1976 1977 if (! sbit_init) 1978 { 1979 int i; 1980 1981 sbit_init = 1; 1982 for (i = 0; i < NOTCHAR; ++i) 1983 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 1984 sbit[eol] = 4; 1985 } 1986 1987 if (! d->tralloc) 1988 build_state_zero(d); 1989 1990 s = s1 = 0; 1991 p = (unsigned char *) begin; 1992 trans = d->trans; 1993 *end = eol; 1994 1995 for (;;) 1996 { 1997 while ((t = trans[s]) != 0) { /* hand-optimized loop */ 1998 s1 = t[*p++]; 1999 if ((t = trans[s1]) == 0) { 2000 tmp = s ; s = s1 ; s1 = tmp ; /* swap */ 2001 break; 2002 } 2003 s = t[*p++]; 2004 } 2005 2006 if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) 2007 { 2008 if (d->success[s] & sbit[*p]) 2009 { 2010 if (backref) 2011 *backref = (d->states[s].backref != 0); 2012 return (char *) p; 2013 } 2014 2015 s1 = s; 2016 s = d->fails[s][*p++]; 2017 continue; 2018 } 2019 2020 /* If the previous character was a newline, count it. */ 2021 if (count && (char *) p <= end && p[-1] == eol) 2022 ++*count; 2023 2024 /* Check if we've run off the end of the buffer. */ 2025 if ((char *) p > end) 2026 return NULL; 2027 2028 if (s >= 0) 2029 { 2030 build_state(s, d); 2031 trans = d->trans; 2032 continue; 2033 } 2034 2035 if (p[-1] == eol && newline) 2036 { 2037 s = d->newlines[s1]; 2038 continue; 2039 } 2040 2041 s = 0; 2042 } 2043} 2044 2045/* Initialize the components of a dfa that the other routines don't 2046 initialize for themselves. */ 2047void 2048dfainit(d) 2049 struct dfa *d; 2050{ 2051 d->calloc = 1; 2052 MALLOC(d->charclasses, charclass, d->calloc); 2053 d->cindex = 0; 2054 2055 d->talloc = 1; 2056 MALLOC(d->tokens, token, d->talloc); 2057 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2058 2059 d->searchflag = 0; 2060 d->tralloc = 0; 2061 2062 d->musts = 0; 2063} 2064 2065/* Parse and analyze a single string of the given length. */ 2066void 2067dfacomp(s, len, d, searchflag) 2068 char *s; 2069 size_t len; 2070 struct dfa *d; 2071 int searchflag; 2072{ 2073 if (case_fold) /* dummy folding in service of dfamust() */ 2074 { 2075 char *lcopy; 2076 int i; 2077 2078 lcopy = malloc(len); 2079 if (!lcopy) 2080 dfaerror(_("out of memory")); 2081 2082 /* This is a kludge. */ 2083 case_fold = 0; 2084 for (i = 0; i < len; ++i) 2085 if (ISUPPER ((unsigned char) s[i])) 2086 lcopy[i] = tolower ((unsigned char) s[i]); 2087 else 2088 lcopy[i] = s[i]; 2089 2090 dfainit(d); 2091 dfaparse(lcopy, len, d); 2092 free(lcopy); 2093 dfamust(d); 2094 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2095 case_fold = 1; 2096 dfaparse(s, len, d); 2097 dfaanalyze(d, searchflag); 2098 } 2099 else 2100 { 2101 dfainit(d); 2102 dfaparse(s, len, d); 2103 dfamust(d); 2104 dfaanalyze(d, searchflag); 2105 } 2106} 2107 2108/* Free the storage held by the components of a dfa. */ 2109void 2110dfafree(d) 2111 struct dfa *d; 2112{ 2113 int i; 2114 struct dfamust *dm, *ndm; 2115 2116 free((ptr_t) d->charclasses); 2117 free((ptr_t) d->tokens); 2118 for (i = 0; i < d->sindex; ++i) 2119 free((ptr_t) d->states[i].elems.elems); 2120 free((ptr_t) d->states); 2121 for (i = 0; i < d->tindex; ++i) 2122 if (d->follows[i].elems) 2123 free((ptr_t) d->follows[i].elems); 2124 free((ptr_t) d->follows); 2125 for (i = 0; i < d->tralloc; ++i) 2126 if (d->trans[i]) 2127 free((ptr_t) d->trans[i]); 2128 else if (d->fails[i]) 2129 free((ptr_t) d->fails[i]); 2130 if (d->realtrans) free((ptr_t) d->realtrans); 2131 if (d->fails) free((ptr_t) d->fails); 2132 if (d->newlines) free((ptr_t) d->newlines); 2133 if (d->success) free((ptr_t) d->success); 2134 for (dm = d->musts; dm; dm = ndm) 2135 { 2136 ndm = dm->next; 2137 free(dm->must); 2138 free((ptr_t) dm); 2139 } 2140} 2141 2142/* Having found the postfix representation of the regular expression, 2143 try to find a long sequence of characters that must appear in any line 2144 containing the r.e. 2145 Finding a "longest" sequence is beyond the scope here; 2146 we take an easy way out and hope for the best. 2147 (Take "(ab|a)b"--please.) 2148 2149 We do a bottom-up calculation of sequences of characters that must appear 2150 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 2151 representation: 2152 sequences that must appear at the left of the match ("left") 2153 sequences that must appear at the right of the match ("right") 2154 lists of sequences that must appear somewhere in the match ("in") 2155 sequences that must constitute the match ("is") 2156 2157 When we get to the root of the tree, we use one of the longest of its 2158 calculated "in" sequences as our answer. The sequence we find is returned in 2159 d->must (where "d" is the single argument passed to "dfamust"); 2160 the length of the sequence is returned in d->mustn. 2161 2162 The sequences calculated for the various types of node (in pseudo ANSI c) 2163 are shown below. "p" is the operand of unary operators (and the left-hand 2164 operand of binary operators); "q" is the right-hand operand of binary 2165 operators. 2166 2167 "ZERO" means "a zero-length sequence" below. 2168 2169 Type left right is in 2170 ---- ---- ----- -- -- 2171 char c # c # c # c # c 2172 2173 CSET ZERO ZERO ZERO ZERO 2174 2175 STAR ZERO ZERO ZERO ZERO 2176 2177 QMARK ZERO ZERO ZERO ZERO 2178 2179 PLUS p->left p->right ZERO p->in 2180 2181 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 2182 p->left : q->right : q->is!=ZERO) ? q->in plus 2183 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 2184 ZERO 2185 2186 OR longest common longest common (do p->is and substrings common to 2187 leading trailing q->is have same p->in and q->in 2188 (sub)sequence (sub)sequence length and 2189 of p->left of p->right content) ? 2190 and q->left and q->right p->is : NULL 2191 2192 If there's anything else we recognize in the tree, all four sequences get set 2193 to zero-length sequences. If there's something we don't recognize in the tree, 2194 we just return a zero-length sequence. 2195 2196 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 2197 'aaa')? 2198 2199 And. . .is it here or someplace that we might ponder "optimizations" such as 2200 egrep 'psi|epsilon' -> egrep 'psi' 2201 egrep 'pepsi|epsilon' -> egrep 'epsi' 2202 (Yes, we now find "epsi" as a "string 2203 that must occur", but we might also 2204 simplify the *entire* r.e. being sought) 2205 grep '[c]' -> grep 'c' 2206 grep '(ab|a)b' -> grep 'ab' 2207 grep 'ab*' -> grep 'a' 2208 grep 'a*b' -> grep 'b' 2209 2210 There are several issues: 2211 2212 Is optimization easy (enough)? 2213 2214 Does optimization actually accomplish anything, 2215 or is the automaton you get from "psi|epsilon" (for example) 2216 the same as the one you get from "psi" (for example)? 2217 2218 Are optimizable r.e.'s likely to be used in real-life situations 2219 (something like 'ab*' is probably unlikely; something like is 2220 'psi|epsilon' is likelier)? */ 2221 2222static char * 2223icatalloc(old, new) 2224 char *old; 2225 char *new; 2226{ 2227 char *result; 2228 size_t oldsize, newsize; 2229 2230 newsize = (new == NULL) ? 0 : strlen(new); 2231 if (old == NULL) 2232 oldsize = 0; 2233 else if (newsize == 0) 2234 return old; 2235 else oldsize = strlen(old); 2236 if (old == NULL) 2237 result = (char *) malloc(newsize + 1); 2238 else 2239 result = (char *) realloc((void *) old, oldsize + newsize + 1); 2240 if (result != NULL && new != NULL) 2241 (void) strcpy(result + oldsize, new); 2242 return result; 2243} 2244 2245static char * 2246icpyalloc(string) 2247 char *string; 2248{ 2249 return icatalloc((char *) NULL, string); 2250} 2251 2252static char * 2253istrstr(lookin, lookfor) 2254 char *lookin; 2255 char *lookfor; 2256{ 2257 char *cp; 2258 size_t len; 2259 2260 len = strlen(lookfor); 2261 for (cp = lookin; *cp != '\0'; ++cp) 2262 if (strncmp(cp, lookfor, len) == 0) 2263 return cp; 2264 return NULL; 2265} 2266 2267static void 2268ifree(cp) 2269 char *cp; 2270{ 2271 if (cp != NULL) 2272 free(cp); 2273} 2274 2275static void 2276freelist(cpp) 2277 char **cpp; 2278{ 2279 int i; 2280 2281 if (cpp == NULL) 2282 return; 2283 for (i = 0; cpp[i] != NULL; ++i) 2284 { 2285 free(cpp[i]); 2286 cpp[i] = NULL; 2287 } 2288} 2289 2290static char ** 2291enlist(cpp, new, len) 2292 char **cpp; 2293 char *new; 2294 size_t len; 2295{ 2296 int i, j; 2297 2298 if (cpp == NULL) 2299 return NULL; 2300 if ((new = icpyalloc(new)) == NULL) 2301 { 2302 freelist(cpp); 2303 return NULL; 2304 } 2305 new[len] = '\0'; 2306 /* Is there already something in the list that's new (or longer)? */ 2307 for (i = 0; cpp[i] != NULL; ++i) 2308 if (istrstr(cpp[i], new) != NULL) 2309 { 2310 free(new); 2311 return cpp; 2312 } 2313 /* Eliminate any obsoleted strings. */ 2314 j = 0; 2315 while (cpp[j] != NULL) 2316 if (istrstr(new, cpp[j]) == NULL) 2317 ++j; 2318 else 2319 { 2320 free(cpp[j]); 2321 if (--i == j) 2322 break; 2323 cpp[j] = cpp[i]; 2324 cpp[i] = NULL; 2325 } 2326 /* Add the new string. */ 2327 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 2328 if (cpp == NULL) 2329 return NULL; 2330 cpp[i] = new; 2331 cpp[i + 1] = NULL; 2332 return cpp; 2333} 2334 2335/* Given pointers to two strings, return a pointer to an allocated 2336 list of their distinct common substrings. Return NULL if something 2337 seems wild. */ 2338static char ** 2339comsubs(left, right) 2340 char *left; 2341 char *right; 2342{ 2343 char **cpp; 2344 char *lcp; 2345 char *rcp; 2346 size_t i, len; 2347 2348 if (left == NULL || right == NULL) 2349 return NULL; 2350 cpp = (char **) malloc(sizeof *cpp); 2351 if (cpp == NULL) 2352 return NULL; 2353 cpp[0] = NULL; 2354 for (lcp = left; *lcp != '\0'; ++lcp) 2355 { 2356 len = 0; 2357 rcp = index(right, *lcp); 2358 while (rcp != NULL) 2359 { 2360 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 2361 continue; 2362 if (i > len) 2363 len = i; 2364 rcp = index(rcp + 1, *lcp); 2365 } 2366 if (len == 0) 2367 continue; 2368 if ((cpp = enlist(cpp, lcp, len)) == NULL) 2369 break; 2370 } 2371 return cpp; 2372} 2373 2374static char ** 2375addlists(old, new) 2376char **old; 2377char **new; 2378{ 2379 int i; 2380 2381 if (old == NULL || new == NULL) 2382 return NULL; 2383 for (i = 0; new[i] != NULL; ++i) 2384 { 2385 old = enlist(old, new[i], strlen(new[i])); 2386 if (old == NULL) 2387 break; 2388 } 2389 return old; 2390} 2391 2392/* Given two lists of substrings, return a new list giving substrings 2393 common to both. */ 2394static char ** 2395inboth(left, right) 2396 char **left; 2397 char **right; 2398{ 2399 char **both; 2400 char **temp; 2401 int lnum, rnum; 2402 2403 if (left == NULL || right == NULL) 2404 return NULL; 2405 both = (char **) malloc(sizeof *both); 2406 if (both == NULL) 2407 return NULL; 2408 both[0] = NULL; 2409 for (lnum = 0; left[lnum] != NULL; ++lnum) 2410 { 2411 for (rnum = 0; right[rnum] != NULL; ++rnum) 2412 { 2413 temp = comsubs(left[lnum], right[rnum]); 2414 if (temp == NULL) 2415 { 2416 freelist(both); 2417 return NULL; 2418 } 2419 both = addlists(both, temp); 2420 freelist(temp); 2421 free(temp); 2422 if (both == NULL) 2423 return NULL; 2424 } 2425 } 2426 return both; 2427} 2428 2429typedef struct 2430{ 2431 char **in; 2432 char *left; 2433 char *right; 2434 char *is; 2435} must; 2436 2437static void 2438resetmust(mp) 2439must *mp; 2440{ 2441 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 2442 freelist(mp->in); 2443} 2444 2445static void 2446dfamust(dfa) 2447struct dfa *dfa; 2448{ 2449 must *musts; 2450 must *mp; 2451 char *result; 2452 int ri; 2453 int i; 2454 int exact; 2455 token t; 2456 static must must0; 2457 struct dfamust *dm; 2458 static char empty_string[] = ""; 2459 2460 result = empty_string; 2461 exact = 0; 2462 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 2463 if (musts == NULL) 2464 return; 2465 mp = musts; 2466 for (i = 0; i <= dfa->tindex; ++i) 2467 mp[i] = must0; 2468 for (i = 0; i <= dfa->tindex; ++i) 2469 { 2470 mp[i].in = (char **) malloc(sizeof *mp[i].in); 2471 mp[i].left = malloc(2); 2472 mp[i].right = malloc(2); 2473 mp[i].is = malloc(2); 2474 if (mp[i].in == NULL || mp[i].left == NULL || 2475 mp[i].right == NULL || mp[i].is == NULL) 2476 goto done; 2477 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 2478 mp[i].in[0] = NULL; 2479 } 2480#ifdef DEBUG 2481 fprintf(stderr, "dfamust:\n"); 2482 for (i = 0; i < dfa->tindex; ++i) 2483 { 2484 fprintf(stderr, " %d:", i); 2485 prtok(dfa->tokens[i]); 2486 } 2487 putc('\n', stderr); 2488#endif 2489 for (ri = 0; ri < dfa->tindex; ++ri) 2490 { 2491 switch (t = dfa->tokens[ri]) 2492 { 2493 case LPAREN: 2494 case RPAREN: 2495 goto done; /* "cannot happen" */ 2496 case EMPTY: 2497 case BEGLINE: 2498 case ENDLINE: 2499 case BEGWORD: 2500 case ENDWORD: 2501 case LIMWORD: 2502 case NOTLIMWORD: 2503 case BACKREF: 2504 resetmust(mp); 2505 break; 2506 case STAR: 2507 case QMARK: 2508 if (mp <= musts) 2509 goto done; /* "cannot happen" */ 2510 --mp; 2511 resetmust(mp); 2512 break; 2513 case OR: 2514 case ORTOP: 2515 if (mp < &musts[2]) 2516 goto done; /* "cannot happen" */ 2517 { 2518 char **new; 2519 must *lmp; 2520 must *rmp; 2521 int j, ln, rn, n; 2522 2523 rmp = --mp; 2524 lmp = --mp; 2525 /* Guaranteed to be. Unlikely, but. . . */ 2526 if (strcmp(lmp->is, rmp->is) != 0) 2527 lmp->is[0] = '\0'; 2528 /* Left side--easy */ 2529 i = 0; 2530 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 2531 ++i; 2532 lmp->left[i] = '\0'; 2533 /* Right side */ 2534 ln = strlen(lmp->right); 2535 rn = strlen(rmp->right); 2536 n = ln; 2537 if (n > rn) 2538 n = rn; 2539 for (i = 0; i < n; ++i) 2540 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 2541 break; 2542 for (j = 0; j < i; ++j) 2543 lmp->right[j] = lmp->right[(ln - i) + j]; 2544 lmp->right[j] = '\0'; 2545 new = inboth(lmp->in, rmp->in); 2546 if (new == NULL) 2547 goto done; 2548 freelist(lmp->in); 2549 free((char *) lmp->in); 2550 lmp->in = new; 2551 } 2552 break; 2553 case PLUS: 2554 if (mp <= musts) 2555 goto done; /* "cannot happen" */ 2556 --mp; 2557 mp->is[0] = '\0'; 2558 break; 2559 case END: 2560 if (mp != &musts[1]) 2561 goto done; /* "cannot happen" */ 2562 for (i = 0; musts[0].in[i] != NULL; ++i) 2563 if (strlen(musts[0].in[i]) > strlen(result)) 2564 result = musts[0].in[i]; 2565 if (strcmp(result, musts[0].is) == 0) 2566 exact = 1; 2567 goto done; 2568 case CAT: 2569 if (mp < &musts[2]) 2570 goto done; /* "cannot happen" */ 2571 { 2572 must *lmp; 2573 must *rmp; 2574 2575 rmp = --mp; 2576 lmp = --mp; 2577 /* In. Everything in left, plus everything in 2578 right, plus catenation of 2579 left's right and right's left. */ 2580 lmp->in = addlists(lmp->in, rmp->in); 2581 if (lmp->in == NULL) 2582 goto done; 2583 if (lmp->right[0] != '\0' && 2584 rmp->left[0] != '\0') 2585 { 2586 char *tp; 2587 2588 tp = icpyalloc(lmp->right); 2589 if (tp == NULL) 2590 goto done; 2591 tp = icatalloc(tp, rmp->left); 2592 if (tp == NULL) 2593 goto done; 2594 lmp->in = enlist(lmp->in, tp, 2595 strlen(tp)); 2596 free(tp); 2597 if (lmp->in == NULL) 2598 goto done; 2599 } 2600 /* Left-hand */ 2601 if (lmp->is[0] != '\0') 2602 { 2603 lmp->left = icatalloc(lmp->left, 2604 rmp->left); 2605 if (lmp->left == NULL) 2606 goto done; 2607 } 2608 /* Right-hand */ 2609 if (rmp->is[0] == '\0') 2610 lmp->right[0] = '\0'; 2611 lmp->right = icatalloc(lmp->right, rmp->right); 2612 if (lmp->right == NULL) 2613 goto done; 2614 /* Guaranteed to be */ 2615 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 2616 { 2617 lmp->is = icatalloc(lmp->is, rmp->is); 2618 if (lmp->is == NULL) 2619 goto done; 2620 } 2621 else 2622 lmp->is[0] = '\0'; 2623 } 2624 break; 2625 default: 2626 if (t < END) 2627 { 2628 /* "cannot happen" */ 2629 goto done; 2630 } 2631 else if (t == '\0') 2632 { 2633 /* not on *my* shift */ 2634 goto done; 2635 } 2636 else if (t >= CSET) 2637 { 2638 /* easy enough */ 2639 resetmust(mp); 2640 } 2641 else 2642 { 2643 /* plain character */ 2644 resetmust(mp); 2645 mp->is[0] = mp->left[0] = mp->right[0] = t; 2646 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 2647 mp->in = enlist(mp->in, mp->is, (size_t)1); 2648 if (mp->in == NULL) 2649 goto done; 2650 } 2651 break; 2652 } 2653#ifdef DEBUG 2654 fprintf(stderr, " node: %d:", ri); 2655 prtok(dfa->tokens[ri]); 2656 fprintf(stderr, "\n in:"); 2657 for (i = 0; mp->in[i]; ++i) 2658 fprintf(stderr, " \"%s\"", mp->in[i]); 2659 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 2660 fprintf(stderr, " left: \"%s\"\n", mp->left); 2661 fprintf(stderr, " right: \"%s\"\n", mp->right); 2662#endif 2663 ++mp; 2664 } 2665 done: 2666 if (strlen(result)) 2667 { 2668 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 2669 dm->exact = exact; 2670 dm->must = malloc(strlen(result) + 1); 2671 strcpy(dm->must, result); 2672 dm->next = dfa->musts; 2673 dfa->musts = dm; 2674 } 2675 mp = musts; 2676 for (i = 0; i <= dfa->tindex; ++i) 2677 { 2678 freelist(mp[i].in); 2679 ifree((char *) mp[i].in); 2680 ifree(mp[i].left); 2681 ifree(mp[i].right); 2682 ifree(mp[i].is); 2683 } 2684 free((char *) mp); 2685} 2686