dfa.c revision 126435
1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 126435 2004-03-01 08:37:20Z ache $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#undef index 42#define index strchr 43#else 44#include <strings.h> 45#endif 46 47#ifndef DEBUG /* use the same approach as regex.c */ 48#undef assert 49#define assert(e) 50#endif /* DEBUG */ 51 52#ifndef isgraph 53#define isgraph(C) (isprint(C) && !isspace(C)) 54#endif 55 56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 57#define ISALPHA(C) isalpha(C) 58#define ISUPPER(C) isupper(C) 59#define ISLOWER(C) islower(C) 60#define ISDIGIT(C) isdigit(C) 61#define ISXDIGIT(C) isxdigit(C) 62#define ISSPACE(C) isspace(C) 63#define ISPUNCT(C) ispunct(C) 64#define ISALNUM(C) isalnum(C) 65#define ISPRINT(C) isprint(C) 66#define ISGRAPH(C) isgraph(C) 67#define ISCNTRL(C) iscntrl(C) 68#else 69#define ISALPHA(C) (isascii(C) && isalpha(C)) 70#define ISUPPER(C) (isascii(C) && isupper(C)) 71#define ISLOWER(C) (isascii(C) && islower(C)) 72#define ISDIGIT(C) (isascii(C) && isdigit(C)) 73#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 74#define ISSPACE(C) (isascii(C) && isspace(C)) 75#define ISPUNCT(C) (isascii(C) && ispunct(C)) 76#define ISALNUM(C) (isascii(C) && isalnum(C)) 77#define ISPRINT(C) (isascii(C) && isprint(C)) 78#define ISGRAPH(C) (isascii(C) && isgraph(C)) 79#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 80#endif 81 82/* ISASCIIDIGIT differs from ISDIGIT, as follows: 83 - Its arg may be any int or unsigned int; it need not be an unsigned char. 84 - It's guaranteed to evaluate its argument exactly once. 85 - It's typically faster. 86 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 87 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 88 it's important to use the locale's definition of `digit' even when the 89 host does not conform to Posix. */ 90#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 91 92/* If we (don't) have I18N. */ 93/* glibc defines _ */ 94#ifndef _ 95# ifdef HAVE_LIBINTL_H 96# include <libintl.h> 97# ifndef _ 98# define _(Str) gettext (Str) 99# endif 100# else 101# define _(Str) (Str) 102# endif 103#endif 104 105#include "regex.h" 106#include "dfa.h" 107 108/* HPUX, define those as macros in sys/param.h */ 109#ifdef setbit 110# undef setbit 111#endif 112#ifdef clrbit 113# undef clrbit 114#endif 115 116static void dfamust PARAMS ((struct dfa *dfa)); 117 118static ptr_t xcalloc PARAMS ((size_t n, size_t s)); 119static ptr_t xmalloc PARAMS ((size_t n)); 120static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); 121#ifdef DEBUG 122static void prtok PARAMS ((token t)); 123#endif 124static int tstbit PARAMS ((int b, charclass c)); 125static void setbit PARAMS ((int b, charclass c)); 126static void clrbit PARAMS ((int b, charclass c)); 127static void copyset PARAMS ((charclass src, charclass dst)); 128static void zeroset PARAMS ((charclass s)); 129static void notset PARAMS ((charclass s)); 130static int equal PARAMS ((charclass s1, charclass s2)); 131static int charclass_index PARAMS ((charclass s)); 132static int looking_at PARAMS ((const char *s)); 133static token lex PARAMS ((void)); 134static void addtok PARAMS ((token t)); 135static void atom PARAMS ((void)); 136static int nsubtoks PARAMS ((int tindex)); 137static void copytoks PARAMS ((int tindex, int ntokens)); 138static void closure PARAMS ((void)); 139static void branch PARAMS ((void)); 140static void regexp PARAMS ((int toplevel)); 141static void copy PARAMS ((position_set *src, position_set *dst)); 142static void insert PARAMS ((position p, position_set *s)); 143static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); 144static void delete PARAMS ((position p, position_set *s)); 145static int state_index PARAMS ((struct dfa *d, position_set *s, 146 int newline, int letter)); 147static void build_state PARAMS ((int s, struct dfa *d)); 148static void build_state_zero PARAMS ((struct dfa *d)); 149static char *icatalloc PARAMS ((char *old, char *new)); 150static char *icpyalloc PARAMS ((char *string)); 151static char *istrstr PARAMS ((char *lookin, char *lookfor)); 152static void ifree PARAMS ((char *cp)); 153static void freelist PARAMS ((char **cpp)); 154static char **enlist PARAMS ((char **cpp, char *new, size_t len)); 155static char **comsubs PARAMS ((char *left, char *right)); 156static char **addlists PARAMS ((char **old, char **new)); 157static char **inboth PARAMS ((char **left, char **right)); 158 159static ptr_t 160xcalloc (size_t n, size_t s) 161{ 162 ptr_t r = calloc(n, s); 163 164 if (!r) 165 dfaerror(_("Memory exhausted")); 166 return r; 167} 168 169static ptr_t 170xmalloc (size_t n) 171{ 172 ptr_t r = malloc(n); 173 174 assert(n != 0); 175 if (!r) 176 dfaerror(_("Memory exhausted")); 177 return r; 178} 179 180static ptr_t 181xrealloc (ptr_t p, size_t n) 182{ 183 ptr_t r = realloc(p, n); 184 185 assert(n != 0); 186 if (!r) 187 dfaerror(_("Memory exhausted")); 188 return r; 189} 190 191#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 192#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 193#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 194 195/* Reallocate an array of type t if nalloc is too small for index. */ 196#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 197 if ((index) >= (nalloc)) \ 198 { \ 199 while ((index) >= (nalloc)) \ 200 (nalloc) *= 2; \ 201 REALLOC(p, t, nalloc); \ 202 } 203 204#ifdef DEBUG 205 206static void 207prtok (token t) 208{ 209 char *s; 210 211 if (t < 0) 212 fprintf(stderr, "END"); 213 else if (t < NOTCHAR) 214 fprintf(stderr, "%c", t); 215 else 216 { 217 switch (t) 218 { 219 case EMPTY: s = "EMPTY"; break; 220 case BACKREF: s = "BACKREF"; break; 221 case BEGLINE: s = "BEGLINE"; break; 222 case ENDLINE: s = "ENDLINE"; break; 223 case BEGWORD: s = "BEGWORD"; break; 224 case ENDWORD: s = "ENDWORD"; break; 225 case LIMWORD: s = "LIMWORD"; break; 226 case NOTLIMWORD: s = "NOTLIMWORD"; break; 227 case QMARK: s = "QMARK"; break; 228 case STAR: s = "STAR"; break; 229 case PLUS: s = "PLUS"; break; 230 case CAT: s = "CAT"; break; 231 case OR: s = "OR"; break; 232 case ORTOP: s = "ORTOP"; break; 233 case LPAREN: s = "LPAREN"; break; 234 case RPAREN: s = "RPAREN"; break; 235 default: s = "CSET"; break; 236 } 237 fprintf(stderr, "%s", s); 238 } 239} 240#endif /* DEBUG */ 241 242/* Stuff pertaining to charclasses. */ 243 244static int 245tstbit (int b, charclass c) 246{ 247 return c[b / INTBITS] & 1 << b % INTBITS; 248} 249 250static void 251setbit (int b, charclass c) 252{ 253 c[b / INTBITS] |= 1 << b % INTBITS; 254} 255 256static void 257clrbit (int b, charclass c) 258{ 259 c[b / INTBITS] &= ~(1 << b % INTBITS); 260} 261 262static void 263copyset (charclass src, charclass dst) 264{ 265 int i; 266 267 for (i = 0; i < CHARCLASS_INTS; ++i) 268 dst[i] = src[i]; 269} 270 271static void 272zeroset (charclass s) 273{ 274 int i; 275 276 for (i = 0; i < CHARCLASS_INTS; ++i) 277 s[i] = 0; 278} 279 280static void 281notset (charclass s) 282{ 283 int i; 284 285 for (i = 0; i < CHARCLASS_INTS; ++i) 286 s[i] = ~s[i]; 287} 288 289static int 290equal (charclass s1, charclass s2) 291{ 292 int i; 293 294 for (i = 0; i < CHARCLASS_INTS; ++i) 295 if (s1[i] != s2[i]) 296 return 0; 297 return 1; 298} 299 300/* A pointer to the current dfa is kept here during parsing. */ 301static struct dfa *dfa; 302 303/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 304static int 305charclass_index (charclass s) 306{ 307 int i; 308 309 for (i = 0; i < dfa->cindex; ++i) 310 if (equal(s, dfa->charclasses[i])) 311 return i; 312 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 313 ++dfa->cindex; 314 copyset(s, dfa->charclasses[i]); 315 return i; 316} 317 318/* Syntax bits controlling the behavior of the lexical analyzer. */ 319static reg_syntax_t syntax_bits, syntax_bits_set; 320 321/* Flag for case-folding letters into sets. */ 322static int case_fold; 323 324/* End-of-line byte in data. */ 325static unsigned char eolbyte; 326 327/* Entry point to set syntax options. */ 328void 329dfasyntax (reg_syntax_t bits, int fold, int eol) 330{ 331 syntax_bits_set = 1; 332 syntax_bits = bits; 333 case_fold = fold; 334 eolbyte = eol; 335} 336 337/* Lexical analyzer. All the dross that deals with the obnoxious 338 GNU Regex syntax bits is located here. The poor, suffering 339 reader is referred to the GNU Regex documentation for the 340 meaning of the @#%!@#%^!@ syntax bits. */ 341 342static char *lexstart; /* Pointer to beginning of input string. */ 343static char *lexptr; /* Pointer to next input character. */ 344static int lexleft; /* Number of characters remaining. */ 345static token lasttok; /* Previous token returned; initially END. */ 346static int laststart; /* True if we're separated from beginning or (, | 347 only by zero-width characters. */ 348static int parens; /* Count of outstanding left parens. */ 349static int minrep, maxrep; /* Repeat counts for {m,n}. */ 350 351/* Note that characters become unsigned here. */ 352#define FETCH(c, eoferr) \ 353 { \ 354 if (! lexleft) \ 355 { \ 356 if (eoferr != 0) \ 357 dfaerror (eoferr); \ 358 else \ 359 return lasttok = END; \ 360 } \ 361 (c) = (unsigned char) *lexptr++; \ 362 --lexleft; \ 363 } 364 365#ifdef __STDC__ 366#define FUNC(F, P) static int F(int c) { return P(c); } 367#else 368#define FUNC(F, P) static int F(c) int c; { return P(c); } 369#endif 370 371FUNC(is_alpha, ISALPHA) 372FUNC(is_upper, ISUPPER) 373FUNC(is_lower, ISLOWER) 374FUNC(is_digit, ISDIGIT) 375FUNC(is_xdigit, ISXDIGIT) 376FUNC(is_space, ISSPACE) 377FUNC(is_punct, ISPUNCT) 378FUNC(is_alnum, ISALNUM) 379FUNC(is_print, ISPRINT) 380FUNC(is_graph, ISGRAPH) 381FUNC(is_cntrl, ISCNTRL) 382 383static int 384is_blank (int c) 385{ 386 return (c == ' ' || c == '\t'); 387} 388 389/* The following list maps the names of the Posix named character classes 390 to predicate functions that determine whether a given character is in 391 the class. The leading [ has already been eaten by the lexical analyzer. */ 392static struct { 393 const char *name; 394 int (*pred) PARAMS ((int)); 395} prednames[] = { 396 { ":alpha:]", is_alpha }, 397 { ":upper:]", is_upper }, 398 { ":lower:]", is_lower }, 399 { ":digit:]", is_digit }, 400 { ":xdigit:]", is_xdigit }, 401 { ":space:]", is_space }, 402 { ":punct:]", is_punct }, 403 { ":alnum:]", is_alnum }, 404 { ":print:]", is_print }, 405 { ":graph:]", is_graph }, 406 { ":cntrl:]", is_cntrl }, 407 { ":blank:]", is_blank }, 408 { 0 } 409}; 410 411/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 412#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 413 414static int 415looking_at (char const *s) 416{ 417 size_t len; 418 419 len = strlen(s); 420 if (lexleft < len) 421 return 0; 422 return strncmp(s, lexptr, len) == 0; 423} 424 425static token 426lex (void) 427{ 428 token c, c1, c2; 429 int backslash = 0, invert; 430 charclass ccl; 431 int i; 432 char lo[2]; 433 char hi[2]; 434 435 /* Basic plan: We fetch a character. If it's a backslash, 436 we set the backslash flag and go through the loop again. 437 On the plus side, this avoids having a duplicate of the 438 main switch inside the backslash case. On the minus side, 439 it means that just about every case begins with 440 "if (backslash) ...". */ 441 for (i = 0; i < 2; ++i) 442 { 443 FETCH(c, 0); 444 switch (c) 445 { 446 case '\\': 447 if (backslash) 448 goto normal_char; 449 if (lexleft == 0) 450 dfaerror(_("Unfinished \\ escape")); 451 backslash = 1; 452 break; 453 454 case '^': 455 if (backslash) 456 goto normal_char; 457 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 458 || lasttok == END 459 || lasttok == LPAREN 460 || lasttok == OR) 461 return lasttok = BEGLINE; 462 goto normal_char; 463 464 case '$': 465 if (backslash) 466 goto normal_char; 467 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 468 || lexleft == 0 469 || (syntax_bits & RE_NO_BK_PARENS 470 ? lexleft > 0 && *lexptr == ')' 471 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 472 || (syntax_bits & RE_NO_BK_VBAR 473 ? lexleft > 0 && *lexptr == '|' 474 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 475 || ((syntax_bits & RE_NEWLINE_ALT) 476 && lexleft > 0 && *lexptr == '\n')) 477 return lasttok = ENDLINE; 478 goto normal_char; 479 480 case '1': 481 case '2': 482 case '3': 483 case '4': 484 case '5': 485 case '6': 486 case '7': 487 case '8': 488 case '9': 489 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 490 { 491 laststart = 0; 492 return lasttok = BACKREF; 493 } 494 goto normal_char; 495 496 case '`': 497 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 498 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 499 goto normal_char; 500 501 case '\'': 502 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 503 return lasttok = ENDLINE; /* FIXME: should be end of string */ 504 goto normal_char; 505 506 case '<': 507 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 508 return lasttok = BEGWORD; 509 goto normal_char; 510 511 case '>': 512 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 513 return lasttok = ENDWORD; 514 goto normal_char; 515 516 case 'b': 517 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 518 return lasttok = LIMWORD; 519 goto normal_char; 520 521 case 'B': 522 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 523 return lasttok = NOTLIMWORD; 524 goto normal_char; 525 526 case '?': 527 if (syntax_bits & RE_LIMITED_OPS) 528 goto normal_char; 529 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 530 goto normal_char; 531 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 532 goto normal_char; 533 return lasttok = QMARK; 534 535 case '*': 536 if (backslash) 537 goto normal_char; 538 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 539 goto normal_char; 540 return lasttok = STAR; 541 542 case '+': 543 if (syntax_bits & RE_LIMITED_OPS) 544 goto normal_char; 545 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 546 goto normal_char; 547 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 548 goto normal_char; 549 return lasttok = PLUS; 550 551 case '{': 552 if (!(syntax_bits & RE_INTERVALS)) 553 goto normal_char; 554 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 555 goto normal_char; 556 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 557 goto normal_char; 558 559 if (syntax_bits & RE_NO_BK_BRACES) 560 { 561 /* Scan ahead for a valid interval; if it's not valid, 562 treat it as a literal '{'. */ 563 int lo = -1, hi = -1; 564 char const *p = lexptr; 565 char const *lim = p + lexleft; 566 for (; p != lim && ISASCIIDIGIT (*p); p++) 567 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 568 if (p != lim && *p == ',') 569 while (++p != lim && ISASCIIDIGIT (*p)) 570 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 571 else 572 hi = lo; 573 if (p == lim || *p != '}' 574 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 575 goto normal_char; 576 } 577 578 minrep = 0; 579 /* Cases: 580 {M} - exact count 581 {M,} - minimum count, maximum is infinity 582 {M,N} - M through N */ 583 FETCH(c, _("unfinished repeat count")); 584 if (ISASCIIDIGIT (c)) 585 { 586 minrep = c - '0'; 587 for (;;) 588 { 589 FETCH(c, _("unfinished repeat count")); 590 if (! ISASCIIDIGIT (c)) 591 break; 592 minrep = 10 * minrep + c - '0'; 593 } 594 } 595 else 596 dfaerror(_("malformed repeat count")); 597 if (c == ',') 598 { 599 FETCH (c, _("unfinished repeat count")); 600 if (! ISASCIIDIGIT (c)) 601 maxrep = -1; 602 else 603 { 604 maxrep = c - '0'; 605 for (;;) 606 { 607 FETCH (c, _("unfinished repeat count")); 608 if (! ISASCIIDIGIT (c)) 609 break; 610 maxrep = 10 * maxrep + c - '0'; 611 } 612 if (0 <= maxrep && maxrep < minrep) 613 dfaerror (_("malformed repeat count")); 614 } 615 } 616 else 617 maxrep = minrep; 618 if (!(syntax_bits & RE_NO_BK_BRACES)) 619 { 620 if (c != '\\') 621 dfaerror(_("malformed repeat count")); 622 FETCH(c, _("unfinished repeat count")); 623 } 624 if (c != '}') 625 dfaerror(_("malformed repeat count")); 626 laststart = 0; 627 return lasttok = REPMN; 628 629 case '|': 630 if (syntax_bits & RE_LIMITED_OPS) 631 goto normal_char; 632 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 633 goto normal_char; 634 laststart = 1; 635 return lasttok = OR; 636 637 case '\n': 638 if (syntax_bits & RE_LIMITED_OPS 639 || backslash 640 || !(syntax_bits & RE_NEWLINE_ALT)) 641 goto normal_char; 642 laststart = 1; 643 return lasttok = OR; 644 645 case '(': 646 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 647 goto normal_char; 648 ++parens; 649 laststart = 1; 650 return lasttok = LPAREN; 651 652 case ')': 653 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 654 goto normal_char; 655 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 656 goto normal_char; 657 --parens; 658 laststart = 0; 659 return lasttok = RPAREN; 660 661 case '.': 662 if (backslash) 663 goto normal_char; 664 zeroset(ccl); 665 notset(ccl); 666 if (!(syntax_bits & RE_DOT_NEWLINE)) 667 clrbit(eolbyte, ccl); 668 if (syntax_bits & RE_DOT_NOT_NULL) 669 clrbit('\0', ccl); 670 laststart = 0; 671 return lasttok = CSET + charclass_index(ccl); 672 673 case 'w': 674 case 'W': 675 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 676 goto normal_char; 677 zeroset(ccl); 678 for (c2 = 0; c2 < NOTCHAR; ++c2) 679 if (IS_WORD_CONSTITUENT(c2)) 680 setbit(c2, ccl); 681 if (c == 'W') 682 notset(ccl); 683 laststart = 0; 684 return lasttok = CSET + charclass_index(ccl); 685 686 case '[': 687 if (backslash) 688 goto normal_char; 689 zeroset(ccl); 690 FETCH(c, _("Unbalanced [")); 691 if (c == '^') 692 { 693 FETCH(c, _("Unbalanced [")); 694 invert = 1; 695 } 696 else 697 invert = 0; 698 do 699 { 700 /* Nobody ever said this had to be fast. :-) 701 Note that if we're looking at some other [:...:] 702 construct, we just treat it as a bunch of ordinary 703 characters. We can do this because we assume 704 regex has checked for syntax errors before 705 dfa is ever called. */ 706 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 707 for (c1 = 0; prednames[c1].name; ++c1) 708 if (looking_at(prednames[c1].name)) 709 { 710 int (*pred)() = prednames[c1].pred; 711 if (case_fold 712 && (pred == is_upper || pred == is_lower)) 713 pred = is_alpha; 714 715 for (c2 = 0; c2 < NOTCHAR; ++c2) 716 if ((*pred)(c2)) 717 setbit(c2, ccl); 718 lexptr += strlen(prednames[c1].name); 719 lexleft -= strlen(prednames[c1].name); 720 FETCH(c1, _("Unbalanced [")); 721 goto skip; 722 } 723 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 724 FETCH(c, _("Unbalanced [")); 725 FETCH(c1, _("Unbalanced [")); 726 if (c1 == '-') 727 { 728 FETCH(c2, _("Unbalanced [")); 729 if (c2 == ']') 730 { 731 /* In the case [x-], the - is an ordinary hyphen, 732 which is left in c1, the lookahead character. */ 733 --lexptr; 734 ++lexleft; 735 c2 = c; 736 } 737 else 738 { 739 if (c2 == '\\' 740 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 741 FETCH(c2, _("Unbalanced [")); 742 FETCH(c1, _("Unbalanced [")); 743 } 744 } 745 else 746 c2 = c; 747 748 lo[0] = c; lo[1] = '\0'; 749 hi[0] = c2; hi[1] = '\0'; 750 for (c = 0; c < NOTCHAR; c++) 751 { 752 char ch[2]; 753 ch[0] = c; ch[1] = '\0'; 754 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0) 755 { 756 setbit (c, ccl); 757 if (case_fold) 758 { 759 if (ISUPPER (c)) 760 setbit (tolower (c), ccl); 761 else if (ISLOWER (c)) 762 setbit (toupper (c), ccl); 763 } 764 } 765 } 766 767 skip: 768 ; 769 } 770 while ((c = c1) != ']'); 771 if (invert) 772 { 773 notset(ccl); 774 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 775 clrbit(eolbyte, ccl); 776 } 777 laststart = 0; 778 return lasttok = CSET + charclass_index(ccl); 779 780 default: 781 normal_char: 782 laststart = 0; 783 if (case_fold && ISALPHA(c)) 784 { 785 zeroset(ccl); 786 setbit(c, ccl); 787 if (isupper(c)) 788 setbit(tolower(c), ccl); 789 else 790 setbit(toupper(c), ccl); 791 return lasttok = CSET + charclass_index(ccl); 792 } 793 return c; 794 } 795 } 796 797 /* The above loop should consume at most a backslash 798 and some other character. */ 799 abort(); 800 return END; /* keeps pedantic compilers happy. */ 801} 802 803/* Recursive descent parser for regular expressions. */ 804 805static token tok; /* Lookahead token. */ 806static int depth; /* Current depth of a hypothetical stack 807 holding deferred productions. This is 808 used to determine the depth that will be 809 required of the real stack later on in 810 dfaanalyze(). */ 811 812/* Add the given token to the parse tree, maintaining the depth count and 813 updating the maximum depth if necessary. */ 814static void 815addtok (token t) 816{ 817 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 818 dfa->tokens[dfa->tindex++] = t; 819 820 switch (t) 821 { 822 case QMARK: 823 case STAR: 824 case PLUS: 825 break; 826 827 case CAT: 828 case OR: 829 case ORTOP: 830 --depth; 831 break; 832 833 default: 834 ++dfa->nleaves; 835 case EMPTY: 836 ++depth; 837 break; 838 } 839 if (depth > dfa->depth) 840 dfa->depth = depth; 841} 842 843/* The grammar understood by the parser is as follows. 844 845 regexp: 846 regexp OR branch 847 branch 848 849 branch: 850 branch closure 851 closure 852 853 closure: 854 closure QMARK 855 closure STAR 856 closure PLUS 857 atom 858 859 atom: 860 <normal character> 861 CSET 862 BACKREF 863 BEGLINE 864 ENDLINE 865 BEGWORD 866 ENDWORD 867 LIMWORD 868 NOTLIMWORD 869 <empty> 870 871 The parser builds a parse tree in postfix form in an array of tokens. */ 872 873static void 874atom (void) 875{ 876 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 877 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 878 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 879 { 880 addtok(tok); 881 tok = lex(); 882 } 883 else if (tok == LPAREN) 884 { 885 tok = lex(); 886 regexp(0); 887 if (tok != RPAREN) 888 dfaerror(_("Unbalanced (")); 889 tok = lex(); 890 } 891 else 892 addtok(EMPTY); 893} 894 895/* Return the number of tokens in the given subexpression. */ 896static int 897nsubtoks (int tindex) 898{ 899 int ntoks1; 900 901 switch (dfa->tokens[tindex - 1]) 902 { 903 default: 904 return 1; 905 case QMARK: 906 case STAR: 907 case PLUS: 908 return 1 + nsubtoks(tindex - 1); 909 case CAT: 910 case OR: 911 case ORTOP: 912 ntoks1 = nsubtoks(tindex - 1); 913 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 914 } 915} 916 917/* Copy the given subexpression to the top of the tree. */ 918static void 919copytoks (int tindex, int ntokens) 920{ 921 int i; 922 923 for (i = 0; i < ntokens; ++i) 924 addtok(dfa->tokens[tindex + i]); 925} 926 927static void 928closure (void) 929{ 930 int tindex, ntokens, i; 931 932 atom(); 933 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 934 if (tok == REPMN) 935 { 936 ntokens = nsubtoks(dfa->tindex); 937 tindex = dfa->tindex - ntokens; 938 if (maxrep < 0) 939 addtok(PLUS); 940 if (minrep == 0) 941 addtok(QMARK); 942 for (i = 1; i < minrep; ++i) 943 { 944 copytoks(tindex, ntokens); 945 addtok(CAT); 946 } 947 for (; i < maxrep; ++i) 948 { 949 copytoks(tindex, ntokens); 950 addtok(QMARK); 951 addtok(CAT); 952 } 953 tok = lex(); 954 } 955 else 956 { 957 addtok(tok); 958 tok = lex(); 959 } 960} 961 962static void 963branch (void) 964{ 965 closure(); 966 while (tok != RPAREN && tok != OR && tok >= 0) 967 { 968 closure(); 969 addtok(CAT); 970 } 971} 972 973static void 974regexp (int toplevel) 975{ 976 branch(); 977 while (tok == OR) 978 { 979 tok = lex(); 980 branch(); 981 if (toplevel) 982 addtok(ORTOP); 983 else 984 addtok(OR); 985 } 986} 987 988/* Main entry point for the parser. S is a string to be parsed, len is the 989 length of the string, so s can include NUL characters. D is a pointer to 990 the struct dfa to parse into. */ 991void 992dfaparse (char *s, size_t len, struct dfa *d) 993{ 994 dfa = d; 995 lexstart = lexptr = s; 996 lexleft = len; 997 lasttok = END; 998 laststart = 1; 999 parens = 0; 1000 1001 if (! syntax_bits_set) 1002 dfaerror(_("No syntax specified")); 1003 1004 tok = lex(); 1005 depth = d->depth; 1006 1007 regexp(1); 1008 1009 if (tok != END) 1010 dfaerror(_("Unbalanced )")); 1011 1012 addtok(END - d->nregexps); 1013 addtok(CAT); 1014 1015 if (d->nregexps) 1016 addtok(ORTOP); 1017 1018 ++d->nregexps; 1019} 1020 1021/* Some primitives for operating on sets of positions. */ 1022 1023/* Copy one set to another; the destination must be large enough. */ 1024static void 1025copy (position_set *src, position_set *dst) 1026{ 1027 int i; 1028 1029 for (i = 0; i < src->nelem; ++i) 1030 dst->elems[i] = src->elems[i]; 1031 dst->nelem = src->nelem; 1032} 1033 1034/* Insert a position in a set. Position sets are maintained in sorted 1035 order according to index. If position already exists in the set with 1036 the same index then their constraints are logically or'd together. 1037 S->elems must point to an array large enough to hold the resulting set. */ 1038static void 1039insert (position p, position_set *s) 1040{ 1041 int i; 1042 position t1, t2; 1043 1044 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1045 continue; 1046 if (i < s->nelem && p.index == s->elems[i].index) 1047 s->elems[i].constraint |= p.constraint; 1048 else 1049 { 1050 t1 = p; 1051 ++s->nelem; 1052 while (i < s->nelem) 1053 { 1054 t2 = s->elems[i]; 1055 s->elems[i++] = t1; 1056 t1 = t2; 1057 } 1058 } 1059} 1060 1061/* Merge two sets of positions into a third. The result is exactly as if 1062 the positions of both sets were inserted into an initially empty set. */ 1063static void 1064merge (position_set *s1, position_set *s2, position_set *m) 1065{ 1066 int i = 0, j = 0; 1067 1068 m->nelem = 0; 1069 while (i < s1->nelem && j < s2->nelem) 1070 if (s1->elems[i].index > s2->elems[j].index) 1071 m->elems[m->nelem++] = s1->elems[i++]; 1072 else if (s1->elems[i].index < s2->elems[j].index) 1073 m->elems[m->nelem++] = s2->elems[j++]; 1074 else 1075 { 1076 m->elems[m->nelem] = s1->elems[i++]; 1077 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1078 } 1079 while (i < s1->nelem) 1080 m->elems[m->nelem++] = s1->elems[i++]; 1081 while (j < s2->nelem) 1082 m->elems[m->nelem++] = s2->elems[j++]; 1083} 1084 1085/* Delete a position from a set. */ 1086static void 1087delete (position p, position_set *s) 1088{ 1089 int i; 1090 1091 for (i = 0; i < s->nelem; ++i) 1092 if (p.index == s->elems[i].index) 1093 break; 1094 if (i < s->nelem) 1095 for (--s->nelem; i < s->nelem; ++i) 1096 s->elems[i] = s->elems[i + 1]; 1097} 1098 1099/* Find the index of the state corresponding to the given position set with 1100 the given preceding context, or create a new state if there is no such 1101 state. Newline and letter tell whether we got here on a newline or 1102 letter, respectively. */ 1103static int 1104state_index (struct dfa *d, position_set *s, int newline, int letter) 1105{ 1106 int hash = 0; 1107 int constraint; 1108 int i, j; 1109 1110 newline = newline ? 1 : 0; 1111 letter = letter ? 1 : 0; 1112 1113 for (i = 0; i < s->nelem; ++i) 1114 hash ^= s->elems[i].index + s->elems[i].constraint; 1115 1116 /* Try to find a state that exactly matches the proposed one. */ 1117 for (i = 0; i < d->sindex; ++i) 1118 { 1119 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1120 || newline != d->states[i].newline || letter != d->states[i].letter) 1121 continue; 1122 for (j = 0; j < s->nelem; ++j) 1123 if (s->elems[j].constraint 1124 != d->states[i].elems.elems[j].constraint 1125 || s->elems[j].index != d->states[i].elems.elems[j].index) 1126 break; 1127 if (j == s->nelem) 1128 return i; 1129 } 1130 1131 /* We'll have to create a new state. */ 1132 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1133 d->states[i].hash = hash; 1134 MALLOC(d->states[i].elems.elems, position, s->nelem); 1135 copy(s, &d->states[i].elems); 1136 d->states[i].newline = newline; 1137 d->states[i].letter = letter; 1138 d->states[i].backref = 0; 1139 d->states[i].constraint = 0; 1140 d->states[i].first_end = 0; 1141 for (j = 0; j < s->nelem; ++j) 1142 if (d->tokens[s->elems[j].index] < 0) 1143 { 1144 constraint = s->elems[j].constraint; 1145 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1146 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1147 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1148 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1149 d->states[i].constraint |= constraint; 1150 if (! d->states[i].first_end) 1151 d->states[i].first_end = d->tokens[s->elems[j].index]; 1152 } 1153 else if (d->tokens[s->elems[j].index] == BACKREF) 1154 { 1155 d->states[i].constraint = NO_CONSTRAINT; 1156 d->states[i].backref = 1; 1157 } 1158 1159 ++d->sindex; 1160 1161 return i; 1162} 1163 1164/* Find the epsilon closure of a set of positions. If any position of the set 1165 contains a symbol that matches the empty string in some context, replace 1166 that position with the elements of its follow labeled with an appropriate 1167 constraint. Repeat exhaustively until no funny positions are left. 1168 S->elems must be large enough to hold the result. */ 1169static void 1170epsclosure (position_set *s, struct dfa *d) 1171{ 1172 int i, j; 1173 int *visited; 1174 position p, old; 1175 1176 MALLOC(visited, int, d->tindex); 1177 for (i = 0; i < d->tindex; ++i) 1178 visited[i] = 0; 1179 1180 for (i = 0; i < s->nelem; ++i) 1181 if (d->tokens[s->elems[i].index] >= NOTCHAR 1182 && d->tokens[s->elems[i].index] != BACKREF 1183 && d->tokens[s->elems[i].index] < CSET) 1184 { 1185 old = s->elems[i]; 1186 p.constraint = old.constraint; 1187 delete(s->elems[i], s); 1188 if (visited[old.index]) 1189 { 1190 --i; 1191 continue; 1192 } 1193 visited[old.index] = 1; 1194 switch (d->tokens[old.index]) 1195 { 1196 case BEGLINE: 1197 p.constraint &= BEGLINE_CONSTRAINT; 1198 break; 1199 case ENDLINE: 1200 p.constraint &= ENDLINE_CONSTRAINT; 1201 break; 1202 case BEGWORD: 1203 p.constraint &= BEGWORD_CONSTRAINT; 1204 break; 1205 case ENDWORD: 1206 p.constraint &= ENDWORD_CONSTRAINT; 1207 break; 1208 case LIMWORD: 1209 p.constraint &= LIMWORD_CONSTRAINT; 1210 break; 1211 case NOTLIMWORD: 1212 p.constraint &= NOTLIMWORD_CONSTRAINT; 1213 break; 1214 default: 1215 break; 1216 } 1217 for (j = 0; j < d->follows[old.index].nelem; ++j) 1218 { 1219 p.index = d->follows[old.index].elems[j].index; 1220 insert(p, s); 1221 } 1222 /* Force rescan to start at the beginning. */ 1223 i = -1; 1224 } 1225 1226 free(visited); 1227} 1228 1229/* Perform bottom-up analysis on the parse tree, computing various functions. 1230 Note that at this point, we're pretending constructs like \< are real 1231 characters rather than constraints on what can follow them. 1232 1233 Nullable: A node is nullable if it is at the root of a regexp that can 1234 match the empty string. 1235 * EMPTY leaves are nullable. 1236 * No other leaf is nullable. 1237 * A QMARK or STAR node is nullable. 1238 * A PLUS node is nullable if its argument is nullable. 1239 * A CAT node is nullable if both its arguments are nullable. 1240 * An OR node is nullable if either argument is nullable. 1241 1242 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1243 that could correspond to the first character of a string matching the 1244 regexp rooted at the given node. 1245 * EMPTY leaves have empty firstpos. 1246 * The firstpos of a nonempty leaf is that leaf itself. 1247 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1248 argument. 1249 * The firstpos of a CAT node is the firstpos of the left argument, union 1250 the firstpos of the right if the left argument is nullable. 1251 * The firstpos of an OR node is the union of firstpos of each argument. 1252 1253 Lastpos: The lastpos of a node is the set of positions that could 1254 correspond to the last character of a string matching the regexp at 1255 the given node. 1256 * EMPTY leaves have empty lastpos. 1257 * The lastpos of a nonempty leaf is that leaf itself. 1258 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1259 argument. 1260 * The lastpos of a CAT node is the lastpos of its right argument, union 1261 the lastpos of the left if the right argument is nullable. 1262 * The lastpos of an OR node is the union of the lastpos of each argument. 1263 1264 Follow: The follow of a position is the set of positions that could 1265 correspond to the character following a character matching the node in 1266 a string matching the regexp. At this point we consider special symbols 1267 that match the empty string in some context to be just normal characters. 1268 Later, if we find that a special symbol is in a follow set, we will 1269 replace it with the elements of its follow, labeled with an appropriate 1270 constraint. 1271 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1272 the follow of every node in the lastpos. 1273 * Every node in the firstpos of the second argument of a CAT node is in 1274 the follow of every node in the lastpos of the first argument. 1275 1276 Because of the postfix representation of the parse tree, the depth-first 1277 analysis is conveniently done by a linear scan with the aid of a stack. 1278 Sets are stored as arrays of the elements, obeying a stack-like allocation 1279 scheme; the number of elements in each set deeper in the stack can be 1280 used to determine the address of a particular set's array. */ 1281void 1282dfaanalyze (struct dfa *d, int searchflag) 1283{ 1284 int *nullable; /* Nullable stack. */ 1285 int *nfirstpos; /* Element count stack for firstpos sets. */ 1286 position *firstpos; /* Array where firstpos elements are stored. */ 1287 int *nlastpos; /* Element count stack for lastpos sets. */ 1288 position *lastpos; /* Array where lastpos elements are stored. */ 1289 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1290 position_set tmp; /* Temporary set for merging sets. */ 1291 position_set merged; /* Result of merging sets. */ 1292 int wants_newline; /* True if some position wants newline info. */ 1293 int *o_nullable; 1294 int *o_nfirst, *o_nlast; 1295 position *o_firstpos, *o_lastpos; 1296 int i, j; 1297 position *pos; 1298 1299#ifdef DEBUG 1300 fprintf(stderr, "dfaanalyze:\n"); 1301 for (i = 0; i < d->tindex; ++i) 1302 { 1303 fprintf(stderr, " %d:", i); 1304 prtok(d->tokens[i]); 1305 } 1306 putc('\n', stderr); 1307#endif 1308 1309 d->searchflag = searchflag; 1310 1311 MALLOC(nullable, int, d->depth); 1312 o_nullable = nullable; 1313 MALLOC(nfirstpos, int, d->depth); 1314 o_nfirst = nfirstpos; 1315 MALLOC(firstpos, position, d->nleaves); 1316 o_firstpos = firstpos, firstpos += d->nleaves; 1317 MALLOC(nlastpos, int, d->depth); 1318 o_nlast = nlastpos; 1319 MALLOC(lastpos, position, d->nleaves); 1320 o_lastpos = lastpos, lastpos += d->nleaves; 1321 MALLOC(nalloc, int, d->tindex); 1322 for (i = 0; i < d->tindex; ++i) 1323 nalloc[i] = 0; 1324 MALLOC(merged.elems, position, d->nleaves); 1325 1326 CALLOC(d->follows, position_set, d->tindex); 1327 1328 for (i = 0; i < d->tindex; ++i) 1329#ifdef DEBUG 1330 { /* Nonsyntactic #ifdef goo... */ 1331#endif 1332 switch (d->tokens[i]) 1333 { 1334 case EMPTY: 1335 /* The empty set is nullable. */ 1336 *nullable++ = 1; 1337 1338 /* The firstpos and lastpos of the empty leaf are both empty. */ 1339 *nfirstpos++ = *nlastpos++ = 0; 1340 break; 1341 1342 case STAR: 1343 case PLUS: 1344 /* Every element in the firstpos of the argument is in the follow 1345 of every element in the lastpos. */ 1346 tmp.nelem = nfirstpos[-1]; 1347 tmp.elems = firstpos; 1348 pos = lastpos; 1349 for (j = 0; j < nlastpos[-1]; ++j) 1350 { 1351 merge(&tmp, &d->follows[pos[j].index], &merged); 1352 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1353 nalloc[pos[j].index], merged.nelem - 1); 1354 copy(&merged, &d->follows[pos[j].index]); 1355 } 1356 1357 case QMARK: 1358 /* A QMARK or STAR node is automatically nullable. */ 1359 if (d->tokens[i] != PLUS) 1360 nullable[-1] = 1; 1361 break; 1362 1363 case CAT: 1364 /* Every element in the firstpos of the second argument is in the 1365 follow of every element in the lastpos of the first argument. */ 1366 tmp.nelem = nfirstpos[-1]; 1367 tmp.elems = firstpos; 1368 pos = lastpos + nlastpos[-1]; 1369 for (j = 0; j < nlastpos[-2]; ++j) 1370 { 1371 merge(&tmp, &d->follows[pos[j].index], &merged); 1372 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1373 nalloc[pos[j].index], merged.nelem - 1); 1374 copy(&merged, &d->follows[pos[j].index]); 1375 } 1376 1377 /* The firstpos of a CAT node is the firstpos of the first argument, 1378 union that of the second argument if the first is nullable. */ 1379 if (nullable[-2]) 1380 nfirstpos[-2] += nfirstpos[-1]; 1381 else 1382 firstpos += nfirstpos[-1]; 1383 --nfirstpos; 1384 1385 /* The lastpos of a CAT node is the lastpos of the second argument, 1386 union that of the first argument if the second is nullable. */ 1387 if (nullable[-1]) 1388 nlastpos[-2] += nlastpos[-1]; 1389 else 1390 { 1391 pos = lastpos + nlastpos[-2]; 1392 for (j = nlastpos[-1] - 1; j >= 0; --j) 1393 pos[j] = lastpos[j]; 1394 lastpos += nlastpos[-2]; 1395 nlastpos[-2] = nlastpos[-1]; 1396 } 1397 --nlastpos; 1398 1399 /* A CAT node is nullable if both arguments are nullable. */ 1400 nullable[-2] = nullable[-1] && nullable[-2]; 1401 --nullable; 1402 break; 1403 1404 case OR: 1405 case ORTOP: 1406 /* The firstpos is the union of the firstpos of each argument. */ 1407 nfirstpos[-2] += nfirstpos[-1]; 1408 --nfirstpos; 1409 1410 /* The lastpos is the union of the lastpos of each argument. */ 1411 nlastpos[-2] += nlastpos[-1]; 1412 --nlastpos; 1413 1414 /* An OR node is nullable if either argument is nullable. */ 1415 nullable[-2] = nullable[-1] || nullable[-2]; 1416 --nullable; 1417 break; 1418 1419 default: 1420 /* Anything else is a nonempty position. (Note that special 1421 constructs like \< are treated as nonempty strings here; 1422 an "epsilon closure" effectively makes them nullable later. 1423 Backreferences have to get a real position so we can detect 1424 transitions on them later. But they are nullable. */ 1425 *nullable++ = d->tokens[i] == BACKREF; 1426 1427 /* This position is in its own firstpos and lastpos. */ 1428 *nfirstpos++ = *nlastpos++ = 1; 1429 --firstpos, --lastpos; 1430 firstpos->index = lastpos->index = i; 1431 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1432 1433 /* Allocate the follow set for this position. */ 1434 nalloc[i] = 1; 1435 MALLOC(d->follows[i].elems, position, nalloc[i]); 1436 break; 1437 } 1438#ifdef DEBUG 1439 /* ... balance the above nonsyntactic #ifdef goo... */ 1440 fprintf(stderr, "node %d:", i); 1441 prtok(d->tokens[i]); 1442 putc('\n', stderr); 1443 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1444 fprintf(stderr, " firstpos:"); 1445 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1446 { 1447 fprintf(stderr, " %d:", firstpos[j].index); 1448 prtok(d->tokens[firstpos[j].index]); 1449 } 1450 fprintf(stderr, "\n lastpos:"); 1451 for (j = nlastpos[-1] - 1; j >= 0; --j) 1452 { 1453 fprintf(stderr, " %d:", lastpos[j].index); 1454 prtok(d->tokens[lastpos[j].index]); 1455 } 1456 putc('\n', stderr); 1457 } 1458#endif 1459 1460 /* For each follow set that is the follow set of a real position, replace 1461 it with its epsilon closure. */ 1462 for (i = 0; i < d->tindex; ++i) 1463 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1464 || d->tokens[i] >= CSET) 1465 { 1466#ifdef DEBUG 1467 fprintf(stderr, "follows(%d:", i); 1468 prtok(d->tokens[i]); 1469 fprintf(stderr, "):"); 1470 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1471 { 1472 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1473 prtok(d->tokens[d->follows[i].elems[j].index]); 1474 } 1475 putc('\n', stderr); 1476#endif 1477 copy(&d->follows[i], &merged); 1478 epsclosure(&merged, d); 1479 if (d->follows[i].nelem < merged.nelem) 1480 REALLOC(d->follows[i].elems, position, merged.nelem); 1481 copy(&merged, &d->follows[i]); 1482 } 1483 1484 /* Get the epsilon closure of the firstpos of the regexp. The result will 1485 be the set of positions of state 0. */ 1486 merged.nelem = 0; 1487 for (i = 0; i < nfirstpos[-1]; ++i) 1488 insert(firstpos[i], &merged); 1489 epsclosure(&merged, d); 1490 1491 /* Check if any of the positions of state 0 will want newline context. */ 1492 wants_newline = 0; 1493 for (i = 0; i < merged.nelem; ++i) 1494 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1495 wants_newline = 1; 1496 1497 /* Build the initial state. */ 1498 d->salloc = 1; 1499 d->sindex = 0; 1500 MALLOC(d->states, dfa_state, d->salloc); 1501 state_index(d, &merged, wants_newline, 0); 1502 1503 free(o_nullable); 1504 free(o_nfirst); 1505 free(o_firstpos); 1506 free(o_nlast); 1507 free(o_lastpos); 1508 free(nalloc); 1509 free(merged.elems); 1510} 1511 1512/* Find, for each character, the transition out of state s of d, and store 1513 it in the appropriate slot of trans. 1514 1515 We divide the positions of s into groups (positions can appear in more 1516 than one group). Each group is labeled with a set of characters that 1517 every position in the group matches (taking into account, if necessary, 1518 preceding context information of s). For each group, find the union 1519 of the its elements' follows. This set is the set of positions of the 1520 new state. For each character in the group's label, set the transition 1521 on this character to be to a state corresponding to the set's positions, 1522 and its associated backward context information, if necessary. 1523 1524 If we are building a searching matcher, we include the positions of state 1525 0 in every state. 1526 1527 The collection of groups is constructed by building an equivalence-class 1528 partition of the positions of s. 1529 1530 For each position, find the set of characters C that it matches. Eliminate 1531 any characters from C that fail on grounds of backward context. 1532 1533 Search through the groups, looking for a group whose label L has nonempty 1534 intersection with C. If L - C is nonempty, create a new group labeled 1535 L - C and having the same positions as the current group, and set L to 1536 the intersection of L and C. Insert the position in this group, set 1537 C = C - L, and resume scanning. 1538 1539 If after comparing with every group there are characters remaining in C, 1540 create a new group labeled with the characters of C and insert this 1541 position in that group. */ 1542void 1543dfastate (int s, struct dfa *d, int trans[]) 1544{ 1545 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1546 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1547 int ngrps = 0; /* Number of groups actually used. */ 1548 position pos; /* Current position being considered. */ 1549 charclass matches; /* Set of matching characters. */ 1550 int matchesf; /* True if matches is nonempty. */ 1551 charclass intersect; /* Intersection with some label set. */ 1552 int intersectf; /* True if intersect is nonempty. */ 1553 charclass leftovers; /* Stuff in the label that didn't match. */ 1554 int leftoversf; /* True if leftovers is nonempty. */ 1555 static charclass letters; /* Set of characters considered letters. */ 1556 static charclass newline; /* Set of characters that aren't newline. */ 1557 position_set follows; /* Union of the follows of some group. */ 1558 position_set tmp; /* Temporary space for merging sets. */ 1559 int state; /* New state. */ 1560 int wants_newline; /* New state wants to know newline context. */ 1561 int state_newline; /* New state on a newline transition. */ 1562 int wants_letter; /* New state wants to know letter context. */ 1563 int state_letter; /* New state on a letter transition. */ 1564 static int initialized; /* Flag for static initialization. */ 1565 int i, j, k; 1566 1567 /* Initialize the set of letters, if necessary. */ 1568 if (! initialized) 1569 { 1570 initialized = 1; 1571 for (i = 0; i < NOTCHAR; ++i) 1572 if (IS_WORD_CONSTITUENT(i)) 1573 setbit(i, letters); 1574 setbit(eolbyte, newline); 1575 } 1576 1577 zeroset(matches); 1578 1579 for (i = 0; i < d->states[s].elems.nelem; ++i) 1580 { 1581 pos = d->states[s].elems.elems[i]; 1582 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1583 setbit(d->tokens[pos.index], matches); 1584 else if (d->tokens[pos.index] >= CSET) 1585 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1586 else 1587 continue; 1588 1589 /* Some characters may need to be eliminated from matches because 1590 they fail in the current context. */ 1591 if (pos.constraint != 0xFF) 1592 { 1593 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1594 d->states[s].newline, 1)) 1595 clrbit(eolbyte, matches); 1596 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1597 d->states[s].newline, 0)) 1598 for (j = 0; j < CHARCLASS_INTS; ++j) 1599 matches[j] &= newline[j]; 1600 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1601 d->states[s].letter, 1)) 1602 for (j = 0; j < CHARCLASS_INTS; ++j) 1603 matches[j] &= ~letters[j]; 1604 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1605 d->states[s].letter, 0)) 1606 for (j = 0; j < CHARCLASS_INTS; ++j) 1607 matches[j] &= letters[j]; 1608 1609 /* If there are no characters left, there's no point in going on. */ 1610 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 1611 continue; 1612 if (j == CHARCLASS_INTS) 1613 continue; 1614 } 1615 1616 for (j = 0; j < ngrps; ++j) 1617 { 1618 /* If matches contains a single character only, and the current 1619 group's label doesn't contain that character, go on to the 1620 next group. */ 1621 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 1622 && !tstbit(d->tokens[pos.index], labels[j])) 1623 continue; 1624 1625 /* Check if this group's label has a nonempty intersection with 1626 matches. */ 1627 intersectf = 0; 1628 for (k = 0; k < CHARCLASS_INTS; ++k) 1629 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 1630 if (! intersectf) 1631 continue; 1632 1633 /* It does; now find the set differences both ways. */ 1634 leftoversf = matchesf = 0; 1635 for (k = 0; k < CHARCLASS_INTS; ++k) 1636 { 1637 /* Even an optimizing compiler can't know this for sure. */ 1638 int match = matches[k], label = labels[j][k]; 1639 1640 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 1641 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 1642 } 1643 1644 /* If there were leftovers, create a new group labeled with them. */ 1645 if (leftoversf) 1646 { 1647 copyset(leftovers, labels[ngrps]); 1648 copyset(intersect, labels[j]); 1649 MALLOC(grps[ngrps].elems, position, d->nleaves); 1650 copy(&grps[j], &grps[ngrps]); 1651 ++ngrps; 1652 } 1653 1654 /* Put the position in the current group. Note that there is no 1655 reason to call insert() here. */ 1656 grps[j].elems[grps[j].nelem++] = pos; 1657 1658 /* If every character matching the current position has been 1659 accounted for, we're done. */ 1660 if (! matchesf) 1661 break; 1662 } 1663 1664 /* If we've passed the last group, and there are still characters 1665 unaccounted for, then we'll have to create a new group. */ 1666 if (j == ngrps) 1667 { 1668 copyset(matches, labels[ngrps]); 1669 zeroset(matches); 1670 MALLOC(grps[ngrps].elems, position, d->nleaves); 1671 grps[ngrps].nelem = 1; 1672 grps[ngrps].elems[0] = pos; 1673 ++ngrps; 1674 } 1675 } 1676 1677 MALLOC(follows.elems, position, d->nleaves); 1678 MALLOC(tmp.elems, position, d->nleaves); 1679 1680 /* If we are a searching matcher, the default transition is to a state 1681 containing the positions of state 0, otherwise the default transition 1682 is to fail miserably. */ 1683 if (d->searchflag) 1684 { 1685 wants_newline = 0; 1686 wants_letter = 0; 1687 for (i = 0; i < d->states[0].elems.nelem; ++i) 1688 { 1689 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1690 wants_newline = 1; 1691 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1692 wants_letter = 1; 1693 } 1694 copy(&d->states[0].elems, &follows); 1695 state = state_index(d, &follows, 0, 0); 1696 if (wants_newline) 1697 state_newline = state_index(d, &follows, 1, 0); 1698 else 1699 state_newline = state; 1700 if (wants_letter) 1701 state_letter = state_index(d, &follows, 0, 1); 1702 else 1703 state_letter = state; 1704 for (i = 0; i < NOTCHAR; ++i) 1705 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 1706 trans[eolbyte] = state_newline; 1707 } 1708 else 1709 for (i = 0; i < NOTCHAR; ++i) 1710 trans[i] = -1; 1711 1712 for (i = 0; i < ngrps; ++i) 1713 { 1714 follows.nelem = 0; 1715 1716 /* Find the union of the follows of the positions of the group. 1717 This is a hideously inefficient loop. Fix it someday. */ 1718 for (j = 0; j < grps[i].nelem; ++j) 1719 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 1720 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 1721 1722 /* If we are building a searching matcher, throw in the positions 1723 of state 0 as well. */ 1724 if (d->searchflag) 1725 for (j = 0; j < d->states[0].elems.nelem; ++j) 1726 insert(d->states[0].elems.elems[j], &follows); 1727 1728 /* Find out if the new state will want any context information. */ 1729 wants_newline = 0; 1730 if (tstbit(eolbyte, labels[i])) 1731 for (j = 0; j < follows.nelem; ++j) 1732 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 1733 wants_newline = 1; 1734 1735 wants_letter = 0; 1736 for (j = 0; j < CHARCLASS_INTS; ++j) 1737 if (labels[i][j] & letters[j]) 1738 break; 1739 if (j < CHARCLASS_INTS) 1740 for (j = 0; j < follows.nelem; ++j) 1741 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 1742 wants_letter = 1; 1743 1744 /* Find the state(s) corresponding to the union of the follows. */ 1745 state = state_index(d, &follows, 0, 0); 1746 if (wants_newline) 1747 state_newline = state_index(d, &follows, 1, 0); 1748 else 1749 state_newline = state; 1750 if (wants_letter) 1751 state_letter = state_index(d, &follows, 0, 1); 1752 else 1753 state_letter = state; 1754 1755 /* Set the transitions for each character in the current label. */ 1756 for (j = 0; j < CHARCLASS_INTS; ++j) 1757 for (k = 0; k < INTBITS; ++k) 1758 if (labels[i][j] & 1 << k) 1759 { 1760 int c = j * INTBITS + k; 1761 1762 if (c == eolbyte) 1763 trans[c] = state_newline; 1764 else if (IS_WORD_CONSTITUENT(c)) 1765 trans[c] = state_letter; 1766 else if (c < NOTCHAR) 1767 trans[c] = state; 1768 } 1769 } 1770 1771 for (i = 0; i < ngrps; ++i) 1772 free(grps[i].elems); 1773 free(follows.elems); 1774 free(tmp.elems); 1775} 1776 1777/* Some routines for manipulating a compiled dfa's transition tables. 1778 Each state may or may not have a transition table; if it does, and it 1779 is a non-accepting state, then d->trans[state] points to its table. 1780 If it is an accepting state then d->fails[state] points to its table. 1781 If it has no table at all, then d->trans[state] is NULL. 1782 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 1783 1784static void 1785build_state (int s, struct dfa *d) 1786{ 1787 int *trans; /* The new transition table. */ 1788 int i; 1789 1790 /* Set an upper limit on the number of transition tables that will ever 1791 exist at once. 1024 is arbitrary. The idea is that the frequently 1792 used transition tables will be quickly rebuilt, whereas the ones that 1793 were only needed once or twice will be cleared away. */ 1794 if (d->trcount >= 1024) 1795 { 1796 for (i = 0; i < d->tralloc; ++i) 1797 if (d->trans[i]) 1798 { 1799 free((ptr_t) d->trans[i]); 1800 d->trans[i] = NULL; 1801 } 1802 else if (d->fails[i]) 1803 { 1804 free((ptr_t) d->fails[i]); 1805 d->fails[i] = NULL; 1806 } 1807 d->trcount = 0; 1808 } 1809 1810 ++d->trcount; 1811 1812 /* Set up the success bits for this state. */ 1813 d->success[s] = 0; 1814 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 1815 s, *d)) 1816 d->success[s] |= 4; 1817 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 1818 s, *d)) 1819 d->success[s] |= 2; 1820 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 1821 s, *d)) 1822 d->success[s] |= 1; 1823 1824 MALLOC(trans, int, NOTCHAR); 1825 dfastate(s, d, trans); 1826 1827 /* Now go through the new transition table, and make sure that the trans 1828 and fail arrays are allocated large enough to hold a pointer for the 1829 largest state mentioned in the table. */ 1830 for (i = 0; i < NOTCHAR; ++i) 1831 if (trans[i] >= d->tralloc) 1832 { 1833 int oldalloc = d->tralloc; 1834 1835 while (trans[i] >= d->tralloc) 1836 d->tralloc *= 2; 1837 REALLOC(d->realtrans, int *, d->tralloc + 1); 1838 d->trans = d->realtrans + 1; 1839 REALLOC(d->fails, int *, d->tralloc); 1840 REALLOC(d->success, int, d->tralloc); 1841 REALLOC(d->newlines, int, d->tralloc); 1842 while (oldalloc < d->tralloc) 1843 { 1844 d->trans[oldalloc] = NULL; 1845 d->fails[oldalloc++] = NULL; 1846 } 1847 } 1848 1849 /* Keep the newline transition in a special place so we can use it as 1850 a sentinel. */ 1851 d->newlines[s] = trans[eolbyte]; 1852 trans[eolbyte] = -1; 1853 1854 if (ACCEPTING(s, *d)) 1855 d->fails[s] = trans; 1856 else 1857 d->trans[s] = trans; 1858} 1859 1860static void 1861build_state_zero (struct dfa *d) 1862{ 1863 d->tralloc = 1; 1864 d->trcount = 0; 1865 CALLOC(d->realtrans, int *, d->tralloc + 1); 1866 d->trans = d->realtrans + 1; 1867 CALLOC(d->fails, int *, d->tralloc); 1868 MALLOC(d->success, int, d->tralloc); 1869 MALLOC(d->newlines, int, d->tralloc); 1870 build_state(0, d); 1871} 1872 1873/* Search through a buffer looking for a match to the given struct dfa. 1874 Find the first occurrence of a string matching the regexp in the buffer, 1875 and the shortest possible version thereof. Return a pointer to the first 1876 character after the match, or NULL if none is found. Begin points to 1877 the beginning of the buffer, and end points to the first character after 1878 its end. We store a newline in *end to act as a sentinel, so end had 1879 better point somewhere valid. Newline is a flag indicating whether to 1880 allow newlines to be in the matching string. If count is non- 1881 NULL it points to a place we're supposed to increment every time we 1882 see a newline. Finally, if backref is non-NULL it points to a place 1883 where we're supposed to store a 1 if backreferencing happened and the 1884 match needs to be verified by a backtracking matcher. Otherwise 1885 we store a 0 in *backref. */ 1886char * 1887dfaexec (struct dfa *d, char *begin, char *end, 1888 int newline, int *count, int *backref) 1889{ 1890 register int s, s1, tmp; /* Current state. */ 1891 register unsigned char *p; /* Current input character. */ 1892 register int **trans, *t; /* Copy of d->trans so it can be optimized 1893 into a register. */ 1894 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 1895 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 1896 static int sbit_init; 1897 1898 if (! sbit_init) 1899 { 1900 int i; 1901 1902 sbit_init = 1; 1903 for (i = 0; i < NOTCHAR; ++i) 1904 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 1905 sbit[eol] = 4; 1906 } 1907 1908 if (! d->tralloc) 1909 build_state_zero(d); 1910 1911 s = s1 = 0; 1912 p = (unsigned char *) begin; 1913 trans = d->trans; 1914 *end = eol; 1915 1916 for (;;) 1917 { 1918 while ((t = trans[s]) != 0) { /* hand-optimized loop */ 1919 s1 = t[*p++]; 1920 if ((t = trans[s1]) == 0) { 1921 tmp = s ; s = s1 ; s1 = tmp ; /* swap */ 1922 break; 1923 } 1924 s = t[*p++]; 1925 } 1926 1927 if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) 1928 { 1929 if (d->success[s] & sbit[*p]) 1930 { 1931 if (backref) 1932 *backref = (d->states[s].backref != 0); 1933 return (char *) p; 1934 } 1935 1936 s1 = s; 1937 s = d->fails[s][*p++]; 1938 continue; 1939 } 1940 1941 /* If the previous character was a newline, count it. */ 1942 if (count && (char *) p <= end && p[-1] == eol) 1943 ++*count; 1944 1945 /* Check if we've run off the end of the buffer. */ 1946 if ((char *) p > end) 1947 return NULL; 1948 1949 if (s >= 0) 1950 { 1951 build_state(s, d); 1952 trans = d->trans; 1953 continue; 1954 } 1955 1956 if (p[-1] == eol && newline) 1957 { 1958 s = d->newlines[s1]; 1959 continue; 1960 } 1961 1962 s = 0; 1963 } 1964} 1965 1966/* Initialize the components of a dfa that the other routines don't 1967 initialize for themselves. */ 1968void 1969dfainit (struct dfa *d) 1970{ 1971 d->calloc = 1; 1972 MALLOC(d->charclasses, charclass, d->calloc); 1973 d->cindex = 0; 1974 1975 d->talloc = 1; 1976 MALLOC(d->tokens, token, d->talloc); 1977 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 1978 1979 d->searchflag = 0; 1980 d->tralloc = 0; 1981 1982 d->musts = 0; 1983} 1984 1985/* Parse and analyze a single string of the given length. */ 1986void 1987dfacomp (char *s, size_t len, struct dfa *d, int searchflag) 1988{ 1989 if (case_fold) /* dummy folding in service of dfamust() */ 1990 { 1991 char *lcopy; 1992 int i; 1993 1994 lcopy = malloc(len); 1995 if (!lcopy) 1996 dfaerror(_("out of memory")); 1997 1998 /* This is a kludge. */ 1999 case_fold = 0; 2000 for (i = 0; i < len; ++i) 2001 if (ISUPPER ((unsigned char) s[i])) 2002 lcopy[i] = tolower ((unsigned char) s[i]); 2003 else 2004 lcopy[i] = s[i]; 2005 2006 dfainit(d); 2007 dfaparse(lcopy, len, d); 2008 free(lcopy); 2009 dfamust(d); 2010 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2011 case_fold = 1; 2012 dfaparse(s, len, d); 2013 dfaanalyze(d, searchflag); 2014 } 2015 else 2016 { 2017 dfainit(d); 2018 dfaparse(s, len, d); 2019 dfamust(d); 2020 dfaanalyze(d, searchflag); 2021 } 2022} 2023 2024/* Free the storage held by the components of a dfa. */ 2025void 2026dfafree (struct dfa *d) 2027{ 2028 int i; 2029 struct dfamust *dm, *ndm; 2030 2031 free((ptr_t) d->charclasses); 2032 free((ptr_t) d->tokens); 2033 for (i = 0; i < d->sindex; ++i) 2034 free((ptr_t) d->states[i].elems.elems); 2035 free((ptr_t) d->states); 2036 for (i = 0; i < d->tindex; ++i) 2037 if (d->follows[i].elems) 2038 free((ptr_t) d->follows[i].elems); 2039 free((ptr_t) d->follows); 2040 for (i = 0; i < d->tralloc; ++i) 2041 if (d->trans[i]) 2042 free((ptr_t) d->trans[i]); 2043 else if (d->fails[i]) 2044 free((ptr_t) d->fails[i]); 2045 if (d->realtrans) free((ptr_t) d->realtrans); 2046 if (d->fails) free((ptr_t) d->fails); 2047 if (d->newlines) free((ptr_t) d->newlines); 2048 if (d->success) free((ptr_t) d->success); 2049 for (dm = d->musts; dm; dm = ndm) 2050 { 2051 ndm = dm->next; 2052 free(dm->must); 2053 free((ptr_t) dm); 2054 } 2055} 2056 2057/* Having found the postfix representation of the regular expression, 2058 try to find a long sequence of characters that must appear in any line 2059 containing the r.e. 2060 Finding a "longest" sequence is beyond the scope here; 2061 we take an easy way out and hope for the best. 2062 (Take "(ab|a)b"--please.) 2063 2064 We do a bottom-up calculation of sequences of characters that must appear 2065 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 2066 representation: 2067 sequences that must appear at the left of the match ("left") 2068 sequences that must appear at the right of the match ("right") 2069 lists of sequences that must appear somewhere in the match ("in") 2070 sequences that must constitute the match ("is") 2071 2072 When we get to the root of the tree, we use one of the longest of its 2073 calculated "in" sequences as our answer. The sequence we find is returned in 2074 d->must (where "d" is the single argument passed to "dfamust"); 2075 the length of the sequence is returned in d->mustn. 2076 2077 The sequences calculated for the various types of node (in pseudo ANSI c) 2078 are shown below. "p" is the operand of unary operators (and the left-hand 2079 operand of binary operators); "q" is the right-hand operand of binary 2080 operators. 2081 2082 "ZERO" means "a zero-length sequence" below. 2083 2084 Type left right is in 2085 ---- ---- ----- -- -- 2086 char c # c # c # c # c 2087 2088 CSET ZERO ZERO ZERO ZERO 2089 2090 STAR ZERO ZERO ZERO ZERO 2091 2092 QMARK ZERO ZERO ZERO ZERO 2093 2094 PLUS p->left p->right ZERO p->in 2095 2096 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 2097 p->left : q->right : q->is!=ZERO) ? q->in plus 2098 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 2099 ZERO 2100 2101 OR longest common longest common (do p->is and substrings common to 2102 leading trailing q->is have same p->in and q->in 2103 (sub)sequence (sub)sequence length and 2104 of p->left of p->right content) ? 2105 and q->left and q->right p->is : NULL 2106 2107 If there's anything else we recognize in the tree, all four sequences get set 2108 to zero-length sequences. If there's something we don't recognize in the tree, 2109 we just return a zero-length sequence. 2110 2111 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 2112 'aaa')? 2113 2114 And. . .is it here or someplace that we might ponder "optimizations" such as 2115 egrep 'psi|epsilon' -> egrep 'psi' 2116 egrep 'pepsi|epsilon' -> egrep 'epsi' 2117 (Yes, we now find "epsi" as a "string 2118 that must occur", but we might also 2119 simplify the *entire* r.e. being sought) 2120 grep '[c]' -> grep 'c' 2121 grep '(ab|a)b' -> grep 'ab' 2122 grep 'ab*' -> grep 'a' 2123 grep 'a*b' -> grep 'b' 2124 2125 There are several issues: 2126 2127 Is optimization easy (enough)? 2128 2129 Does optimization actually accomplish anything, 2130 or is the automaton you get from "psi|epsilon" (for example) 2131 the same as the one you get from "psi" (for example)? 2132 2133 Are optimizable r.e.'s likely to be used in real-life situations 2134 (something like 'ab*' is probably unlikely; something like is 2135 'psi|epsilon' is likelier)? */ 2136 2137static char * 2138icatalloc (char *old, char *new) 2139{ 2140 char *result; 2141 size_t oldsize, newsize; 2142 2143 newsize = (new == NULL) ? 0 : strlen(new); 2144 if (old == NULL) 2145 oldsize = 0; 2146 else if (newsize == 0) 2147 return old; 2148 else oldsize = strlen(old); 2149 if (old == NULL) 2150 result = (char *) malloc(newsize + 1); 2151 else 2152 result = (char *) realloc((void *) old, oldsize + newsize + 1); 2153 if (result != NULL && new != NULL) 2154 (void) strcpy(result + oldsize, new); 2155 return result; 2156} 2157 2158static char * 2159icpyalloc (char *string) 2160{ 2161 return icatalloc((char *) NULL, string); 2162} 2163 2164static char * 2165istrstr (char *lookin, char *lookfor) 2166{ 2167 char *cp; 2168 size_t len; 2169 2170 len = strlen(lookfor); 2171 for (cp = lookin; *cp != '\0'; ++cp) 2172 if (strncmp(cp, lookfor, len) == 0) 2173 return cp; 2174 return NULL; 2175} 2176 2177static void 2178ifree (char *cp) 2179{ 2180 if (cp != NULL) 2181 free(cp); 2182} 2183 2184static void 2185freelist (char **cpp) 2186{ 2187 int i; 2188 2189 if (cpp == NULL) 2190 return; 2191 for (i = 0; cpp[i] != NULL; ++i) 2192 { 2193 free(cpp[i]); 2194 cpp[i] = NULL; 2195 } 2196} 2197 2198static char ** 2199enlist (char **cpp, char *new, size_t len) 2200{ 2201 int i, j; 2202 2203 if (cpp == NULL) 2204 return NULL; 2205 if ((new = icpyalloc(new)) == NULL) 2206 { 2207 freelist(cpp); 2208 return NULL; 2209 } 2210 new[len] = '\0'; 2211 /* Is there already something in the list that's new (or longer)? */ 2212 for (i = 0; cpp[i] != NULL; ++i) 2213 if (istrstr(cpp[i], new) != NULL) 2214 { 2215 free(new); 2216 return cpp; 2217 } 2218 /* Eliminate any obsoleted strings. */ 2219 j = 0; 2220 while (cpp[j] != NULL) 2221 if (istrstr(new, cpp[j]) == NULL) 2222 ++j; 2223 else 2224 { 2225 free(cpp[j]); 2226 if (--i == j) 2227 break; 2228 cpp[j] = cpp[i]; 2229 cpp[i] = NULL; 2230 } 2231 /* Add the new string. */ 2232 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 2233 if (cpp == NULL) 2234 return NULL; 2235 cpp[i] = new; 2236 cpp[i + 1] = NULL; 2237 return cpp; 2238} 2239 2240/* Given pointers to two strings, return a pointer to an allocated 2241 list of their distinct common substrings. Return NULL if something 2242 seems wild. */ 2243static char ** 2244comsubs (char *left, char *right) 2245{ 2246 char **cpp; 2247 char *lcp; 2248 char *rcp; 2249 size_t i, len; 2250 2251 if (left == NULL || right == NULL) 2252 return NULL; 2253 cpp = (char **) malloc(sizeof *cpp); 2254 if (cpp == NULL) 2255 return NULL; 2256 cpp[0] = NULL; 2257 for (lcp = left; *lcp != '\0'; ++lcp) 2258 { 2259 len = 0; 2260 rcp = index(right, *lcp); 2261 while (rcp != NULL) 2262 { 2263 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 2264 continue; 2265 if (i > len) 2266 len = i; 2267 rcp = index(rcp + 1, *lcp); 2268 } 2269 if (len == 0) 2270 continue; 2271 if ((cpp = enlist(cpp, lcp, len)) == NULL) 2272 break; 2273 } 2274 return cpp; 2275} 2276 2277static char ** 2278addlists (char **old, char **new) 2279{ 2280 int i; 2281 2282 if (old == NULL || new == NULL) 2283 return NULL; 2284 for (i = 0; new[i] != NULL; ++i) 2285 { 2286 old = enlist(old, new[i], strlen(new[i])); 2287 if (old == NULL) 2288 break; 2289 } 2290 return old; 2291} 2292 2293/* Given two lists of substrings, return a new list giving substrings 2294 common to both. */ 2295static char ** 2296inboth (char **left, char **right) 2297{ 2298 char **both; 2299 char **temp; 2300 int lnum, rnum; 2301 2302 if (left == NULL || right == NULL) 2303 return NULL; 2304 both = (char **) malloc(sizeof *both); 2305 if (both == NULL) 2306 return NULL; 2307 both[0] = NULL; 2308 for (lnum = 0; left[lnum] != NULL; ++lnum) 2309 { 2310 for (rnum = 0; right[rnum] != NULL; ++rnum) 2311 { 2312 temp = comsubs(left[lnum], right[rnum]); 2313 if (temp == NULL) 2314 { 2315 freelist(both); 2316 return NULL; 2317 } 2318 both = addlists(both, temp); 2319 freelist(temp); 2320 free(temp); 2321 if (both == NULL) 2322 return NULL; 2323 } 2324 } 2325 return both; 2326} 2327 2328typedef struct 2329{ 2330 char **in; 2331 char *left; 2332 char *right; 2333 char *is; 2334} must; 2335 2336static void 2337resetmust (must *mp) 2338{ 2339 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 2340 freelist(mp->in); 2341} 2342 2343static void 2344dfamust (struct dfa *dfa) 2345{ 2346 must *musts; 2347 must *mp; 2348 char *result; 2349 int ri; 2350 int i; 2351 int exact; 2352 token t; 2353 static must must0; 2354 struct dfamust *dm; 2355 static char empty_string[] = ""; 2356 2357 result = empty_string; 2358 exact = 0; 2359 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 2360 if (musts == NULL) 2361 return; 2362 mp = musts; 2363 for (i = 0; i <= dfa->tindex; ++i) 2364 mp[i] = must0; 2365 for (i = 0; i <= dfa->tindex; ++i) 2366 { 2367 mp[i].in = (char **) malloc(sizeof *mp[i].in); 2368 mp[i].left = malloc(2); 2369 mp[i].right = malloc(2); 2370 mp[i].is = malloc(2); 2371 if (mp[i].in == NULL || mp[i].left == NULL || 2372 mp[i].right == NULL || mp[i].is == NULL) 2373 goto done; 2374 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 2375 mp[i].in[0] = NULL; 2376 } 2377#ifdef DEBUG 2378 fprintf(stderr, "dfamust:\n"); 2379 for (i = 0; i < dfa->tindex; ++i) 2380 { 2381 fprintf(stderr, " %d:", i); 2382 prtok(dfa->tokens[i]); 2383 } 2384 putc('\n', stderr); 2385#endif 2386 for (ri = 0; ri < dfa->tindex; ++ri) 2387 { 2388 switch (t = dfa->tokens[ri]) 2389 { 2390 case LPAREN: 2391 case RPAREN: 2392 goto done; /* "cannot happen" */ 2393 case EMPTY: 2394 case BEGLINE: 2395 case ENDLINE: 2396 case BEGWORD: 2397 case ENDWORD: 2398 case LIMWORD: 2399 case NOTLIMWORD: 2400 case BACKREF: 2401 resetmust(mp); 2402 break; 2403 case STAR: 2404 case QMARK: 2405 if (mp <= musts) 2406 goto done; /* "cannot happen" */ 2407 --mp; 2408 resetmust(mp); 2409 break; 2410 case OR: 2411 case ORTOP: 2412 if (mp < &musts[2]) 2413 goto done; /* "cannot happen" */ 2414 { 2415 char **new; 2416 must *lmp; 2417 must *rmp; 2418 int j, ln, rn, n; 2419 2420 rmp = --mp; 2421 lmp = --mp; 2422 /* Guaranteed to be. Unlikely, but. . . */ 2423 if (strcmp(lmp->is, rmp->is) != 0) 2424 lmp->is[0] = '\0'; 2425 /* Left side--easy */ 2426 i = 0; 2427 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 2428 ++i; 2429 lmp->left[i] = '\0'; 2430 /* Right side */ 2431 ln = strlen(lmp->right); 2432 rn = strlen(rmp->right); 2433 n = ln; 2434 if (n > rn) 2435 n = rn; 2436 for (i = 0; i < n; ++i) 2437 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 2438 break; 2439 for (j = 0; j < i; ++j) 2440 lmp->right[j] = lmp->right[(ln - i) + j]; 2441 lmp->right[j] = '\0'; 2442 new = inboth(lmp->in, rmp->in); 2443 if (new == NULL) 2444 goto done; 2445 freelist(lmp->in); 2446 free((char *) lmp->in); 2447 lmp->in = new; 2448 } 2449 break; 2450 case PLUS: 2451 if (mp <= musts) 2452 goto done; /* "cannot happen" */ 2453 --mp; 2454 mp->is[0] = '\0'; 2455 break; 2456 case END: 2457 if (mp != &musts[1]) 2458 goto done; /* "cannot happen" */ 2459 for (i = 0; musts[0].in[i] != NULL; ++i) 2460 if (strlen(musts[0].in[i]) > strlen(result)) 2461 result = musts[0].in[i]; 2462 if (strcmp(result, musts[0].is) == 0) 2463 exact = 1; 2464 goto done; 2465 case CAT: 2466 if (mp < &musts[2]) 2467 goto done; /* "cannot happen" */ 2468 { 2469 must *lmp; 2470 must *rmp; 2471 2472 rmp = --mp; 2473 lmp = --mp; 2474 /* In. Everything in left, plus everything in 2475 right, plus catenation of 2476 left's right and right's left. */ 2477 lmp->in = addlists(lmp->in, rmp->in); 2478 if (lmp->in == NULL) 2479 goto done; 2480 if (lmp->right[0] != '\0' && 2481 rmp->left[0] != '\0') 2482 { 2483 char *tp; 2484 2485 tp = icpyalloc(lmp->right); 2486 if (tp == NULL) 2487 goto done; 2488 tp = icatalloc(tp, rmp->left); 2489 if (tp == NULL) 2490 goto done; 2491 lmp->in = enlist(lmp->in, tp, 2492 strlen(tp)); 2493 free(tp); 2494 if (lmp->in == NULL) 2495 goto done; 2496 } 2497 /* Left-hand */ 2498 if (lmp->is[0] != '\0') 2499 { 2500 lmp->left = icatalloc(lmp->left, 2501 rmp->left); 2502 if (lmp->left == NULL) 2503 goto done; 2504 } 2505 /* Right-hand */ 2506 if (rmp->is[0] == '\0') 2507 lmp->right[0] = '\0'; 2508 lmp->right = icatalloc(lmp->right, rmp->right); 2509 if (lmp->right == NULL) 2510 goto done; 2511 /* Guaranteed to be */ 2512 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 2513 { 2514 lmp->is = icatalloc(lmp->is, rmp->is); 2515 if (lmp->is == NULL) 2516 goto done; 2517 } 2518 else 2519 lmp->is[0] = '\0'; 2520 } 2521 break; 2522 default: 2523 if (t < END) 2524 { 2525 /* "cannot happen" */ 2526 goto done; 2527 } 2528 else if (t == '\0') 2529 { 2530 /* not on *my* shift */ 2531 goto done; 2532 } 2533 else if (t >= CSET) 2534 { 2535 /* easy enough */ 2536 resetmust(mp); 2537 } 2538 else 2539 { 2540 /* plain character */ 2541 resetmust(mp); 2542 mp->is[0] = mp->left[0] = mp->right[0] = t; 2543 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 2544 mp->in = enlist(mp->in, mp->is, (size_t)1); 2545 if (mp->in == NULL) 2546 goto done; 2547 } 2548 break; 2549 } 2550#ifdef DEBUG 2551 fprintf(stderr, " node: %d:", ri); 2552 prtok(dfa->tokens[ri]); 2553 fprintf(stderr, "\n in:"); 2554 for (i = 0; mp->in[i]; ++i) 2555 fprintf(stderr, " \"%s\"", mp->in[i]); 2556 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 2557 fprintf(stderr, " left: \"%s\"\n", mp->left); 2558 fprintf(stderr, " right: \"%s\"\n", mp->right); 2559#endif 2560 ++mp; 2561 } 2562 done: 2563 if (strlen(result)) 2564 { 2565 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 2566 dm->exact = exact; 2567 dm->must = malloc(strlen(result) + 1); 2568 strcpy(dm->must, result); 2569 dm->next = dfa->musts; 2570 dfa->musts = dm; 2571 } 2572 mp = musts; 2573 for (i = 0; i <= dfa->tindex; ++i) 2574 { 2575 freelist(mp[i].in); 2576 ifree((char *) mp[i].in); 2577 ifree(mp[i].left); 2578 ifree(mp[i].right); 2579 ifree(mp[i].is); 2580 } 2581 free((char *) mp); 2582} 2583