dfa.c revision 56919
1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright (C) 1988, 1998 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 56919 2000-01-31 13:28:08Z ru $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#undef index 42#define index strchr 43#else 44#include <strings.h> 45#endif 46 47#ifndef DEBUG /* use the same approach as regex.c */ 48#undef assert 49#define assert(e) 50#endif /* DEBUG */ 51 52#ifndef isgraph 53#define isgraph(C) (isprint(C) && !isspace(C)) 54#endif 55 56#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 57#define ISALPHA(C) isalpha(C) 58#define ISUPPER(C) isupper(C) 59#define ISLOWER(C) islower(C) 60#define ISDIGIT(C) isdigit(C) 61#define ISXDIGIT(C) isxdigit(C) 62#define ISSPACE(C) isspace(C) 63#define ISPUNCT(C) ispunct(C) 64#define ISALNUM(C) isalnum(C) 65#define ISPRINT(C) isprint(C) 66#define ISGRAPH(C) isgraph(C) 67#define ISCNTRL(C) iscntrl(C) 68#else 69#define ISALPHA(C) (isascii(C) && isalpha(C)) 70#define ISUPPER(C) (isascii(C) && isupper(C)) 71#define ISLOWER(C) (isascii(C) && islower(C)) 72#define ISDIGIT(C) (isascii(C) && isdigit(C)) 73#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 74#define ISSPACE(C) (isascii(C) && isspace(C)) 75#define ISPUNCT(C) (isascii(C) && ispunct(C)) 76#define ISALNUM(C) (isascii(C) && isalnum(C)) 77#define ISPRINT(C) (isascii(C) && isprint(C)) 78#define ISGRAPH(C) (isascii(C) && isgraph(C)) 79#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 80#endif 81 82/* ISASCIIDIGIT differs from ISDIGIT, as follows: 83 - Its arg may be any int or unsigned int; it need not be an unsigned char. 84 - It's guaranteed to evaluate its argument exactly once. 85 - It's typically faster. 86 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 87 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 88 it's important to use the locale's definition of `digit' even when the 89 host does not conform to Posix. */ 90#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 91 92/* If we (don't) have I18N. */ 93/* glibc defines _ */ 94#ifndef _ 95# ifdef HAVE_LIBINTL_H 96# include <libintl.h> 97# ifndef _ 98# define _(Str) gettext (Str) 99# endif 100# else 101# define _(Str) (Str) 102# endif 103#endif 104 105#ifdef __FreeBSD__ 106#include <gnuregex.h> 107#else 108#include "regex.h" 109#endif 110#include "dfa.h" 111 112/* HPUX, define those as macros in sys/param.h */ 113#ifdef setbit 114# undef setbit 115#endif 116#ifdef clrbit 117# undef clrbit 118#endif 119 120static void dfamust PARAMS ((struct dfa *dfa)); 121 122static ptr_t xcalloc PARAMS ((size_t n, size_t s)); 123static ptr_t xmalloc PARAMS ((size_t n)); 124static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); 125#ifdef DEBUG 126static void prtok PARAMS ((token t)); 127#endif 128static int tstbit PARAMS ((int b, charclass c)); 129static void setbit PARAMS ((int b, charclass c)); 130static void clrbit PARAMS ((int b, charclass c)); 131static void copyset PARAMS ((charclass src, charclass dst)); 132static void zeroset PARAMS ((charclass s)); 133static void notset PARAMS ((charclass s)); 134static int equal PARAMS ((charclass s1, charclass s2)); 135static int charclass_index PARAMS ((charclass s)); 136static int looking_at PARAMS ((const char *s)); 137static token lex PARAMS ((void)); 138static void addtok PARAMS ((token t)); 139static void atom PARAMS ((void)); 140static int nsubtoks PARAMS ((int tindex)); 141static void copytoks PARAMS ((int tindex, int ntokens)); 142static void closure PARAMS ((void)); 143static void branch PARAMS ((void)); 144static void regexp PARAMS ((int toplevel)); 145static void copy PARAMS ((position_set *src, position_set *dst)); 146static void insert PARAMS ((position p, position_set *s)); 147static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); 148static void delete PARAMS ((position p, position_set *s)); 149static int state_index PARAMS ((struct dfa *d, position_set *s, 150 int newline, int letter)); 151static void build_state PARAMS ((int s, struct dfa *d)); 152static void build_state_zero PARAMS ((struct dfa *d)); 153static char *icatalloc PARAMS ((char *old, char *new)); 154static char *icpyalloc PARAMS ((char *string)); 155static char *istrstr PARAMS ((char *lookin, char *lookfor)); 156static void ifree PARAMS ((char *cp)); 157static void freelist PARAMS ((char **cpp)); 158static char **enlist PARAMS ((char **cpp, char *new, size_t len)); 159static char **comsubs PARAMS ((char *left, char *right)); 160static char **addlists PARAMS ((char **old, char **new)); 161static char **inboth PARAMS ((char **left, char **right)); 162 163static ptr_t 164xcalloc (size_t n, size_t s) 165{ 166 ptr_t r = calloc(n, s); 167 168 if (!r) 169 dfaerror(_("Memory exhausted")); 170 return r; 171} 172 173static ptr_t 174xmalloc (size_t n) 175{ 176 ptr_t r = malloc(n); 177 178 assert(n != 0); 179 if (!r) 180 dfaerror(_("Memory exhausted")); 181 return r; 182} 183 184static ptr_t 185xrealloc (ptr_t p, size_t n) 186{ 187 ptr_t r = realloc(p, n); 188 189 assert(n != 0); 190 if (!r) 191 dfaerror(_("Memory exhausted")); 192 return r; 193} 194 195#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 196#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 197#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 198 199/* Reallocate an array of type t if nalloc is too small for index. */ 200#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 201 if ((index) >= (nalloc)) \ 202 { \ 203 while ((index) >= (nalloc)) \ 204 (nalloc) *= 2; \ 205 REALLOC(p, t, nalloc); \ 206 } 207 208#ifdef DEBUG 209 210static void 211prtok (token t) 212{ 213 char *s; 214 215 if (t < 0) 216 fprintf(stderr, "END"); 217 else if (t < NOTCHAR) 218 fprintf(stderr, "%c", t); 219 else 220 { 221 switch (t) 222 { 223 case EMPTY: s = "EMPTY"; break; 224 case BACKREF: s = "BACKREF"; break; 225 case BEGLINE: s = "BEGLINE"; break; 226 case ENDLINE: s = "ENDLINE"; break; 227 case BEGWORD: s = "BEGWORD"; break; 228 case ENDWORD: s = "ENDWORD"; break; 229 case LIMWORD: s = "LIMWORD"; break; 230 case NOTLIMWORD: s = "NOTLIMWORD"; break; 231 case QMARK: s = "QMARK"; break; 232 case STAR: s = "STAR"; break; 233 case PLUS: s = "PLUS"; break; 234 case CAT: s = "CAT"; break; 235 case OR: s = "OR"; break; 236 case ORTOP: s = "ORTOP"; break; 237 case LPAREN: s = "LPAREN"; break; 238 case RPAREN: s = "RPAREN"; break; 239 default: s = "CSET"; break; 240 } 241 fprintf(stderr, "%s", s); 242 } 243} 244#endif /* DEBUG */ 245 246/* Stuff pertaining to charclasses. */ 247 248static int 249tstbit (int b, charclass c) 250{ 251 return c[b / INTBITS] & 1 << b % INTBITS; 252} 253 254static void 255setbit (int b, charclass c) 256{ 257 c[b / INTBITS] |= 1 << b % INTBITS; 258} 259 260static void 261clrbit (int b, charclass c) 262{ 263 c[b / INTBITS] &= ~(1 << b % INTBITS); 264} 265 266static void 267copyset (charclass src, charclass dst) 268{ 269 int i; 270 271 for (i = 0; i < CHARCLASS_INTS; ++i) 272 dst[i] = src[i]; 273} 274 275static void 276zeroset (charclass s) 277{ 278 int i; 279 280 for (i = 0; i < CHARCLASS_INTS; ++i) 281 s[i] = 0; 282} 283 284static void 285notset (charclass s) 286{ 287 int i; 288 289 for (i = 0; i < CHARCLASS_INTS; ++i) 290 s[i] = ~s[i]; 291} 292 293static int 294equal (charclass s1, charclass s2) 295{ 296 int i; 297 298 for (i = 0; i < CHARCLASS_INTS; ++i) 299 if (s1[i] != s2[i]) 300 return 0; 301 return 1; 302} 303 304/* A pointer to the current dfa is kept here during parsing. */ 305static struct dfa *dfa; 306 307/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 308static int 309charclass_index (charclass s) 310{ 311 int i; 312 313 for (i = 0; i < dfa->cindex; ++i) 314 if (equal(s, dfa->charclasses[i])) 315 return i; 316 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 317 ++dfa->cindex; 318 copyset(s, dfa->charclasses[i]); 319 return i; 320} 321 322/* Syntax bits controlling the behavior of the lexical analyzer. */ 323static reg_syntax_t syntax_bits, syntax_bits_set; 324 325/* Flag for case-folding letters into sets. */ 326static int case_fold; 327 328/* End-of-line byte in data. */ 329static unsigned char eolbyte; 330 331/* Entry point to set syntax options. */ 332void 333dfasyntax (reg_syntax_t bits, int fold, int eol) 334{ 335 syntax_bits_set = 1; 336 syntax_bits = bits; 337 case_fold = fold; 338 eolbyte = eol; 339} 340 341/* Lexical analyzer. All the dross that deals with the obnoxious 342 GNU Regex syntax bits is located here. The poor, suffering 343 reader is referred to the GNU Regex documentation for the 344 meaning of the @#%!@#%^!@ syntax bits. */ 345 346static char *lexstart; /* Pointer to beginning of input string. */ 347static char *lexptr; /* Pointer to next input character. */ 348static int lexleft; /* Number of characters remaining. */ 349static token lasttok; /* Previous token returned; initially END. */ 350static int laststart; /* True if we're separated from beginning or (, | 351 only by zero-width characters. */ 352static int parens; /* Count of outstanding left parens. */ 353static int minrep, maxrep; /* Repeat counts for {m,n}. */ 354 355/* Note that characters become unsigned here. */ 356#define FETCH(c, eoferr) \ 357 { \ 358 if (! lexleft) \ 359 { \ 360 if (eoferr != 0) \ 361 dfaerror (eoferr); \ 362 else \ 363 return lasttok = END; \ 364 } \ 365 (c) = (unsigned char) *lexptr++; \ 366 --lexleft; \ 367 } 368 369#ifdef __STDC__ 370#define FUNC(F, P) static int F(int c) { return P(c); } 371#else 372#define FUNC(F, P) static int F(c) int c; { return P(c); } 373#endif 374 375FUNC(is_alpha, ISALPHA) 376FUNC(is_upper, ISUPPER) 377FUNC(is_lower, ISLOWER) 378FUNC(is_digit, ISDIGIT) 379FUNC(is_xdigit, ISXDIGIT) 380FUNC(is_space, ISSPACE) 381FUNC(is_punct, ISPUNCT) 382FUNC(is_alnum, ISALNUM) 383FUNC(is_print, ISPRINT) 384FUNC(is_graph, ISGRAPH) 385FUNC(is_cntrl, ISCNTRL) 386 387static int 388is_blank (int c) 389{ 390 return (c == ' ' || c == '\t'); 391} 392 393/* The following list maps the names of the Posix named character classes 394 to predicate functions that determine whether a given character is in 395 the class. The leading [ has already been eaten by the lexical analyzer. */ 396static struct { 397 const char *name; 398 int (*pred) PARAMS ((int)); 399} prednames[] = { 400 { ":alpha:]", is_alpha }, 401 { ":upper:]", is_upper }, 402 { ":lower:]", is_lower }, 403 { ":digit:]", is_digit }, 404 { ":xdigit:]", is_xdigit }, 405 { ":space:]", is_space }, 406 { ":punct:]", is_punct }, 407 { ":alnum:]", is_alnum }, 408 { ":print:]", is_print }, 409 { ":graph:]", is_graph }, 410 { ":cntrl:]", is_cntrl }, 411 { ":blank:]", is_blank }, 412 { 0 } 413}; 414 415/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 416#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 417 418static int 419looking_at (char const *s) 420{ 421 size_t len; 422 423 len = strlen(s); 424 if (lexleft < len) 425 return 0; 426 return strncmp(s, lexptr, len) == 0; 427} 428 429static token 430lex (void) 431{ 432 token c, c1, c2; 433 int backslash = 0, invert; 434 charclass ccl; 435 int i; 436 char lo[2]; 437 char hi[2]; 438 439 /* Basic plan: We fetch a character. If it's a backslash, 440 we set the backslash flag and go through the loop again. 441 On the plus side, this avoids having a duplicate of the 442 main switch inside the backslash case. On the minus side, 443 it means that just about every case begins with 444 "if (backslash) ...". */ 445 for (i = 0; i < 2; ++i) 446 { 447 FETCH(c, 0); 448 switch (c) 449 { 450 case '\\': 451 if (backslash) 452 goto normal_char; 453 if (lexleft == 0) 454 dfaerror(_("Unfinished \\ escape")); 455 backslash = 1; 456 break; 457 458 case '^': 459 if (backslash) 460 goto normal_char; 461 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 462 || lasttok == END 463 || lasttok == LPAREN 464 || lasttok == OR) 465 return lasttok = BEGLINE; 466 goto normal_char; 467 468 case '$': 469 if (backslash) 470 goto normal_char; 471 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 472 || lexleft == 0 473 || (syntax_bits & RE_NO_BK_PARENS 474 ? lexleft > 0 && *lexptr == ')' 475 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 476 || (syntax_bits & RE_NO_BK_VBAR 477 ? lexleft > 0 && *lexptr == '|' 478 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 479 || ((syntax_bits & RE_NEWLINE_ALT) 480 && lexleft > 0 && *lexptr == '\n')) 481 return lasttok = ENDLINE; 482 goto normal_char; 483 484 case '1': 485 case '2': 486 case '3': 487 case '4': 488 case '5': 489 case '6': 490 case '7': 491 case '8': 492 case '9': 493 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 494 { 495 laststart = 0; 496 return lasttok = BACKREF; 497 } 498 goto normal_char; 499 500 case '`': 501 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 502 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 503 goto normal_char; 504 505 case '\'': 506 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 507 return lasttok = ENDLINE; /* FIXME: should be end of string */ 508 goto normal_char; 509 510 case '<': 511 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 512 return lasttok = BEGWORD; 513 goto normal_char; 514 515 case '>': 516 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 517 return lasttok = ENDWORD; 518 goto normal_char; 519 520 case 'b': 521 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 522 return lasttok = LIMWORD; 523 goto normal_char; 524 525 case 'B': 526 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 527 return lasttok = NOTLIMWORD; 528 goto normal_char; 529 530 case '?': 531 if (syntax_bits & RE_LIMITED_OPS) 532 goto normal_char; 533 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 534 goto normal_char; 535 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 536 goto normal_char; 537 return lasttok = QMARK; 538 539 case '*': 540 if (backslash) 541 goto normal_char; 542 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 543 goto normal_char; 544 return lasttok = STAR; 545 546 case '+': 547 if (syntax_bits & RE_LIMITED_OPS) 548 goto normal_char; 549 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 550 goto normal_char; 551 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 552 goto normal_char; 553 return lasttok = PLUS; 554 555 case '{': 556 if (!(syntax_bits & RE_INTERVALS)) 557 goto normal_char; 558 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 559 goto normal_char; 560 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 561 goto normal_char; 562 563 if (syntax_bits & RE_NO_BK_BRACES) 564 { 565 /* Scan ahead for a valid interval; if it's not valid, 566 treat it as a literal '{'. */ 567 int lo = -1, hi = -1; 568 char const *p = lexptr; 569 char const *lim = p + lexleft; 570 for (; p != lim && ISASCIIDIGIT (*p); p++) 571 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 572 if (p != lim && *p == ',') 573 while (++p != lim && ISASCIIDIGIT (*p)) 574 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 575 else 576 hi = lo; 577 if (p == lim || *p != '}' 578 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 579 goto normal_char; 580 } 581 582 minrep = 0; 583 /* Cases: 584 {M} - exact count 585 {M,} - minimum count, maximum is infinity 586 {M,N} - M through N */ 587 FETCH(c, _("unfinished repeat count")); 588 if (ISASCIIDIGIT (c)) 589 { 590 minrep = c - '0'; 591 for (;;) 592 { 593 FETCH(c, _("unfinished repeat count")); 594 if (! ISASCIIDIGIT (c)) 595 break; 596 minrep = 10 * minrep + c - '0'; 597 } 598 } 599 else 600 dfaerror(_("malformed repeat count")); 601 if (c == ',') 602 { 603 FETCH (c, _("unfinished repeat count")); 604 if (! ISASCIIDIGIT (c)) 605 maxrep = -1; 606 else 607 { 608 maxrep = c - '0'; 609 for (;;) 610 { 611 FETCH (c, _("unfinished repeat count")); 612 if (! ISASCIIDIGIT (c)) 613 break; 614 maxrep = 10 * maxrep + c - '0'; 615 } 616 if (0 <= maxrep && maxrep < minrep) 617 dfaerror (_("malformed repeat count")); 618 } 619 } 620 else 621 maxrep = minrep; 622 if (!(syntax_bits & RE_NO_BK_BRACES)) 623 { 624 if (c != '\\') 625 dfaerror(_("malformed repeat count")); 626 FETCH(c, _("unfinished repeat count")); 627 } 628 if (c != '}') 629 dfaerror(_("malformed repeat count")); 630 laststart = 0; 631 return lasttok = REPMN; 632 633 case '|': 634 if (syntax_bits & RE_LIMITED_OPS) 635 goto normal_char; 636 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 637 goto normal_char; 638 laststart = 1; 639 return lasttok = OR; 640 641 case '\n': 642 if (syntax_bits & RE_LIMITED_OPS 643 || backslash 644 || !(syntax_bits & RE_NEWLINE_ALT)) 645 goto normal_char; 646 laststart = 1; 647 return lasttok = OR; 648 649 case '(': 650 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 651 goto normal_char; 652 ++parens; 653 laststart = 1; 654 return lasttok = LPAREN; 655 656 case ')': 657 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 658 goto normal_char; 659 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 660 goto normal_char; 661 --parens; 662 laststart = 0; 663 return lasttok = RPAREN; 664 665 case '.': 666 if (backslash) 667 goto normal_char; 668 zeroset(ccl); 669 notset(ccl); 670 if (!(syntax_bits & RE_DOT_NEWLINE)) 671 clrbit(eolbyte, ccl); 672 if (syntax_bits & RE_DOT_NOT_NULL) 673 clrbit('\0', ccl); 674 laststart = 0; 675 return lasttok = CSET + charclass_index(ccl); 676 677 case 'w': 678 case 'W': 679 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 680 goto normal_char; 681 zeroset(ccl); 682 for (c2 = 0; c2 < NOTCHAR; ++c2) 683 if (IS_WORD_CONSTITUENT(c2)) 684 setbit(c2, ccl); 685 if (c == 'W') 686 notset(ccl); 687 laststart = 0; 688 return lasttok = CSET + charclass_index(ccl); 689 690 case '[': 691 if (backslash) 692 goto normal_char; 693 zeroset(ccl); 694 FETCH(c, _("Unbalanced [")); 695 if (c == '^') 696 { 697 FETCH(c, _("Unbalanced [")); 698 invert = 1; 699 } 700 else 701 invert = 0; 702 do 703 { 704 /* Nobody ever said this had to be fast. :-) 705 Note that if we're looking at some other [:...:] 706 construct, we just treat it as a bunch of ordinary 707 characters. We can do this because we assume 708 regex has checked for syntax errors before 709 dfa is ever called. */ 710 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 711 for (c1 = 0; prednames[c1].name; ++c1) 712 if (looking_at(prednames[c1].name)) 713 { 714 int (*pred)() = prednames[c1].pred; 715 if (case_fold 716 && (pred == is_upper || pred == is_lower)) 717 pred = is_alpha; 718 719 for (c2 = 0; c2 < NOTCHAR; ++c2) 720 if ((*pred)(c2)) 721 setbit(c2, ccl); 722 lexptr += strlen(prednames[c1].name); 723 lexleft -= strlen(prednames[c1].name); 724 FETCH(c1, _("Unbalanced [")); 725 goto skip; 726 } 727 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 728 FETCH(c, _("Unbalanced [")); 729 FETCH(c1, _("Unbalanced [")); 730 if (c1 == '-') 731 { 732 FETCH(c2, _("Unbalanced [")); 733 if (c2 == ']') 734 { 735 /* In the case [x-], the - is an ordinary hyphen, 736 which is left in c1, the lookahead character. */ 737 --lexptr; 738 ++lexleft; 739 c2 = c; 740 } 741 else 742 { 743 if (c2 == '\\' 744 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 745 FETCH(c2, _("Unbalanced [")); 746 FETCH(c1, _("Unbalanced [")); 747 } 748 } 749 else 750 c2 = c; 751 752 lo[0] = c; lo[1] = '\0'; 753 hi[0] = c2; hi[1] = '\0'; 754 for (c = 0; c < NOTCHAR; c++) 755 { 756 char ch[2]; 757 ch[0] = c; ch[1] = '\0'; 758 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0) 759 { 760 setbit (c, ccl); 761 if (case_fold) 762 { 763 if (ISUPPER (c)) 764 setbit (tolower (c), ccl); 765 else if (ISLOWER (c)) 766 setbit (toupper (c), ccl); 767 } 768 } 769 } 770 771 skip: 772 ; 773 } 774 while ((c = c1) != ']'); 775 if (invert) 776 { 777 notset(ccl); 778 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 779 clrbit(eolbyte, ccl); 780 } 781 laststart = 0; 782 return lasttok = CSET + charclass_index(ccl); 783 784 default: 785 normal_char: 786 laststart = 0; 787 if (case_fold && ISALPHA(c)) 788 { 789 zeroset(ccl); 790 setbit(c, ccl); 791 if (isupper(c)) 792 setbit(tolower(c), ccl); 793 else 794 setbit(toupper(c), ccl); 795 return lasttok = CSET + charclass_index(ccl); 796 } 797 return c; 798 } 799 } 800 801 /* The above loop should consume at most a backslash 802 and some other character. */ 803 abort(); 804 return END; /* keeps pedantic compilers happy. */ 805} 806 807/* Recursive descent parser for regular expressions. */ 808 809static token tok; /* Lookahead token. */ 810static int depth; /* Current depth of a hypothetical stack 811 holding deferred productions. This is 812 used to determine the depth that will be 813 required of the real stack later on in 814 dfaanalyze(). */ 815 816/* Add the given token to the parse tree, maintaining the depth count and 817 updating the maximum depth if necessary. */ 818static void 819addtok (token t) 820{ 821 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 822 dfa->tokens[dfa->tindex++] = t; 823 824 switch (t) 825 { 826 case QMARK: 827 case STAR: 828 case PLUS: 829 break; 830 831 case CAT: 832 case OR: 833 case ORTOP: 834 --depth; 835 break; 836 837 default: 838 ++dfa->nleaves; 839 case EMPTY: 840 ++depth; 841 break; 842 } 843 if (depth > dfa->depth) 844 dfa->depth = depth; 845} 846 847/* The grammar understood by the parser is as follows. 848 849 regexp: 850 regexp OR branch 851 branch 852 853 branch: 854 branch closure 855 closure 856 857 closure: 858 closure QMARK 859 closure STAR 860 closure PLUS 861 atom 862 863 atom: 864 <normal character> 865 CSET 866 BACKREF 867 BEGLINE 868 ENDLINE 869 BEGWORD 870 ENDWORD 871 LIMWORD 872 NOTLIMWORD 873 <empty> 874 875 The parser builds a parse tree in postfix form in an array of tokens. */ 876 877static void 878atom (void) 879{ 880 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 881 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 882 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 883 { 884 addtok(tok); 885 tok = lex(); 886 } 887 else if (tok == LPAREN) 888 { 889 tok = lex(); 890 regexp(0); 891 if (tok != RPAREN) 892 dfaerror(_("Unbalanced (")); 893 tok = lex(); 894 } 895 else 896 addtok(EMPTY); 897} 898 899/* Return the number of tokens in the given subexpression. */ 900static int 901nsubtoks (int tindex) 902{ 903 int ntoks1; 904 905 switch (dfa->tokens[tindex - 1]) 906 { 907 default: 908 return 1; 909 case QMARK: 910 case STAR: 911 case PLUS: 912 return 1 + nsubtoks(tindex - 1); 913 case CAT: 914 case OR: 915 case ORTOP: 916 ntoks1 = nsubtoks(tindex - 1); 917 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 918 } 919} 920 921/* Copy the given subexpression to the top of the tree. */ 922static void 923copytoks (int tindex, int ntokens) 924{ 925 int i; 926 927 for (i = 0; i < ntokens; ++i) 928 addtok(dfa->tokens[tindex + i]); 929} 930 931static void 932closure (void) 933{ 934 int tindex, ntokens, i; 935 936 atom(); 937 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 938 if (tok == REPMN) 939 { 940 ntokens = nsubtoks(dfa->tindex); 941 tindex = dfa->tindex - ntokens; 942 if (maxrep < 0) 943 addtok(PLUS); 944 if (minrep == 0) 945 addtok(QMARK); 946 for (i = 1; i < minrep; ++i) 947 { 948 copytoks(tindex, ntokens); 949 addtok(CAT); 950 } 951 for (; i < maxrep; ++i) 952 { 953 copytoks(tindex, ntokens); 954 addtok(QMARK); 955 addtok(CAT); 956 } 957 tok = lex(); 958 } 959 else 960 { 961 addtok(tok); 962 tok = lex(); 963 } 964} 965 966static void 967branch (void) 968{ 969 closure(); 970 while (tok != RPAREN && tok != OR && tok >= 0) 971 { 972 closure(); 973 addtok(CAT); 974 } 975} 976 977static void 978regexp (int toplevel) 979{ 980 branch(); 981 while (tok == OR) 982 { 983 tok = lex(); 984 branch(); 985 if (toplevel) 986 addtok(ORTOP); 987 else 988 addtok(OR); 989 } 990} 991 992/* Main entry point for the parser. S is a string to be parsed, len is the 993 length of the string, so s can include NUL characters. D is a pointer to 994 the struct dfa to parse into. */ 995void 996dfaparse (char *s, size_t len, struct dfa *d) 997{ 998 dfa = d; 999 lexstart = lexptr = s; 1000 lexleft = len; 1001 lasttok = END; 1002 laststart = 1; 1003 parens = 0; 1004 1005 if (! syntax_bits_set) 1006 dfaerror(_("No syntax specified")); 1007 1008 tok = lex(); 1009 depth = d->depth; 1010 1011 regexp(1); 1012 1013 if (tok != END) 1014 dfaerror(_("Unbalanced )")); 1015 1016 addtok(END - d->nregexps); 1017 addtok(CAT); 1018 1019 if (d->nregexps) 1020 addtok(ORTOP); 1021 1022 ++d->nregexps; 1023} 1024 1025/* Some primitives for operating on sets of positions. */ 1026 1027/* Copy one set to another; the destination must be large enough. */ 1028static void 1029copy (position_set *src, position_set *dst) 1030{ 1031 int i; 1032 1033 for (i = 0; i < src->nelem; ++i) 1034 dst->elems[i] = src->elems[i]; 1035 dst->nelem = src->nelem; 1036} 1037 1038/* Insert a position in a set. Position sets are maintained in sorted 1039 order according to index. If position already exists in the set with 1040 the same index then their constraints are logically or'd together. 1041 S->elems must point to an array large enough to hold the resulting set. */ 1042static void 1043insert (position p, position_set *s) 1044{ 1045 int i; 1046 position t1, t2; 1047 1048 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1049 continue; 1050 if (i < s->nelem && p.index == s->elems[i].index) 1051 s->elems[i].constraint |= p.constraint; 1052 else 1053 { 1054 t1 = p; 1055 ++s->nelem; 1056 while (i < s->nelem) 1057 { 1058 t2 = s->elems[i]; 1059 s->elems[i++] = t1; 1060 t1 = t2; 1061 } 1062 } 1063} 1064 1065/* Merge two sets of positions into a third. The result is exactly as if 1066 the positions of both sets were inserted into an initially empty set. */ 1067static void 1068merge (position_set *s1, position_set *s2, position_set *m) 1069{ 1070 int i = 0, j = 0; 1071 1072 m->nelem = 0; 1073 while (i < s1->nelem && j < s2->nelem) 1074 if (s1->elems[i].index > s2->elems[j].index) 1075 m->elems[m->nelem++] = s1->elems[i++]; 1076 else if (s1->elems[i].index < s2->elems[j].index) 1077 m->elems[m->nelem++] = s2->elems[j++]; 1078 else 1079 { 1080 m->elems[m->nelem] = s1->elems[i++]; 1081 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1082 } 1083 while (i < s1->nelem) 1084 m->elems[m->nelem++] = s1->elems[i++]; 1085 while (j < s2->nelem) 1086 m->elems[m->nelem++] = s2->elems[j++]; 1087} 1088 1089/* Delete a position from a set. */ 1090static void 1091delete (position p, position_set *s) 1092{ 1093 int i; 1094 1095 for (i = 0; i < s->nelem; ++i) 1096 if (p.index == s->elems[i].index) 1097 break; 1098 if (i < s->nelem) 1099 for (--s->nelem; i < s->nelem; ++i) 1100 s->elems[i] = s->elems[i + 1]; 1101} 1102 1103/* Find the index of the state corresponding to the given position set with 1104 the given preceding context, or create a new state if there is no such 1105 state. Newline and letter tell whether we got here on a newline or 1106 letter, respectively. */ 1107static int 1108state_index (struct dfa *d, position_set *s, int newline, int letter) 1109{ 1110 int hash = 0; 1111 int constraint; 1112 int i, j; 1113 1114 newline = newline ? 1 : 0; 1115 letter = letter ? 1 : 0; 1116 1117 for (i = 0; i < s->nelem; ++i) 1118 hash ^= s->elems[i].index + s->elems[i].constraint; 1119 1120 /* Try to find a state that exactly matches the proposed one. */ 1121 for (i = 0; i < d->sindex; ++i) 1122 { 1123 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1124 || newline != d->states[i].newline || letter != d->states[i].letter) 1125 continue; 1126 for (j = 0; j < s->nelem; ++j) 1127 if (s->elems[j].constraint 1128 != d->states[i].elems.elems[j].constraint 1129 || s->elems[j].index != d->states[i].elems.elems[j].index) 1130 break; 1131 if (j == s->nelem) 1132 return i; 1133 } 1134 1135 /* We'll have to create a new state. */ 1136 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1137 d->states[i].hash = hash; 1138 MALLOC(d->states[i].elems.elems, position, s->nelem); 1139 copy(s, &d->states[i].elems); 1140 d->states[i].newline = newline; 1141 d->states[i].letter = letter; 1142 d->states[i].backref = 0; 1143 d->states[i].constraint = 0; 1144 d->states[i].first_end = 0; 1145 for (j = 0; j < s->nelem; ++j) 1146 if (d->tokens[s->elems[j].index] < 0) 1147 { 1148 constraint = s->elems[j].constraint; 1149 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1150 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1151 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1152 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1153 d->states[i].constraint |= constraint; 1154 if (! d->states[i].first_end) 1155 d->states[i].first_end = d->tokens[s->elems[j].index]; 1156 } 1157 else if (d->tokens[s->elems[j].index] == BACKREF) 1158 { 1159 d->states[i].constraint = NO_CONSTRAINT; 1160 d->states[i].backref = 1; 1161 } 1162 1163 ++d->sindex; 1164 1165 return i; 1166} 1167 1168/* Find the epsilon closure of a set of positions. If any position of the set 1169 contains a symbol that matches the empty string in some context, replace 1170 that position with the elements of its follow labeled with an appropriate 1171 constraint. Repeat exhaustively until no funny positions are left. 1172 S->elems must be large enough to hold the result. */ 1173static void 1174epsclosure (position_set *s, struct dfa *d) 1175{ 1176 int i, j; 1177 int *visited; 1178 position p, old; 1179 1180 MALLOC(visited, int, d->tindex); 1181 for (i = 0; i < d->tindex; ++i) 1182 visited[i] = 0; 1183 1184 for (i = 0; i < s->nelem; ++i) 1185 if (d->tokens[s->elems[i].index] >= NOTCHAR 1186 && d->tokens[s->elems[i].index] != BACKREF 1187 && d->tokens[s->elems[i].index] < CSET) 1188 { 1189 old = s->elems[i]; 1190 p.constraint = old.constraint; 1191 delete(s->elems[i], s); 1192 if (visited[old.index]) 1193 { 1194 --i; 1195 continue; 1196 } 1197 visited[old.index] = 1; 1198 switch (d->tokens[old.index]) 1199 { 1200 case BEGLINE: 1201 p.constraint &= BEGLINE_CONSTRAINT; 1202 break; 1203 case ENDLINE: 1204 p.constraint &= ENDLINE_CONSTRAINT; 1205 break; 1206 case BEGWORD: 1207 p.constraint &= BEGWORD_CONSTRAINT; 1208 break; 1209 case ENDWORD: 1210 p.constraint &= ENDWORD_CONSTRAINT; 1211 break; 1212 case LIMWORD: 1213 p.constraint &= LIMWORD_CONSTRAINT; 1214 break; 1215 case NOTLIMWORD: 1216 p.constraint &= NOTLIMWORD_CONSTRAINT; 1217 break; 1218 default: 1219 break; 1220 } 1221 for (j = 0; j < d->follows[old.index].nelem; ++j) 1222 { 1223 p.index = d->follows[old.index].elems[j].index; 1224 insert(p, s); 1225 } 1226 /* Force rescan to start at the beginning. */ 1227 i = -1; 1228 } 1229 1230 free(visited); 1231} 1232 1233/* Perform bottom-up analysis on the parse tree, computing various functions. 1234 Note that at this point, we're pretending constructs like \< are real 1235 characters rather than constraints on what can follow them. 1236 1237 Nullable: A node is nullable if it is at the root of a regexp that can 1238 match the empty string. 1239 * EMPTY leaves are nullable. 1240 * No other leaf is nullable. 1241 * A QMARK or STAR node is nullable. 1242 * A PLUS node is nullable if its argument is nullable. 1243 * A CAT node is nullable if both its arguments are nullable. 1244 * An OR node is nullable if either argument is nullable. 1245 1246 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1247 that could correspond to the first character of a string matching the 1248 regexp rooted at the given node. 1249 * EMPTY leaves have empty firstpos. 1250 * The firstpos of a nonempty leaf is that leaf itself. 1251 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1252 argument. 1253 * The firstpos of a CAT node is the firstpos of the left argument, union 1254 the firstpos of the right if the left argument is nullable. 1255 * The firstpos of an OR node is the union of firstpos of each argument. 1256 1257 Lastpos: The lastpos of a node is the set of positions that could 1258 correspond to the last character of a string matching the regexp at 1259 the given node. 1260 * EMPTY leaves have empty lastpos. 1261 * The lastpos of a nonempty leaf is that leaf itself. 1262 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1263 argument. 1264 * The lastpos of a CAT node is the lastpos of its right argument, union 1265 the lastpos of the left if the right argument is nullable. 1266 * The lastpos of an OR node is the union of the lastpos of each argument. 1267 1268 Follow: The follow of a position is the set of positions that could 1269 correspond to the character following a character matching the node in 1270 a string matching the regexp. At this point we consider special symbols 1271 that match the empty string in some context to be just normal characters. 1272 Later, if we find that a special symbol is in a follow set, we will 1273 replace it with the elements of its follow, labeled with an appropriate 1274 constraint. 1275 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1276 the follow of every node in the lastpos. 1277 * Every node in the firstpos of the second argument of a CAT node is in 1278 the follow of every node in the lastpos of the first argument. 1279 1280 Because of the postfix representation of the parse tree, the depth-first 1281 analysis is conveniently done by a linear scan with the aid of a stack. 1282 Sets are stored as arrays of the elements, obeying a stack-like allocation 1283 scheme; the number of elements in each set deeper in the stack can be 1284 used to determine the address of a particular set's array. */ 1285void 1286dfaanalyze (struct dfa *d, int searchflag) 1287{ 1288 int *nullable; /* Nullable stack. */ 1289 int *nfirstpos; /* Element count stack for firstpos sets. */ 1290 position *firstpos; /* Array where firstpos elements are stored. */ 1291 int *nlastpos; /* Element count stack for lastpos sets. */ 1292 position *lastpos; /* Array where lastpos elements are stored. */ 1293 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1294 position_set tmp; /* Temporary set for merging sets. */ 1295 position_set merged; /* Result of merging sets. */ 1296 int wants_newline; /* True if some position wants newline info. */ 1297 int *o_nullable; 1298 int *o_nfirst, *o_nlast; 1299 position *o_firstpos, *o_lastpos; 1300 int i, j; 1301 position *pos; 1302 1303#ifdef DEBUG 1304 fprintf(stderr, "dfaanalyze:\n"); 1305 for (i = 0; i < d->tindex; ++i) 1306 { 1307 fprintf(stderr, " %d:", i); 1308 prtok(d->tokens[i]); 1309 } 1310 putc('\n', stderr); 1311#endif 1312 1313 d->searchflag = searchflag; 1314 1315 MALLOC(nullable, int, d->depth); 1316 o_nullable = nullable; 1317 MALLOC(nfirstpos, int, d->depth); 1318 o_nfirst = nfirstpos; 1319 MALLOC(firstpos, position, d->nleaves); 1320 o_firstpos = firstpos, firstpos += d->nleaves; 1321 MALLOC(nlastpos, int, d->depth); 1322 o_nlast = nlastpos; 1323 MALLOC(lastpos, position, d->nleaves); 1324 o_lastpos = lastpos, lastpos += d->nleaves; 1325 MALLOC(nalloc, int, d->tindex); 1326 for (i = 0; i < d->tindex; ++i) 1327 nalloc[i] = 0; 1328 MALLOC(merged.elems, position, d->nleaves); 1329 1330 CALLOC(d->follows, position_set, d->tindex); 1331 1332 for (i = 0; i < d->tindex; ++i) 1333#ifdef DEBUG 1334 { /* Nonsyntactic #ifdef goo... */ 1335#endif 1336 switch (d->tokens[i]) 1337 { 1338 case EMPTY: 1339 /* The empty set is nullable. */ 1340 *nullable++ = 1; 1341 1342 /* The firstpos and lastpos of the empty leaf are both empty. */ 1343 *nfirstpos++ = *nlastpos++ = 0; 1344 break; 1345 1346 case STAR: 1347 case PLUS: 1348 /* Every element in the firstpos of the argument is in the follow 1349 of every element in the lastpos. */ 1350 tmp.nelem = nfirstpos[-1]; 1351 tmp.elems = firstpos; 1352 pos = lastpos; 1353 for (j = 0; j < nlastpos[-1]; ++j) 1354 { 1355 merge(&tmp, &d->follows[pos[j].index], &merged); 1356 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1357 nalloc[pos[j].index], merged.nelem - 1); 1358 copy(&merged, &d->follows[pos[j].index]); 1359 } 1360 1361 case QMARK: 1362 /* A QMARK or STAR node is automatically nullable. */ 1363 if (d->tokens[i] != PLUS) 1364 nullable[-1] = 1; 1365 break; 1366 1367 case CAT: 1368 /* Every element in the firstpos of the second argument is in the 1369 follow of every element in the lastpos of the first argument. */ 1370 tmp.nelem = nfirstpos[-1]; 1371 tmp.elems = firstpos; 1372 pos = lastpos + nlastpos[-1]; 1373 for (j = 0; j < nlastpos[-2]; ++j) 1374 { 1375 merge(&tmp, &d->follows[pos[j].index], &merged); 1376 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1377 nalloc[pos[j].index], merged.nelem - 1); 1378 copy(&merged, &d->follows[pos[j].index]); 1379 } 1380 1381 /* The firstpos of a CAT node is the firstpos of the first argument, 1382 union that of the second argument if the first is nullable. */ 1383 if (nullable[-2]) 1384 nfirstpos[-2] += nfirstpos[-1]; 1385 else 1386 firstpos += nfirstpos[-1]; 1387 --nfirstpos; 1388 1389 /* The lastpos of a CAT node is the lastpos of the second argument, 1390 union that of the first argument if the second is nullable. */ 1391 if (nullable[-1]) 1392 nlastpos[-2] += nlastpos[-1]; 1393 else 1394 { 1395 pos = lastpos + nlastpos[-2]; 1396 for (j = nlastpos[-1] - 1; j >= 0; --j) 1397 pos[j] = lastpos[j]; 1398 lastpos += nlastpos[-2]; 1399 nlastpos[-2] = nlastpos[-1]; 1400 } 1401 --nlastpos; 1402 1403 /* A CAT node is nullable if both arguments are nullable. */ 1404 nullable[-2] = nullable[-1] && nullable[-2]; 1405 --nullable; 1406 break; 1407 1408 case OR: 1409 case ORTOP: 1410 /* The firstpos is the union of the firstpos of each argument. */ 1411 nfirstpos[-2] += nfirstpos[-1]; 1412 --nfirstpos; 1413 1414 /* The lastpos is the union of the lastpos of each argument. */ 1415 nlastpos[-2] += nlastpos[-1]; 1416 --nlastpos; 1417 1418 /* An OR node is nullable if either argument is nullable. */ 1419 nullable[-2] = nullable[-1] || nullable[-2]; 1420 --nullable; 1421 break; 1422 1423 default: 1424 /* Anything else is a nonempty position. (Note that special 1425 constructs like \< are treated as nonempty strings here; 1426 an "epsilon closure" effectively makes them nullable later. 1427 Backreferences have to get a real position so we can detect 1428 transitions on them later. But they are nullable. */ 1429 *nullable++ = d->tokens[i] == BACKREF; 1430 1431 /* This position is in its own firstpos and lastpos. */ 1432 *nfirstpos++ = *nlastpos++ = 1; 1433 --firstpos, --lastpos; 1434 firstpos->index = lastpos->index = i; 1435 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1436 1437 /* Allocate the follow set for this position. */ 1438 nalloc[i] = 1; 1439 MALLOC(d->follows[i].elems, position, nalloc[i]); 1440 break; 1441 } 1442#ifdef DEBUG 1443 /* ... balance the above nonsyntactic #ifdef goo... */ 1444 fprintf(stderr, "node %d:", i); 1445 prtok(d->tokens[i]); 1446 putc('\n', stderr); 1447 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1448 fprintf(stderr, " firstpos:"); 1449 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1450 { 1451 fprintf(stderr, " %d:", firstpos[j].index); 1452 prtok(d->tokens[firstpos[j].index]); 1453 } 1454 fprintf(stderr, "\n lastpos:"); 1455 for (j = nlastpos[-1] - 1; j >= 0; --j) 1456 { 1457 fprintf(stderr, " %d:", lastpos[j].index); 1458 prtok(d->tokens[lastpos[j].index]); 1459 } 1460 putc('\n', stderr); 1461 } 1462#endif 1463 1464 /* For each follow set that is the follow set of a real position, replace 1465 it with its epsilon closure. */ 1466 for (i = 0; i < d->tindex; ++i) 1467 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1468 || d->tokens[i] >= CSET) 1469 { 1470#ifdef DEBUG 1471 fprintf(stderr, "follows(%d:", i); 1472 prtok(d->tokens[i]); 1473 fprintf(stderr, "):"); 1474 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1475 { 1476 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1477 prtok(d->tokens[d->follows[i].elems[j].index]); 1478 } 1479 putc('\n', stderr); 1480#endif 1481 copy(&d->follows[i], &merged); 1482 epsclosure(&merged, d); 1483 if (d->follows[i].nelem < merged.nelem) 1484 REALLOC(d->follows[i].elems, position, merged.nelem); 1485 copy(&merged, &d->follows[i]); 1486 } 1487 1488 /* Get the epsilon closure of the firstpos of the regexp. The result will 1489 be the set of positions of state 0. */ 1490 merged.nelem = 0; 1491 for (i = 0; i < nfirstpos[-1]; ++i) 1492 insert(firstpos[i], &merged); 1493 epsclosure(&merged, d); 1494 1495 /* Check if any of the positions of state 0 will want newline context. */ 1496 wants_newline = 0; 1497 for (i = 0; i < merged.nelem; ++i) 1498 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1499 wants_newline = 1; 1500 1501 /* Build the initial state. */ 1502 d->salloc = 1; 1503 d->sindex = 0; 1504 MALLOC(d->states, dfa_state, d->salloc); 1505 state_index(d, &merged, wants_newline, 0); 1506 1507 free(o_nullable); 1508 free(o_nfirst); 1509 free(o_firstpos); 1510 free(o_nlast); 1511 free(o_lastpos); 1512 free(nalloc); 1513 free(merged.elems); 1514} 1515 1516/* Find, for each character, the transition out of state s of d, and store 1517 it in the appropriate slot of trans. 1518 1519 We divide the positions of s into groups (positions can appear in more 1520 than one group). Each group is labeled with a set of characters that 1521 every position in the group matches (taking into account, if necessary, 1522 preceding context information of s). For each group, find the union 1523 of the its elements' follows. This set is the set of positions of the 1524 new state. For each character in the group's label, set the transition 1525 on this character to be to a state corresponding to the set's positions, 1526 and its associated backward context information, if necessary. 1527 1528 If we are building a searching matcher, we include the positions of state 1529 0 in every state. 1530 1531 The collection of groups is constructed by building an equivalence-class 1532 partition of the positions of s. 1533 1534 For each position, find the set of characters C that it matches. Eliminate 1535 any characters from C that fail on grounds of backward context. 1536 1537 Search through the groups, looking for a group whose label L has nonempty 1538 intersection with C. If L - C is nonempty, create a new group labeled 1539 L - C and having the same positions as the current group, and set L to 1540 the intersection of L and C. Insert the position in this group, set 1541 C = C - L, and resume scanning. 1542 1543 If after comparing with every group there are characters remaining in C, 1544 create a new group labeled with the characters of C and insert this 1545 position in that group. */ 1546void 1547dfastate (int s, struct dfa *d, int trans[]) 1548{ 1549 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1550 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1551 int ngrps = 0; /* Number of groups actually used. */ 1552 position pos; /* Current position being considered. */ 1553 charclass matches; /* Set of matching characters. */ 1554 int matchesf; /* True if matches is nonempty. */ 1555 charclass intersect; /* Intersection with some label set. */ 1556 int intersectf; /* True if intersect is nonempty. */ 1557 charclass leftovers; /* Stuff in the label that didn't match. */ 1558 int leftoversf; /* True if leftovers is nonempty. */ 1559 static charclass letters; /* Set of characters considered letters. */ 1560 static charclass newline; /* Set of characters that aren't newline. */ 1561 position_set follows; /* Union of the follows of some group. */ 1562 position_set tmp; /* Temporary space for merging sets. */ 1563 int state; /* New state. */ 1564 int wants_newline; /* New state wants to know newline context. */ 1565 int state_newline; /* New state on a newline transition. */ 1566 int wants_letter; /* New state wants to know letter context. */ 1567 int state_letter; /* New state on a letter transition. */ 1568 static int initialized; /* Flag for static initialization. */ 1569 int i, j, k; 1570 1571 /* Initialize the set of letters, if necessary. */ 1572 if (! initialized) 1573 { 1574 initialized = 1; 1575 for (i = 0; i < NOTCHAR; ++i) 1576 if (IS_WORD_CONSTITUENT(i)) 1577 setbit(i, letters); 1578 setbit(eolbyte, newline); 1579 } 1580 1581 zeroset(matches); 1582 1583 for (i = 0; i < d->states[s].elems.nelem; ++i) 1584 { 1585 pos = d->states[s].elems.elems[i]; 1586 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1587 setbit(d->tokens[pos.index], matches); 1588 else if (d->tokens[pos.index] >= CSET) 1589 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1590 else 1591 continue; 1592 1593 /* Some characters may need to be eliminated from matches because 1594 they fail in the current context. */ 1595 if (pos.constraint != 0xFF) 1596 { 1597 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1598 d->states[s].newline, 1)) 1599 clrbit(eolbyte, matches); 1600 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1601 d->states[s].newline, 0)) 1602 for (j = 0; j < CHARCLASS_INTS; ++j) 1603 matches[j] &= newline[j]; 1604 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1605 d->states[s].letter, 1)) 1606 for (j = 0; j < CHARCLASS_INTS; ++j) 1607 matches[j] &= ~letters[j]; 1608 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1609 d->states[s].letter, 0)) 1610 for (j = 0; j < CHARCLASS_INTS; ++j) 1611 matches[j] &= letters[j]; 1612 1613 /* If there are no characters left, there's no point in going on. */ 1614 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 1615 continue; 1616 if (j == CHARCLASS_INTS) 1617 continue; 1618 } 1619 1620 for (j = 0; j < ngrps; ++j) 1621 { 1622 /* If matches contains a single character only, and the current 1623 group's label doesn't contain that character, go on to the 1624 next group. */ 1625 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 1626 && !tstbit(d->tokens[pos.index], labels[j])) 1627 continue; 1628 1629 /* Check if this group's label has a nonempty intersection with 1630 matches. */ 1631 intersectf = 0; 1632 for (k = 0; k < CHARCLASS_INTS; ++k) 1633 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 1634 if (! intersectf) 1635 continue; 1636 1637 /* It does; now find the set differences both ways. */ 1638 leftoversf = matchesf = 0; 1639 for (k = 0; k < CHARCLASS_INTS; ++k) 1640 { 1641 /* Even an optimizing compiler can't know this for sure. */ 1642 int match = matches[k], label = labels[j][k]; 1643 1644 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 1645 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 1646 } 1647 1648 /* If there were leftovers, create a new group labeled with them. */ 1649 if (leftoversf) 1650 { 1651 copyset(leftovers, labels[ngrps]); 1652 copyset(intersect, labels[j]); 1653 MALLOC(grps[ngrps].elems, position, d->nleaves); 1654 copy(&grps[j], &grps[ngrps]); 1655 ++ngrps; 1656 } 1657 1658 /* Put the position in the current group. Note that there is no 1659 reason to call insert() here. */ 1660 grps[j].elems[grps[j].nelem++] = pos; 1661 1662 /* If every character matching the current position has been 1663 accounted for, we're done. */ 1664 if (! matchesf) 1665 break; 1666 } 1667 1668 /* If we've passed the last group, and there are still characters 1669 unaccounted for, then we'll have to create a new group. */ 1670 if (j == ngrps) 1671 { 1672 copyset(matches, labels[ngrps]); 1673 zeroset(matches); 1674 MALLOC(grps[ngrps].elems, position, d->nleaves); 1675 grps[ngrps].nelem = 1; 1676 grps[ngrps].elems[0] = pos; 1677 ++ngrps; 1678 } 1679 } 1680 1681 MALLOC(follows.elems, position, d->nleaves); 1682 MALLOC(tmp.elems, position, d->nleaves); 1683 1684 /* If we are a searching matcher, the default transition is to a state 1685 containing the positions of state 0, otherwise the default transition 1686 is to fail miserably. */ 1687 if (d->searchflag) 1688 { 1689 wants_newline = 0; 1690 wants_letter = 0; 1691 for (i = 0; i < d->states[0].elems.nelem; ++i) 1692 { 1693 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1694 wants_newline = 1; 1695 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 1696 wants_letter = 1; 1697 } 1698 copy(&d->states[0].elems, &follows); 1699 state = state_index(d, &follows, 0, 0); 1700 if (wants_newline) 1701 state_newline = state_index(d, &follows, 1, 0); 1702 else 1703 state_newline = state; 1704 if (wants_letter) 1705 state_letter = state_index(d, &follows, 0, 1); 1706 else 1707 state_letter = state; 1708 for (i = 0; i < NOTCHAR; ++i) 1709 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 1710 trans[eolbyte] = state_newline; 1711 } 1712 else 1713 for (i = 0; i < NOTCHAR; ++i) 1714 trans[i] = -1; 1715 1716 for (i = 0; i < ngrps; ++i) 1717 { 1718 follows.nelem = 0; 1719 1720 /* Find the union of the follows of the positions of the group. 1721 This is a hideously inefficient loop. Fix it someday. */ 1722 for (j = 0; j < grps[i].nelem; ++j) 1723 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 1724 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 1725 1726 /* If we are building a searching matcher, throw in the positions 1727 of state 0 as well. */ 1728 if (d->searchflag) 1729 for (j = 0; j < d->states[0].elems.nelem; ++j) 1730 insert(d->states[0].elems.elems[j], &follows); 1731 1732 /* Find out if the new state will want any context information. */ 1733 wants_newline = 0; 1734 if (tstbit(eolbyte, labels[i])) 1735 for (j = 0; j < follows.nelem; ++j) 1736 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 1737 wants_newline = 1; 1738 1739 wants_letter = 0; 1740 for (j = 0; j < CHARCLASS_INTS; ++j) 1741 if (labels[i][j] & letters[j]) 1742 break; 1743 if (j < CHARCLASS_INTS) 1744 for (j = 0; j < follows.nelem; ++j) 1745 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 1746 wants_letter = 1; 1747 1748 /* Find the state(s) corresponding to the union of the follows. */ 1749 state = state_index(d, &follows, 0, 0); 1750 if (wants_newline) 1751 state_newline = state_index(d, &follows, 1, 0); 1752 else 1753 state_newline = state; 1754 if (wants_letter) 1755 state_letter = state_index(d, &follows, 0, 1); 1756 else 1757 state_letter = state; 1758 1759 /* Set the transitions for each character in the current label. */ 1760 for (j = 0; j < CHARCLASS_INTS; ++j) 1761 for (k = 0; k < INTBITS; ++k) 1762 if (labels[i][j] & 1 << k) 1763 { 1764 int c = j * INTBITS + k; 1765 1766 if (c == eolbyte) 1767 trans[c] = state_newline; 1768 else if (IS_WORD_CONSTITUENT(c)) 1769 trans[c] = state_letter; 1770 else if (c < NOTCHAR) 1771 trans[c] = state; 1772 } 1773 } 1774 1775 for (i = 0; i < ngrps; ++i) 1776 free(grps[i].elems); 1777 free(follows.elems); 1778 free(tmp.elems); 1779} 1780 1781/* Some routines for manipulating a compiled dfa's transition tables. 1782 Each state may or may not have a transition table; if it does, and it 1783 is a non-accepting state, then d->trans[state] points to its table. 1784 If it is an accepting state then d->fails[state] points to its table. 1785 If it has no table at all, then d->trans[state] is NULL. 1786 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 1787 1788static void 1789build_state (int s, struct dfa *d) 1790{ 1791 int *trans; /* The new transition table. */ 1792 int i; 1793 1794 /* Set an upper limit on the number of transition tables that will ever 1795 exist at once. 1024 is arbitrary. The idea is that the frequently 1796 used transition tables will be quickly rebuilt, whereas the ones that 1797 were only needed once or twice will be cleared away. */ 1798 if (d->trcount >= 1024) 1799 { 1800 for (i = 0; i < d->tralloc; ++i) 1801 if (d->trans[i]) 1802 { 1803 free((ptr_t) d->trans[i]); 1804 d->trans[i] = NULL; 1805 } 1806 else if (d->fails[i]) 1807 { 1808 free((ptr_t) d->fails[i]); 1809 d->fails[i] = NULL; 1810 } 1811 d->trcount = 0; 1812 } 1813 1814 ++d->trcount; 1815 1816 /* Set up the success bits for this state. */ 1817 d->success[s] = 0; 1818 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 1819 s, *d)) 1820 d->success[s] |= 4; 1821 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 1822 s, *d)) 1823 d->success[s] |= 2; 1824 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 1825 s, *d)) 1826 d->success[s] |= 1; 1827 1828 MALLOC(trans, int, NOTCHAR); 1829 dfastate(s, d, trans); 1830 1831 /* Now go through the new transition table, and make sure that the trans 1832 and fail arrays are allocated large enough to hold a pointer for the 1833 largest state mentioned in the table. */ 1834 for (i = 0; i < NOTCHAR; ++i) 1835 if (trans[i] >= d->tralloc) 1836 { 1837 int oldalloc = d->tralloc; 1838 1839 while (trans[i] >= d->tralloc) 1840 d->tralloc *= 2; 1841 REALLOC(d->realtrans, int *, d->tralloc + 1); 1842 d->trans = d->realtrans + 1; 1843 REALLOC(d->fails, int *, d->tralloc); 1844 REALLOC(d->success, int, d->tralloc); 1845 REALLOC(d->newlines, int, d->tralloc); 1846 while (oldalloc < d->tralloc) 1847 { 1848 d->trans[oldalloc] = NULL; 1849 d->fails[oldalloc++] = NULL; 1850 } 1851 } 1852 1853 /* Keep the newline transition in a special place so we can use it as 1854 a sentinel. */ 1855 d->newlines[s] = trans[eolbyte]; 1856 trans[eolbyte] = -1; 1857 1858 if (ACCEPTING(s, *d)) 1859 d->fails[s] = trans; 1860 else 1861 d->trans[s] = trans; 1862} 1863 1864static void 1865build_state_zero (struct dfa *d) 1866{ 1867 d->tralloc = 1; 1868 d->trcount = 0; 1869 CALLOC(d->realtrans, int *, d->tralloc + 1); 1870 d->trans = d->realtrans + 1; 1871 CALLOC(d->fails, int *, d->tralloc); 1872 MALLOC(d->success, int, d->tralloc); 1873 MALLOC(d->newlines, int, d->tralloc); 1874 build_state(0, d); 1875} 1876 1877/* Search through a buffer looking for a match to the given struct dfa. 1878 Find the first occurrence of a string matching the regexp in the buffer, 1879 and the shortest possible version thereof. Return a pointer to the first 1880 character after the match, or NULL if none is found. Begin points to 1881 the beginning of the buffer, and end points to the first character after 1882 its end. We store a newline in *end to act as a sentinel, so end had 1883 better point somewhere valid. Newline is a flag indicating whether to 1884 allow newlines to be in the matching string. If count is non- 1885 NULL it points to a place we're supposed to increment every time we 1886 see a newline. Finally, if backref is non-NULL it points to a place 1887 where we're supposed to store a 1 if backreferencing happened and the 1888 match needs to be verified by a backtracking matcher. Otherwise 1889 we store a 0 in *backref. */ 1890char * 1891dfaexec (struct dfa *d, char *begin, char *end, 1892 int newline, int *count, int *backref) 1893{ 1894 register int s, s1, tmp; /* Current state. */ 1895 register unsigned char *p; /* Current input character. */ 1896 register int **trans, *t; /* Copy of d->trans so it can be optimized 1897 into a register. */ 1898 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 1899 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 1900 static int sbit_init; 1901 1902 if (! sbit_init) 1903 { 1904 int i; 1905 1906 sbit_init = 1; 1907 for (i = 0; i < NOTCHAR; ++i) 1908 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 1909 sbit[eol] = 4; 1910 } 1911 1912 if (! d->tralloc) 1913 build_state_zero(d); 1914 1915 s = s1 = 0; 1916 p = (unsigned char *) begin; 1917 trans = d->trans; 1918 *end = eol; 1919 1920 for (;;) 1921 { 1922 while ((t = trans[s]) != 0) { /* hand-optimized loop */ 1923 s1 = t[*p++]; 1924 if ((t = trans[s1]) == 0) { 1925 tmp = s ; s = s1 ; s1 = tmp ; /* swap */ 1926 break; 1927 } 1928 s = t[*p++]; 1929 } 1930 1931 if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) 1932 { 1933 if (d->success[s] & sbit[*p]) 1934 { 1935 if (backref) 1936 *backref = (d->states[s].backref != 0); 1937 return (char *) p; 1938 } 1939 1940 s1 = s; 1941 s = d->fails[s][*p++]; 1942 continue; 1943 } 1944 1945 /* If the previous character was a newline, count it. */ 1946 if (count && (char *) p <= end && p[-1] == eol) 1947 ++*count; 1948 1949 /* Check if we've run off the end of the buffer. */ 1950 if ((char *) p > end) 1951 return NULL; 1952 1953 if (s >= 0) 1954 { 1955 build_state(s, d); 1956 trans = d->trans; 1957 continue; 1958 } 1959 1960 if (p[-1] == eol && newline) 1961 { 1962 s = d->newlines[s1]; 1963 continue; 1964 } 1965 1966 s = 0; 1967 } 1968} 1969 1970/* Initialize the components of a dfa that the other routines don't 1971 initialize for themselves. */ 1972void 1973dfainit (struct dfa *d) 1974{ 1975 d->calloc = 1; 1976 MALLOC(d->charclasses, charclass, d->calloc); 1977 d->cindex = 0; 1978 1979 d->talloc = 1; 1980 MALLOC(d->tokens, token, d->talloc); 1981 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 1982 1983 d->searchflag = 0; 1984 d->tralloc = 0; 1985 1986 d->musts = 0; 1987} 1988 1989/* Parse and analyze a single string of the given length. */ 1990void 1991dfacomp (char *s, size_t len, struct dfa *d, int searchflag) 1992{ 1993 if (case_fold) /* dummy folding in service of dfamust() */ 1994 { 1995 char *lcopy; 1996 int i; 1997 1998 lcopy = malloc(len); 1999 if (!lcopy) 2000 dfaerror(_("out of memory")); 2001 2002 /* This is a kludge. */ 2003 case_fold = 0; 2004 for (i = 0; i < len; ++i) 2005 if (ISUPPER ((unsigned char) s[i])) 2006 lcopy[i] = tolower ((unsigned char) s[i]); 2007 else 2008 lcopy[i] = s[i]; 2009 2010 dfainit(d); 2011 dfaparse(lcopy, len, d); 2012 free(lcopy); 2013 dfamust(d); 2014 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2015 case_fold = 1; 2016 dfaparse(s, len, d); 2017 dfaanalyze(d, searchflag); 2018 } 2019 else 2020 { 2021 dfainit(d); 2022 dfaparse(s, len, d); 2023 dfamust(d); 2024 dfaanalyze(d, searchflag); 2025 } 2026} 2027 2028/* Free the storage held by the components of a dfa. */ 2029void 2030dfafree (struct dfa *d) 2031{ 2032 int i; 2033 struct dfamust *dm, *ndm; 2034 2035 free((ptr_t) d->charclasses); 2036 free((ptr_t) d->tokens); 2037 for (i = 0; i < d->sindex; ++i) 2038 free((ptr_t) d->states[i].elems.elems); 2039 free((ptr_t) d->states); 2040 for (i = 0; i < d->tindex; ++i) 2041 if (d->follows[i].elems) 2042 free((ptr_t) d->follows[i].elems); 2043 free((ptr_t) d->follows); 2044 for (i = 0; i < d->tralloc; ++i) 2045 if (d->trans[i]) 2046 free((ptr_t) d->trans[i]); 2047 else if (d->fails[i]) 2048 free((ptr_t) d->fails[i]); 2049 if (d->realtrans) free((ptr_t) d->realtrans); 2050 if (d->fails) free((ptr_t) d->fails); 2051 if (d->newlines) free((ptr_t) d->newlines); 2052 if (d->success) free((ptr_t) d->success); 2053 for (dm = d->musts; dm; dm = ndm) 2054 { 2055 ndm = dm->next; 2056 free(dm->must); 2057 free((ptr_t) dm); 2058 } 2059} 2060 2061/* Having found the postfix representation of the regular expression, 2062 try to find a long sequence of characters that must appear in any line 2063 containing the r.e. 2064 Finding a "longest" sequence is beyond the scope here; 2065 we take an easy way out and hope for the best. 2066 (Take "(ab|a)b"--please.) 2067 2068 We do a bottom-up calculation of sequences of characters that must appear 2069 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 2070 representation: 2071 sequences that must appear at the left of the match ("left") 2072 sequences that must appear at the right of the match ("right") 2073 lists of sequences that must appear somewhere in the match ("in") 2074 sequences that must constitute the match ("is") 2075 2076 When we get to the root of the tree, we use one of the longest of its 2077 calculated "in" sequences as our answer. The sequence we find is returned in 2078 d->must (where "d" is the single argument passed to "dfamust"); 2079 the length of the sequence is returned in d->mustn. 2080 2081 The sequences calculated for the various types of node (in pseudo ANSI c) 2082 are shown below. "p" is the operand of unary operators (and the left-hand 2083 operand of binary operators); "q" is the right-hand operand of binary 2084 operators. 2085 2086 "ZERO" means "a zero-length sequence" below. 2087 2088 Type left right is in 2089 ---- ---- ----- -- -- 2090 char c # c # c # c # c 2091 2092 CSET ZERO ZERO ZERO ZERO 2093 2094 STAR ZERO ZERO ZERO ZERO 2095 2096 QMARK ZERO ZERO ZERO ZERO 2097 2098 PLUS p->left p->right ZERO p->in 2099 2100 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 2101 p->left : q->right : q->is!=ZERO) ? q->in plus 2102 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 2103 ZERO 2104 2105 OR longest common longest common (do p->is and substrings common to 2106 leading trailing q->is have same p->in and q->in 2107 (sub)sequence (sub)sequence length and 2108 of p->left of p->right content) ? 2109 and q->left and q->right p->is : NULL 2110 2111 If there's anything else we recognize in the tree, all four sequences get set 2112 to zero-length sequences. If there's something we don't recognize in the tree, 2113 we just return a zero-length sequence. 2114 2115 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 2116 'aaa')? 2117 2118 And. . .is it here or someplace that we might ponder "optimizations" such as 2119 egrep 'psi|epsilon' -> egrep 'psi' 2120 egrep 'pepsi|epsilon' -> egrep 'epsi' 2121 (Yes, we now find "epsi" as a "string 2122 that must occur", but we might also 2123 simplify the *entire* r.e. being sought) 2124 grep '[c]' -> grep 'c' 2125 grep '(ab|a)b' -> grep 'ab' 2126 grep 'ab*' -> grep 'a' 2127 grep 'a*b' -> grep 'b' 2128 2129 There are several issues: 2130 2131 Is optimization easy (enough)? 2132 2133 Does optimization actually accomplish anything, 2134 or is the automaton you get from "psi|epsilon" (for example) 2135 the same as the one you get from "psi" (for example)? 2136 2137 Are optimizable r.e.'s likely to be used in real-life situations 2138 (something like 'ab*' is probably unlikely; something like is 2139 'psi|epsilon' is likelier)? */ 2140 2141static char * 2142icatalloc (char *old, char *new) 2143{ 2144 char *result; 2145 size_t oldsize, newsize; 2146 2147 newsize = (new == NULL) ? 0 : strlen(new); 2148 if (old == NULL) 2149 oldsize = 0; 2150 else if (newsize == 0) 2151 return old; 2152 else oldsize = strlen(old); 2153 if (old == NULL) 2154 result = (char *) malloc(newsize + 1); 2155 else 2156 result = (char *) realloc((void *) old, oldsize + newsize + 1); 2157 if (result != NULL && new != NULL) 2158 (void) strcpy(result + oldsize, new); 2159 return result; 2160} 2161 2162static char * 2163icpyalloc (char *string) 2164{ 2165 return icatalloc((char *) NULL, string); 2166} 2167 2168static char * 2169istrstr (char *lookin, char *lookfor) 2170{ 2171 char *cp; 2172 size_t len; 2173 2174 len = strlen(lookfor); 2175 for (cp = lookin; *cp != '\0'; ++cp) 2176 if (strncmp(cp, lookfor, len) == 0) 2177 return cp; 2178 return NULL; 2179} 2180 2181static void 2182ifree (char *cp) 2183{ 2184 if (cp != NULL) 2185 free(cp); 2186} 2187 2188static void 2189freelist (char **cpp) 2190{ 2191 int i; 2192 2193 if (cpp == NULL) 2194 return; 2195 for (i = 0; cpp[i] != NULL; ++i) 2196 { 2197 free(cpp[i]); 2198 cpp[i] = NULL; 2199 } 2200} 2201 2202static char ** 2203enlist (char **cpp, char *new, size_t len) 2204{ 2205 int i, j; 2206 2207 if (cpp == NULL) 2208 return NULL; 2209 if ((new = icpyalloc(new)) == NULL) 2210 { 2211 freelist(cpp); 2212 return NULL; 2213 } 2214 new[len] = '\0'; 2215 /* Is there already something in the list that's new (or longer)? */ 2216 for (i = 0; cpp[i] != NULL; ++i) 2217 if (istrstr(cpp[i], new) != NULL) 2218 { 2219 free(new); 2220 return cpp; 2221 } 2222 /* Eliminate any obsoleted strings. */ 2223 j = 0; 2224 while (cpp[j] != NULL) 2225 if (istrstr(new, cpp[j]) == NULL) 2226 ++j; 2227 else 2228 { 2229 free(cpp[j]); 2230 if (--i == j) 2231 break; 2232 cpp[j] = cpp[i]; 2233 cpp[i] = NULL; 2234 } 2235 /* Add the new string. */ 2236 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 2237 if (cpp == NULL) 2238 return NULL; 2239 cpp[i] = new; 2240 cpp[i + 1] = NULL; 2241 return cpp; 2242} 2243 2244/* Given pointers to two strings, return a pointer to an allocated 2245 list of their distinct common substrings. Return NULL if something 2246 seems wild. */ 2247static char ** 2248comsubs (char *left, char *right) 2249{ 2250 char **cpp; 2251 char *lcp; 2252 char *rcp; 2253 size_t i, len; 2254 2255 if (left == NULL || right == NULL) 2256 return NULL; 2257 cpp = (char **) malloc(sizeof *cpp); 2258 if (cpp == NULL) 2259 return NULL; 2260 cpp[0] = NULL; 2261 for (lcp = left; *lcp != '\0'; ++lcp) 2262 { 2263 len = 0; 2264 rcp = index(right, *lcp); 2265 while (rcp != NULL) 2266 { 2267 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 2268 continue; 2269 if (i > len) 2270 len = i; 2271 rcp = index(rcp + 1, *lcp); 2272 } 2273 if (len == 0) 2274 continue; 2275 if ((cpp = enlist(cpp, lcp, len)) == NULL) 2276 break; 2277 } 2278 return cpp; 2279} 2280 2281static char ** 2282addlists (char **old, char **new) 2283{ 2284 int i; 2285 2286 if (old == NULL || new == NULL) 2287 return NULL; 2288 for (i = 0; new[i] != NULL; ++i) 2289 { 2290 old = enlist(old, new[i], strlen(new[i])); 2291 if (old == NULL) 2292 break; 2293 } 2294 return old; 2295} 2296 2297/* Given two lists of substrings, return a new list giving substrings 2298 common to both. */ 2299static char ** 2300inboth (char **left, char **right) 2301{ 2302 char **both; 2303 char **temp; 2304 int lnum, rnum; 2305 2306 if (left == NULL || right == NULL) 2307 return NULL; 2308 both = (char **) malloc(sizeof *both); 2309 if (both == NULL) 2310 return NULL; 2311 both[0] = NULL; 2312 for (lnum = 0; left[lnum] != NULL; ++lnum) 2313 { 2314 for (rnum = 0; right[rnum] != NULL; ++rnum) 2315 { 2316 temp = comsubs(left[lnum], right[rnum]); 2317 if (temp == NULL) 2318 { 2319 freelist(both); 2320 return NULL; 2321 } 2322 both = addlists(both, temp); 2323 freelist(temp); 2324 free(temp); 2325 if (both == NULL) 2326 return NULL; 2327 } 2328 } 2329 return both; 2330} 2331 2332typedef struct 2333{ 2334 char **in; 2335 char *left; 2336 char *right; 2337 char *is; 2338} must; 2339 2340static void 2341resetmust (must *mp) 2342{ 2343 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 2344 freelist(mp->in); 2345} 2346 2347static void 2348dfamust (struct dfa *dfa) 2349{ 2350 must *musts; 2351 must *mp; 2352 char *result; 2353 int ri; 2354 int i; 2355 int exact; 2356 token t; 2357 static must must0; 2358 struct dfamust *dm; 2359 static char empty_string[] = ""; 2360 2361 result = empty_string; 2362 exact = 0; 2363 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 2364 if (musts == NULL) 2365 return; 2366 mp = musts; 2367 for (i = 0; i <= dfa->tindex; ++i) 2368 mp[i] = must0; 2369 for (i = 0; i <= dfa->tindex; ++i) 2370 { 2371 mp[i].in = (char **) malloc(sizeof *mp[i].in); 2372 mp[i].left = malloc(2); 2373 mp[i].right = malloc(2); 2374 mp[i].is = malloc(2); 2375 if (mp[i].in == NULL || mp[i].left == NULL || 2376 mp[i].right == NULL || mp[i].is == NULL) 2377 goto done; 2378 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 2379 mp[i].in[0] = NULL; 2380 } 2381#ifdef DEBUG 2382 fprintf(stderr, "dfamust:\n"); 2383 for (i = 0; i < dfa->tindex; ++i) 2384 { 2385 fprintf(stderr, " %d:", i); 2386 prtok(dfa->tokens[i]); 2387 } 2388 putc('\n', stderr); 2389#endif 2390 for (ri = 0; ri < dfa->tindex; ++ri) 2391 { 2392 switch (t = dfa->tokens[ri]) 2393 { 2394 case LPAREN: 2395 case RPAREN: 2396 goto done; /* "cannot happen" */ 2397 case EMPTY: 2398 case BEGLINE: 2399 case ENDLINE: 2400 case BEGWORD: 2401 case ENDWORD: 2402 case LIMWORD: 2403 case NOTLIMWORD: 2404 case BACKREF: 2405 resetmust(mp); 2406 break; 2407 case STAR: 2408 case QMARK: 2409 if (mp <= musts) 2410 goto done; /* "cannot happen" */ 2411 --mp; 2412 resetmust(mp); 2413 break; 2414 case OR: 2415 case ORTOP: 2416 if (mp < &musts[2]) 2417 goto done; /* "cannot happen" */ 2418 { 2419 char **new; 2420 must *lmp; 2421 must *rmp; 2422 int j, ln, rn, n; 2423 2424 rmp = --mp; 2425 lmp = --mp; 2426 /* Guaranteed to be. Unlikely, but. . . */ 2427 if (strcmp(lmp->is, rmp->is) != 0) 2428 lmp->is[0] = '\0'; 2429 /* Left side--easy */ 2430 i = 0; 2431 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 2432 ++i; 2433 lmp->left[i] = '\0'; 2434 /* Right side */ 2435 ln = strlen(lmp->right); 2436 rn = strlen(rmp->right); 2437 n = ln; 2438 if (n > rn) 2439 n = rn; 2440 for (i = 0; i < n; ++i) 2441 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 2442 break; 2443 for (j = 0; j < i; ++j) 2444 lmp->right[j] = lmp->right[(ln - i) + j]; 2445 lmp->right[j] = '\0'; 2446 new = inboth(lmp->in, rmp->in); 2447 if (new == NULL) 2448 goto done; 2449 freelist(lmp->in); 2450 free((char *) lmp->in); 2451 lmp->in = new; 2452 } 2453 break; 2454 case PLUS: 2455 if (mp <= musts) 2456 goto done; /* "cannot happen" */ 2457 --mp; 2458 mp->is[0] = '\0'; 2459 break; 2460 case END: 2461 if (mp != &musts[1]) 2462 goto done; /* "cannot happen" */ 2463 for (i = 0; musts[0].in[i] != NULL; ++i) 2464 if (strlen(musts[0].in[i]) > strlen(result)) 2465 result = musts[0].in[i]; 2466 if (strcmp(result, musts[0].is) == 0) 2467 exact = 1; 2468 goto done; 2469 case CAT: 2470 if (mp < &musts[2]) 2471 goto done; /* "cannot happen" */ 2472 { 2473 must *lmp; 2474 must *rmp; 2475 2476 rmp = --mp; 2477 lmp = --mp; 2478 /* In. Everything in left, plus everything in 2479 right, plus catenation of 2480 left's right and right's left. */ 2481 lmp->in = addlists(lmp->in, rmp->in); 2482 if (lmp->in == NULL) 2483 goto done; 2484 if (lmp->right[0] != '\0' && 2485 rmp->left[0] != '\0') 2486 { 2487 char *tp; 2488 2489 tp = icpyalloc(lmp->right); 2490 if (tp == NULL) 2491 goto done; 2492 tp = icatalloc(tp, rmp->left); 2493 if (tp == NULL) 2494 goto done; 2495 lmp->in = enlist(lmp->in, tp, 2496 strlen(tp)); 2497 free(tp); 2498 if (lmp->in == NULL) 2499 goto done; 2500 } 2501 /* Left-hand */ 2502 if (lmp->is[0] != '\0') 2503 { 2504 lmp->left = icatalloc(lmp->left, 2505 rmp->left); 2506 if (lmp->left == NULL) 2507 goto done; 2508 } 2509 /* Right-hand */ 2510 if (rmp->is[0] == '\0') 2511 lmp->right[0] = '\0'; 2512 lmp->right = icatalloc(lmp->right, rmp->right); 2513 if (lmp->right == NULL) 2514 goto done; 2515 /* Guaranteed to be */ 2516 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 2517 { 2518 lmp->is = icatalloc(lmp->is, rmp->is); 2519 if (lmp->is == NULL) 2520 goto done; 2521 } 2522 else 2523 lmp->is[0] = '\0'; 2524 } 2525 break; 2526 default: 2527 if (t < END) 2528 { 2529 /* "cannot happen" */ 2530 goto done; 2531 } 2532 else if (t == '\0') 2533 { 2534 /* not on *my* shift */ 2535 goto done; 2536 } 2537 else if (t >= CSET) 2538 { 2539 /* easy enough */ 2540 resetmust(mp); 2541 } 2542 else 2543 { 2544 /* plain character */ 2545 resetmust(mp); 2546 mp->is[0] = mp->left[0] = mp->right[0] = t; 2547 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 2548 mp->in = enlist(mp->in, mp->is, (size_t)1); 2549 if (mp->in == NULL) 2550 goto done; 2551 } 2552 break; 2553 } 2554#ifdef DEBUG 2555 fprintf(stderr, " node: %d:", ri); 2556 prtok(dfa->tokens[ri]); 2557 fprintf(stderr, "\n in:"); 2558 for (i = 0; mp->in[i]; ++i) 2559 fprintf(stderr, " \"%s\"", mp->in[i]); 2560 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 2561 fprintf(stderr, " left: \"%s\"\n", mp->left); 2562 fprintf(stderr, " right: \"%s\"\n", mp->right); 2563#endif 2564 ++mp; 2565 } 2566 done: 2567 if (strlen(result)) 2568 { 2569 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 2570 dm->exact = exact; 2571 dm->must = malloc(strlen(result) + 1); 2572 strcpy(dm->must, result); 2573 dm->next = dfa->musts; 2574 dfa->musts = dm; 2575 } 2576 mp = musts; 2577 for (i = 0; i <= dfa->tindex; ++i) 2578 { 2579 freelist(mp[i].in); 2580 ifree((char *) mp[i].in); 2581 ifree(mp[i].left); 2582 ifree(mp[i].right); 2583 ifree(mp[i].is); 2584 } 2585 free((char *) mp); 2586} 2587