dfa.c revision 146199
1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 146199 2005-05-14 03:02:22Z tjr $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#else 42#include <strings.h> 43#endif 44 45#if HAVE_SETLOCALE 46# include <locale.h> 47#endif 48 49#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 50/* We can handle multibyte string. */ 51# define MBS_SUPPORT 52#endif 53 54#ifdef MBS_SUPPORT 55# include <wchar.h> 56# include <wctype.h> 57#endif 58 59#ifndef DEBUG /* use the same approach as regex.c */ 60#undef assert 61#define assert(e) 62#endif /* DEBUG */ 63 64#ifndef isgraph 65#define isgraph(C) (isprint(C) && !isspace(C)) 66#endif 67 68#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 69#define ISALPHA(C) isalpha(C) 70#define ISUPPER(C) isupper(C) 71#define ISLOWER(C) islower(C) 72#define ISDIGIT(C) isdigit(C) 73#define ISXDIGIT(C) isxdigit(C) 74#define ISSPACE(C) isspace(C) 75#define ISPUNCT(C) ispunct(C) 76#define ISALNUM(C) isalnum(C) 77#define ISPRINT(C) isprint(C) 78#define ISGRAPH(C) isgraph(C) 79#define ISCNTRL(C) iscntrl(C) 80#else 81#define ISALPHA(C) (isascii(C) && isalpha(C)) 82#define ISUPPER(C) (isascii(C) && isupper(C)) 83#define ISLOWER(C) (isascii(C) && islower(C)) 84#define ISDIGIT(C) (isascii(C) && isdigit(C)) 85#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 86#define ISSPACE(C) (isascii(C) && isspace(C)) 87#define ISPUNCT(C) (isascii(C) && ispunct(C)) 88#define ISALNUM(C) (isascii(C) && isalnum(C)) 89#define ISPRINT(C) (isascii(C) && isprint(C)) 90#define ISGRAPH(C) (isascii(C) && isgraph(C)) 91#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 92#endif 93 94/* ISASCIIDIGIT differs from ISDIGIT, as follows: 95 - Its arg may be any int or unsigned int; it need not be an unsigned char. 96 - It's guaranteed to evaluate its argument exactly once. 97 - It's typically faster. 98 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 99 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 100 it's important to use the locale's definition of `digit' even when the 101 host does not conform to Posix. */ 102#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 103 104/* If we (don't) have I18N. */ 105/* glibc defines _ */ 106#ifndef _ 107# ifdef HAVE_LIBINTL_H 108# include <libintl.h> 109# ifndef _ 110# define _(Str) gettext (Str) 111# endif 112# else 113# define _(Str) (Str) 114# endif 115#endif 116 117#include "regex.h" 118#include "dfa.h" 119#include "hard-locale.h" 120 121/* HPUX, define those as macros in sys/param.h */ 122#ifdef setbit 123# undef setbit 124#endif 125#ifdef clrbit 126# undef clrbit 127#endif 128 129static void dfamust PARAMS ((struct dfa *dfa)); 130static void regexp PARAMS ((int toplevel)); 131 132static ptr_t 133xcalloc (size_t n, size_t s) 134{ 135 ptr_t r = calloc(n, s); 136 137 if (!r) 138 dfaerror(_("Memory exhausted")); 139 return r; 140} 141 142static ptr_t 143xmalloc (size_t n) 144{ 145 ptr_t r = malloc(n); 146 147 assert(n != 0); 148 if (!r) 149 dfaerror(_("Memory exhausted")); 150 return r; 151} 152 153static ptr_t 154xrealloc (ptr_t p, size_t n) 155{ 156 ptr_t r = realloc(p, n); 157 158 assert(n != 0); 159 if (!r) 160 dfaerror(_("Memory exhausted")); 161 return r; 162} 163 164#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 165#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 166#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 167 168/* Reallocate an array of type t if nalloc is too small for index. */ 169#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 170 if ((index) >= (nalloc)) \ 171 { \ 172 do \ 173 (nalloc) *= 2; \ 174 while ((index) >= (nalloc)); \ 175 REALLOC(p, t, nalloc); \ 176 } 177 178#ifdef DEBUG 179 180static void 181prtok (token t) 182{ 183 char const *s; 184 185 if (t < 0) 186 fprintf(stderr, "END"); 187 else if (t < NOTCHAR) 188 fprintf(stderr, "%c", t); 189 else 190 { 191 switch (t) 192 { 193 case EMPTY: s = "EMPTY"; break; 194 case BACKREF: s = "BACKREF"; break; 195 case BEGLINE: s = "BEGLINE"; break; 196 case ENDLINE: s = "ENDLINE"; break; 197 case BEGWORD: s = "BEGWORD"; break; 198 case ENDWORD: s = "ENDWORD"; break; 199 case LIMWORD: s = "LIMWORD"; break; 200 case NOTLIMWORD: s = "NOTLIMWORD"; break; 201 case QMARK: s = "QMARK"; break; 202 case STAR: s = "STAR"; break; 203 case PLUS: s = "PLUS"; break; 204 case CAT: s = "CAT"; break; 205 case OR: s = "OR"; break; 206 case ORTOP: s = "ORTOP"; break; 207 case LPAREN: s = "LPAREN"; break; 208 case RPAREN: s = "RPAREN"; break; 209 case CRANGE: s = "CRANGE"; break; 210#ifdef MBS_SUPPORT 211 case ANYCHAR: s = "ANYCHAR"; break; 212 case MBCSET: s = "MBCSET"; break; 213#endif /* MBS_SUPPORT */ 214 default: s = "CSET"; break; 215 } 216 fprintf(stderr, "%s", s); 217 } 218} 219#endif /* DEBUG */ 220 221/* Stuff pertaining to charclasses. */ 222 223static int 224tstbit (unsigned b, charclass c) 225{ 226 return c[b / INTBITS] & 1 << b % INTBITS; 227} 228 229static void 230setbit (unsigned b, charclass c) 231{ 232 c[b / INTBITS] |= 1 << b % INTBITS; 233} 234 235static void 236clrbit (unsigned b, charclass c) 237{ 238 c[b / INTBITS] &= ~(1 << b % INTBITS); 239} 240 241static void 242copyset (charclass src, charclass dst) 243{ 244 memcpy (dst, src, sizeof (charclass)); 245} 246 247static void 248zeroset (charclass s) 249{ 250 memset (s, 0, sizeof (charclass)); 251} 252 253static void 254notset (charclass s) 255{ 256 int i; 257 258 for (i = 0; i < CHARCLASS_INTS; ++i) 259 s[i] = ~s[i]; 260} 261 262static int 263equal (charclass s1, charclass s2) 264{ 265 return memcmp (s1, s2, sizeof (charclass)) == 0; 266} 267 268/* A pointer to the current dfa is kept here during parsing. */ 269static struct dfa *dfa; 270 271/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 272static int 273charclass_index (charclass s) 274{ 275 int i; 276 277 for (i = 0; i < dfa->cindex; ++i) 278 if (equal(s, dfa->charclasses[i])) 279 return i; 280 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 281 ++dfa->cindex; 282 copyset(s, dfa->charclasses[i]); 283 return i; 284} 285 286/* Syntax bits controlling the behavior of the lexical analyzer. */ 287static reg_syntax_t syntax_bits, syntax_bits_set; 288 289/* Flag for case-folding letters into sets. */ 290static int case_fold; 291 292/* End-of-line byte in data. */ 293static unsigned char eolbyte; 294 295/* Entry point to set syntax options. */ 296void 297dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 298{ 299 syntax_bits_set = 1; 300 syntax_bits = bits; 301 case_fold = fold; 302 eolbyte = eol; 303} 304 305/* Like setbit, but if case is folded, set both cases of a letter. */ 306static void 307setbit_case_fold (unsigned b, charclass c) 308{ 309 setbit (b, c); 310 if (case_fold) 311 { 312 if (ISUPPER (b)) 313 setbit (tolower (b), c); 314 else if (ISLOWER (b)) 315 setbit (toupper (b), c); 316 } 317} 318 319/* Lexical analyzer. All the dross that deals with the obnoxious 320 GNU Regex syntax bits is located here. The poor, suffering 321 reader is referred to the GNU Regex documentation for the 322 meaning of the @#%!@#%^!@ syntax bits. */ 323 324static char const *lexstart; /* Pointer to beginning of input string. */ 325static char const *lexptr; /* Pointer to next input character. */ 326static int lexleft; /* Number of characters remaining. */ 327static token lasttok; /* Previous token returned; initially END. */ 328static int laststart; /* True if we're separated from beginning or (, | 329 only by zero-width characters. */ 330static int parens; /* Count of outstanding left parens. */ 331static int minrep, maxrep; /* Repeat counts for {m,n}. */ 332static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 333 334#ifdef MBS_SUPPORT 335/* These variables are used only if (MB_CUR_MAX > 1). */ 336static mbstate_t mbs; /* Mbstate for mbrlen(). */ 337static int cur_mb_len; /* Byte length of the current scanning 338 multibyte character. */ 339static int cur_mb_index; /* Byte index of the current scanning multibyte 340 character. 341 342 singlebyte character : cur_mb_index = 0 343 multibyte character 344 1st byte : cur_mb_index = 1 345 2nd byte : cur_mb_index = 2 346 ... 347 nth byte : cur_mb_index = n */ 348static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 349 Each element store the amount of remain 350 byte of corresponding multibyte character 351 in the input string. A element's value 352 is 0 if corresponding character is a 353 singlebyte chracter. 354 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 355 mblen_buf : 0, 3, 2, 1 356 */ 357static wchar_t *inputwcs; /* Wide character representation of input 358 string in dfaexec(). 359 The length of this array is same as 360 the length of input string(char array). 361 inputstring[i] is a single-byte char, 362 or 1st byte of a multibyte char. 363 And inputwcs[i] is the codepoint. */ 364static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 365static unsigned char const *buf_end; /* refference to end in dfaexec(). */ 366#endif /* MBS_SUPPORT */ 367 368#ifdef MBS_SUPPORT 369/* This function update cur_mb_len, and cur_mb_index. 370 p points current lexptr, len is the remaining buffer length. */ 371static void 372update_mb_len_index (unsigned char const *p, int len) 373{ 374 /* If last character is a part of a multibyte character, 375 we update cur_mb_index. */ 376 if (cur_mb_index) 377 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 378 : cur_mb_index + 1; 379 380 /* If last character is a single byte character, or the 381 last portion of a multibyte character, we check whether 382 next character is a multibyte character or not. */ 383 if (! cur_mb_index) 384 { 385 cur_mb_len = mbrlen(p, len, &mbs); 386 if (cur_mb_len > 1) 387 /* It is a multibyte character. 388 cur_mb_len was already set by mbrlen(). */ 389 cur_mb_index = 1; 390 else if (cur_mb_len < 1) 391 /* Invalid sequence. We treat it as a singlebyte character. 392 cur_mb_index is aleady 0. */ 393 cur_mb_len = 1; 394 /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 395 cur_mb_index is aleady 0. */ 396 } 397} 398#endif /* MBS_SUPPORT */ 399 400#ifdef MBS_SUPPORT 401/* Note that characters become unsigned here. */ 402# define FETCH(c, eoferr) \ 403 { \ 404 if (! lexleft) \ 405 { \ 406 if (eoferr != 0) \ 407 dfaerror (eoferr); \ 408 else \ 409 return lasttok = END; \ 410 } \ 411 if (MB_CUR_MAX > 1) \ 412 update_mb_len_index(lexptr, lexleft); \ 413 (c) = (unsigned char) *lexptr++; \ 414 --lexleft; \ 415 } 416 417/* This function fetch a wide character, and update cur_mb_len, 418 used only if the current locale is a multibyte environment. */ 419static wint_t 420fetch_wc (char const *eoferr) 421{ 422 wchar_t wc; 423 if (! lexleft) 424 { 425 if (eoferr != 0) 426 dfaerror (eoferr); 427 else 428 return WEOF; 429 } 430 431 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 432 if (cur_mb_len <= 0) 433 { 434 cur_mb_len = 1; 435 wc = *lexptr; 436 } 437 lexptr += cur_mb_len; 438 lexleft -= cur_mb_len; 439 return wc; 440} 441#else 442/* Note that characters become unsigned here. */ 443# define FETCH(c, eoferr) \ 444 { \ 445 if (! lexleft) \ 446 { \ 447 if (eoferr != 0) \ 448 dfaerror (eoferr); \ 449 else \ 450 return lasttok = END; \ 451 } \ 452 (c) = (unsigned char) *lexptr++; \ 453 --lexleft; \ 454 } 455#endif /* MBS_SUPPORT */ 456 457#ifdef MBS_SUPPORT 458/* Multibyte character handling sub-routin for lex. 459 This function parse a bracket expression and build a struct 460 mb_char_classes. */ 461static void 462parse_bracket_exp_mb () 463{ 464 wint_t wc, wc1, wc2; 465 466 /* Work area to build a mb_char_classes. */ 467 struct mb_char_classes *work_mbc; 468 int chars_al, range_sts_al, range_ends_al, ch_classes_al, 469 equivs_al, coll_elems_al; 470 471 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 472 dfa->mbcsets_alloc, dfa->nmbcsets + 1); 473 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 474 We will update dfa->multibyte_prop in addtok(), because we can't 475 decide the index in dfa->tokens[]. */ 476 477 /* Initialize work are */ 478 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 479 480 chars_al = 1; 481 range_sts_al = range_ends_al = 0; 482 ch_classes_al = equivs_al = coll_elems_al = 0; 483 MALLOC(work_mbc->chars, wchar_t, chars_al); 484 485 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 486 work_mbc->nequivs = work_mbc->ncoll_elems = 0; 487 work_mbc->chars = work_mbc->ch_classes = NULL; 488 work_mbc->range_sts = work_mbc->range_ends = NULL; 489 work_mbc->equivs = work_mbc->coll_elems = NULL; 490 491 wc = fetch_wc(_("Unbalanced [")); 492 if (wc == L'^') 493 { 494 wc = fetch_wc(_("Unbalanced [")); 495 work_mbc->invert = 1; 496 } 497 else 498 work_mbc->invert = 0; 499 do 500 { 501 wc1 = WEOF; /* mark wc1 is not initialized". */ 502 503 /* Note that if we're looking at some other [:...:] construct, 504 we just treat it as a bunch of ordinary characters. We can do 505 this because we assume regex has checked for syntax errors before 506 dfa is ever called. */ 507 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 508 { 509#define BRACKET_BUFFER_SIZE 128 510 char str[BRACKET_BUFFER_SIZE]; 511 wc1 = wc; 512 wc = fetch_wc(_("Unbalanced [")); 513 514 /* If pattern contains `[[:', `[[.', or `[[='. */ 515 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 516 { 517 unsigned char c; 518 unsigned char delim = (unsigned char)wc; 519 int len = 0; 520 for (;;) 521 { 522 if (! lexleft) 523 dfaerror (_("Unbalanced [")); 524 c = (unsigned char) *lexptr++; 525 --lexleft; 526 527 if ((c == delim && *lexptr == ']') || lexleft == 0) 528 break; 529 if (len < BRACKET_BUFFER_SIZE) 530 str[len++] = c; 531 else 532 /* This is in any case an invalid class name. */ 533 str[0] = '\0'; 534 } 535 str[len] = '\0'; 536 537 if (lexleft == 0) 538 { 539 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 540 work_mbc->nchars + 2); 541 work_mbc->chars[work_mbc->nchars++] = L'['; 542 work_mbc->chars[work_mbc->nchars++] = delim; 543 break; 544 } 545 546 if (--lexleft, *lexptr++ != ']') 547 dfaerror (_("Unbalanced [")); 548 if (delim == ':') 549 /* build character class. */ 550 { 551 wctype_t wt; 552 /* Query the character class as wctype_t. */ 553 wt = wctype (str); 554 555 if (ch_classes_al == 0) 556 MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); 557 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 558 ch_classes_al, 559 work_mbc->nch_classes + 1); 560 work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 561 562 } 563 else if (delim == '=' || delim == '.') 564 { 565 char *elem; 566 MALLOC(elem, char, len + 1); 567 strncpy(elem, str, len + 1); 568 569 if (delim == '=') 570 /* build equivalent class. */ 571 { 572 if (equivs_al == 0) 573 MALLOC(work_mbc->equivs, char*, ++equivs_al); 574 REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 575 equivs_al, 576 work_mbc->nequivs + 1); 577 work_mbc->equivs[work_mbc->nequivs++] = elem; 578 } 579 580 if (delim == '.') 581 /* build collating element. */ 582 { 583 if (coll_elems_al == 0) 584 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 585 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 586 coll_elems_al, 587 work_mbc->ncoll_elems + 1); 588 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 589 } 590 } 591 wc1 = wc = WEOF; 592 } 593 else 594 /* We treat '[' as a normal character here. */ 595 { 596 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 597 } 598 } 599 else 600 { 601 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 602 wc = fetch_wc(("Unbalanced [")); 603 } 604 605 if (wc1 == WEOF) 606 wc1 = fetch_wc(_("Unbalanced [")); 607 608 if (wc1 == L'-') 609 /* build range characters. */ 610 { 611 wc2 = fetch_wc(_("Unbalanced [")); 612 if (wc2 == L']') 613 { 614 /* In the case [x-], the - is an ordinary hyphen, 615 which is left in c1, the lookahead character. */ 616 lexptr -= cur_mb_len; 617 lexleft += cur_mb_len; 618 wc2 = wc; 619 } 620 else 621 { 622 if (wc2 == L'\\' 623 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 624 wc2 = fetch_wc(_("Unbalanced [")); 625 wc1 = fetch_wc(_("Unbalanced [")); 626 } 627 628 if (range_sts_al == 0) 629 { 630 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 631 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 632 } 633 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 634 range_sts_al, work_mbc->nranges + 1); 635 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; 636 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 637 range_ends_al, work_mbc->nranges + 1); 638 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; 639 } 640 else if (wc != WEOF) 641 /* build normal characters. */ 642 { 643 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 644 work_mbc->nchars + 1); 645 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; 646 } 647 } 648 while ((wc = wc1) != L']'); 649} 650#endif /* MBS_SUPPORT */ 651 652#ifdef __STDC__ 653#define FUNC(F, P) static int F(int c) { return P(c); } 654#else 655#define FUNC(F, P) static int F(c) int c; { return P(c); } 656#endif 657 658FUNC(is_alpha, ISALPHA) 659FUNC(is_upper, ISUPPER) 660FUNC(is_lower, ISLOWER) 661FUNC(is_digit, ISDIGIT) 662FUNC(is_xdigit, ISXDIGIT) 663FUNC(is_space, ISSPACE) 664FUNC(is_punct, ISPUNCT) 665FUNC(is_alnum, ISALNUM) 666FUNC(is_print, ISPRINT) 667FUNC(is_graph, ISGRAPH) 668FUNC(is_cntrl, ISCNTRL) 669 670static int 671is_blank (int c) 672{ 673 return (c == ' ' || c == '\t'); 674} 675 676/* The following list maps the names of the Posix named character classes 677 to predicate functions that determine whether a given character is in 678 the class. The leading [ has already been eaten by the lexical analyzer. */ 679static struct { 680 const char *name; 681 int (*pred) PARAMS ((int)); 682} const prednames[] = { 683 { ":alpha:]", is_alpha }, 684 { ":upper:]", is_upper }, 685 { ":lower:]", is_lower }, 686 { ":digit:]", is_digit }, 687 { ":xdigit:]", is_xdigit }, 688 { ":space:]", is_space }, 689 { ":punct:]", is_punct }, 690 { ":alnum:]", is_alnum }, 691 { ":print:]", is_print }, 692 { ":graph:]", is_graph }, 693 { ":cntrl:]", is_cntrl }, 694 { ":blank:]", is_blank }, 695 { 0 } 696}; 697 698/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 699#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 700 701static int 702looking_at (char const *s) 703{ 704 size_t len; 705 706 len = strlen(s); 707 if (lexleft < len) 708 return 0; 709 return strncmp(s, lexptr, len) == 0; 710} 711 712static token 713lex (void) 714{ 715 unsigned c, c1, c2; 716 int backslash = 0, invert; 717 charclass ccl; 718 int i; 719 720 /* Basic plan: We fetch a character. If it's a backslash, 721 we set the backslash flag and go through the loop again. 722 On the plus side, this avoids having a duplicate of the 723 main switch inside the backslash case. On the minus side, 724 it means that just about every case begins with 725 "if (backslash) ...". */ 726 for (i = 0; i < 2; ++i) 727 { 728 FETCH(c, 0); 729#ifdef MBS_SUPPORT 730 if (MB_CUR_MAX > 1 && cur_mb_index) 731 /* If this is a part of a multi-byte character, we must treat 732 this byte data as a normal character. 733 e.g. In case of SJIS encoding, some character contains '\', 734 but they must not be backslash. */ 735 goto normal_char; 736#endif /* MBS_SUPPORT */ 737 switch (c) 738 { 739 case '\\': 740 if (backslash) 741 goto normal_char; 742 if (lexleft == 0) 743 dfaerror(_("Unfinished \\ escape")); 744 backslash = 1; 745 break; 746 747 case '^': 748 if (backslash) 749 goto normal_char; 750 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 751 || lasttok == END 752 || lasttok == LPAREN 753 || lasttok == OR) 754 return lasttok = BEGLINE; 755 goto normal_char; 756 757 case '$': 758 if (backslash) 759 goto normal_char; 760 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 761 || lexleft == 0 762 || (syntax_bits & RE_NO_BK_PARENS 763 ? lexleft > 0 && *lexptr == ')' 764 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 765 || (syntax_bits & RE_NO_BK_VBAR 766 ? lexleft > 0 && *lexptr == '|' 767 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 768 || ((syntax_bits & RE_NEWLINE_ALT) 769 && lexleft > 0 && *lexptr == '\n')) 770 return lasttok = ENDLINE; 771 goto normal_char; 772 773 case '1': 774 case '2': 775 case '3': 776 case '4': 777 case '5': 778 case '6': 779 case '7': 780 case '8': 781 case '9': 782 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 783 { 784 laststart = 0; 785 return lasttok = BACKREF; 786 } 787 goto normal_char; 788 789 case '`': 790 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 791 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 792 goto normal_char; 793 794 case '\'': 795 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 796 return lasttok = ENDLINE; /* FIXME: should be end of string */ 797 goto normal_char; 798 799 case '<': 800 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 801 return lasttok = BEGWORD; 802 goto normal_char; 803 804 case '>': 805 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 806 return lasttok = ENDWORD; 807 goto normal_char; 808 809 case 'b': 810 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 811 return lasttok = LIMWORD; 812 goto normal_char; 813 814 case 'B': 815 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 816 return lasttok = NOTLIMWORD; 817 goto normal_char; 818 819 case '?': 820 if (syntax_bits & RE_LIMITED_OPS) 821 goto normal_char; 822 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 823 goto normal_char; 824 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 825 goto normal_char; 826 return lasttok = QMARK; 827 828 case '*': 829 if (backslash) 830 goto normal_char; 831 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 832 goto normal_char; 833 return lasttok = STAR; 834 835 case '+': 836 if (syntax_bits & RE_LIMITED_OPS) 837 goto normal_char; 838 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 839 goto normal_char; 840 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 841 goto normal_char; 842 return lasttok = PLUS; 843 844 case '{': 845 if (!(syntax_bits & RE_INTERVALS)) 846 goto normal_char; 847 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 848 goto normal_char; 849 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 850 goto normal_char; 851 852 if (syntax_bits & RE_NO_BK_BRACES) 853 { 854 /* Scan ahead for a valid interval; if it's not valid, 855 treat it as a literal '{'. */ 856 int lo = -1, hi = -1; 857 char const *p = lexptr; 858 char const *lim = p + lexleft; 859 for (; p != lim && ISASCIIDIGIT (*p); p++) 860 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 861 if (p != lim && *p == ',') 862 while (++p != lim && ISASCIIDIGIT (*p)) 863 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 864 else 865 hi = lo; 866 if (p == lim || *p != '}' 867 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 868 goto normal_char; 869 } 870 871 minrep = 0; 872 /* Cases: 873 {M} - exact count 874 {M,} - minimum count, maximum is infinity 875 {M,N} - M through N */ 876 FETCH(c, _("unfinished repeat count")); 877 if (ISASCIIDIGIT (c)) 878 { 879 minrep = c - '0'; 880 for (;;) 881 { 882 FETCH(c, _("unfinished repeat count")); 883 if (! ISASCIIDIGIT (c)) 884 break; 885 minrep = 10 * minrep + c - '0'; 886 } 887 } 888 else 889 dfaerror(_("malformed repeat count")); 890 if (c == ',') 891 { 892 FETCH (c, _("unfinished repeat count")); 893 if (! ISASCIIDIGIT (c)) 894 maxrep = -1; 895 else 896 { 897 maxrep = c - '0'; 898 for (;;) 899 { 900 FETCH (c, _("unfinished repeat count")); 901 if (! ISASCIIDIGIT (c)) 902 break; 903 maxrep = 10 * maxrep + c - '0'; 904 } 905 if (0 <= maxrep && maxrep < minrep) 906 dfaerror (_("malformed repeat count")); 907 } 908 } 909 else 910 maxrep = minrep; 911 if (!(syntax_bits & RE_NO_BK_BRACES)) 912 { 913 if (c != '\\') 914 dfaerror(_("malformed repeat count")); 915 FETCH(c, _("unfinished repeat count")); 916 } 917 if (c != '}') 918 dfaerror(_("malformed repeat count")); 919 laststart = 0; 920 return lasttok = REPMN; 921 922 case '|': 923 if (syntax_bits & RE_LIMITED_OPS) 924 goto normal_char; 925 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 926 goto normal_char; 927 laststart = 1; 928 return lasttok = OR; 929 930 case '\n': 931 if (syntax_bits & RE_LIMITED_OPS 932 || backslash 933 || !(syntax_bits & RE_NEWLINE_ALT)) 934 goto normal_char; 935 laststart = 1; 936 return lasttok = OR; 937 938 case '(': 939 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 940 goto normal_char; 941 ++parens; 942 laststart = 1; 943 return lasttok = LPAREN; 944 945 case ')': 946 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 947 goto normal_char; 948 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 949 goto normal_char; 950 --parens; 951 laststart = 0; 952 return lasttok = RPAREN; 953 954 case '.': 955 if (backslash) 956 goto normal_char; 957#ifdef MBS_SUPPORT 958 if (MB_CUR_MAX > 1) 959 { 960 /* In multibyte environment period must match with a single 961 character not a byte. So we use ANYCHAR. */ 962 laststart = 0; 963 return lasttok = ANYCHAR; 964 } 965#endif /* MBS_SUPPORT */ 966 zeroset(ccl); 967 notset(ccl); 968 if (!(syntax_bits & RE_DOT_NEWLINE)) 969 clrbit(eolbyte, ccl); 970 if (syntax_bits & RE_DOT_NOT_NULL) 971 clrbit('\0', ccl); 972 laststart = 0; 973 return lasttok = CSET + charclass_index(ccl); 974 975 case 'w': 976 case 'W': 977 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 978 goto normal_char; 979 zeroset(ccl); 980 for (c2 = 0; c2 < NOTCHAR; ++c2) 981 if (IS_WORD_CONSTITUENT(c2)) 982 setbit(c2, ccl); 983 if (c == 'W') 984 notset(ccl); 985 laststart = 0; 986 return lasttok = CSET + charclass_index(ccl); 987 988 case '[': 989 if (backslash) 990 goto normal_char; 991 laststart = 0; 992#ifdef MBS_SUPPORT 993 if (MB_CUR_MAX > 1) 994 { 995 /* In multibyte environment a bracket expression may contain 996 multibyte characters, which must be treated as characters 997 (not bytes). So we parse it by parse_bracket_exp_mb(). */ 998 parse_bracket_exp_mb(); 999 return lasttok = MBCSET; 1000 } 1001#endif 1002 zeroset(ccl); 1003 FETCH(c, _("Unbalanced [")); 1004 if (c == '^') 1005 { 1006 FETCH(c, _("Unbalanced [")); 1007 invert = 1; 1008 } 1009 else 1010 invert = 0; 1011 do 1012 { 1013 /* Nobody ever said this had to be fast. :-) 1014 Note that if we're looking at some other [:...:] 1015 construct, we just treat it as a bunch of ordinary 1016 characters. We can do this because we assume 1017 regex has checked for syntax errors before 1018 dfa is ever called. */ 1019 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 1020 for (c1 = 0; prednames[c1].name; ++c1) 1021 if (looking_at(prednames[c1].name)) 1022 { 1023 int (*pred) PARAMS ((int)) = prednames[c1].pred; 1024 1025 for (c2 = 0; c2 < NOTCHAR; ++c2) 1026 if ((*pred)(c2)) 1027 setbit_case_fold (c2, ccl); 1028 lexptr += strlen(prednames[c1].name); 1029 lexleft -= strlen(prednames[c1].name); 1030 FETCH(c1, _("Unbalanced [")); 1031 goto skip; 1032 } 1033 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1034 FETCH(c, _("Unbalanced [")); 1035 FETCH(c1, _("Unbalanced [")); 1036 if (c1 == '-') 1037 { 1038 FETCH(c2, _("Unbalanced [")); 1039 if (c2 == ']') 1040 { 1041 /* In the case [x-], the - is an ordinary hyphen, 1042 which is left in c1, the lookahead character. */ 1043 --lexptr; 1044 ++lexleft; 1045 } 1046 else 1047 { 1048 if (c2 == '\\' 1049 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1050 FETCH(c2, _("Unbalanced [")); 1051 FETCH(c1, _("Unbalanced [")); 1052 if (!hard_LC_COLLATE) { 1053 for (; c <= c2; c++) 1054 setbit_case_fold (c, ccl); 1055 } else { 1056 /* POSIX locales are painful - leave the decision to libc */ 1057 char expr[6] = { '[', c, '-', c2, ']', '\0' }; 1058 regex_t re; 1059 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1060 for (c = 0; c < NOTCHAR; ++c) { 1061 char buf[2] = { c, '\0' }; 1062 regmatch_t mat; 1063 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1064 && mat.rm_so == 0 && mat.rm_eo == 1) 1065 setbit_case_fold (c, ccl); 1066 } 1067 regfree (&re); 1068 } 1069 } 1070 continue; 1071 } 1072 } 1073 1074 setbit_case_fold (c, ccl); 1075 1076 skip: 1077 ; 1078 } 1079 while ((c = c1) != ']'); 1080 if (invert) 1081 { 1082 notset(ccl); 1083 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1084 clrbit(eolbyte, ccl); 1085 } 1086 return lasttok = CSET + charclass_index(ccl); 1087 1088 default: 1089 normal_char: 1090 laststart = 0; 1091 if (case_fold && ISALPHA(c)) 1092 { 1093 zeroset(ccl); 1094 setbit_case_fold (c, ccl); 1095 return lasttok = CSET + charclass_index(ccl); 1096 } 1097 return c; 1098 } 1099 } 1100 1101 /* The above loop should consume at most a backslash 1102 and some other character. */ 1103 abort(); 1104 return END; /* keeps pedantic compilers happy. */ 1105} 1106 1107/* Recursive descent parser for regular expressions. */ 1108 1109static token tok; /* Lookahead token. */ 1110static int depth; /* Current depth of a hypothetical stack 1111 holding deferred productions. This is 1112 used to determine the depth that will be 1113 required of the real stack later on in 1114 dfaanalyze(). */ 1115 1116/* Add the given token to the parse tree, maintaining the depth count and 1117 updating the maximum depth if necessary. */ 1118static void 1119addtok (token t) 1120{ 1121#ifdef MBS_SUPPORT 1122 if (MB_CUR_MAX > 1) 1123 { 1124 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1125 dfa->tindex); 1126 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1127 if (t == MBCSET) 1128 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1129 else if (t < NOTCHAR) 1130 dfa->multibyte_prop[dfa->tindex] 1131 = (cur_mb_len == 1)? 3 /* single-byte char */ 1132 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1133 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1134 else 1135 /* It may be unnecesssary, but it is safer to treat other 1136 symbols as singlebyte characters. */ 1137 dfa->multibyte_prop[dfa->tindex] = 3; 1138 } 1139#endif 1140 1141 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 1142 dfa->tokens[dfa->tindex++] = t; 1143 1144 switch (t) 1145 { 1146 case QMARK: 1147 case STAR: 1148 case PLUS: 1149 break; 1150 1151 case CAT: 1152 case OR: 1153 case ORTOP: 1154 --depth; 1155 break; 1156 1157 default: 1158 ++dfa->nleaves; 1159 case EMPTY: 1160 ++depth; 1161 break; 1162 } 1163 if (depth > dfa->depth) 1164 dfa->depth = depth; 1165} 1166 1167/* The grammar understood by the parser is as follows. 1168 1169 regexp: 1170 regexp OR branch 1171 branch 1172 1173 branch: 1174 branch closure 1175 closure 1176 1177 closure: 1178 closure QMARK 1179 closure STAR 1180 closure PLUS 1181 closure REPMN 1182 atom 1183 1184 atom: 1185 <normal character> 1186 <multibyte character> 1187 ANYCHAR 1188 MBCSET 1189 CSET 1190 BACKREF 1191 BEGLINE 1192 ENDLINE 1193 BEGWORD 1194 ENDWORD 1195 LIMWORD 1196 NOTLIMWORD 1197 CRANGE 1198 LPAREN regexp RPAREN 1199 <empty> 1200 1201 The parser builds a parse tree in postfix form in an array of tokens. */ 1202 1203static void 1204atom (void) 1205{ 1206 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 1207 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1208#ifdef MBS_SUPPORT 1209 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1210#endif /* MBS_SUPPORT */ 1211 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 1212 { 1213 addtok(tok); 1214 tok = lex(); 1215#ifdef MBS_SUPPORT 1216 /* We treat a multibyte character as a single atom, so that DFA 1217 can treat a multibyte character as a single expression. 1218 1219 e.g. We construct following tree from "<mb1><mb2>". 1220 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1221 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1222 */ 1223 if (MB_CUR_MAX > 1) 1224 { 1225 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1226 { 1227 addtok(tok); 1228 addtok(CAT); 1229 tok = lex(); 1230 } 1231 } 1232#endif /* MBS_SUPPORT */ 1233 } 1234 else if (tok == CRANGE) 1235 { 1236 /* A character range like "[a-z]" in a locale other than "C" or 1237 "POSIX". This range might any sequence of one or more 1238 characters. Unfortunately the POSIX locale primitives give 1239 us no practical way to find what character sequences might be 1240 matched. Treat this approximately like "(.\1)" -- i.e. match 1241 one character, and then punt to the full matcher. */ 1242 charclass ccl; 1243 zeroset (ccl); 1244 notset (ccl); 1245 addtok (CSET + charclass_index (ccl)); 1246 addtok (BACKREF); 1247 addtok (CAT); 1248 tok = lex (); 1249 } 1250 else if (tok == LPAREN) 1251 { 1252 tok = lex(); 1253 regexp(0); 1254 if (tok != RPAREN) 1255 dfaerror(_("Unbalanced (")); 1256 tok = lex(); 1257 } 1258 else 1259 addtok(EMPTY); 1260} 1261 1262/* Return the number of tokens in the given subexpression. */ 1263static int 1264nsubtoks (int tindex) 1265{ 1266 int ntoks1; 1267 1268 switch (dfa->tokens[tindex - 1]) 1269 { 1270 default: 1271 return 1; 1272 case QMARK: 1273 case STAR: 1274 case PLUS: 1275 return 1 + nsubtoks(tindex - 1); 1276 case CAT: 1277 case OR: 1278 case ORTOP: 1279 ntoks1 = nsubtoks(tindex - 1); 1280 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 1281 } 1282} 1283 1284/* Copy the given subexpression to the top of the tree. */ 1285static void 1286copytoks (int tindex, int ntokens) 1287{ 1288 int i; 1289 1290 for (i = 0; i < ntokens; ++i) 1291 addtok(dfa->tokens[tindex + i]); 1292} 1293 1294static void 1295closure (void) 1296{ 1297 int tindex, ntokens, i; 1298 1299 atom(); 1300 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 1301 if (tok == REPMN) 1302 { 1303 ntokens = nsubtoks(dfa->tindex); 1304 tindex = dfa->tindex - ntokens; 1305 if (maxrep < 0) 1306 addtok(PLUS); 1307 if (minrep == 0) 1308 addtok(QMARK); 1309 for (i = 1; i < minrep; ++i) 1310 { 1311 copytoks(tindex, ntokens); 1312 addtok(CAT); 1313 } 1314 for (; i < maxrep; ++i) 1315 { 1316 copytoks(tindex, ntokens); 1317 addtok(QMARK); 1318 addtok(CAT); 1319 } 1320 tok = lex(); 1321 } 1322 else 1323 { 1324 addtok(tok); 1325 tok = lex(); 1326 } 1327} 1328 1329static void 1330branch (void) 1331{ 1332 closure(); 1333 while (tok != RPAREN && tok != OR && tok >= 0) 1334 { 1335 closure(); 1336 addtok(CAT); 1337 } 1338} 1339 1340static void 1341regexp (int toplevel) 1342{ 1343 branch(); 1344 while (tok == OR) 1345 { 1346 tok = lex(); 1347 branch(); 1348 if (toplevel) 1349 addtok(ORTOP); 1350 else 1351 addtok(OR); 1352 } 1353} 1354 1355/* Main entry point for the parser. S is a string to be parsed, len is the 1356 length of the string, so s can include NUL characters. D is a pointer to 1357 the struct dfa to parse into. */ 1358void 1359dfaparse (char const *s, size_t len, struct dfa *d) 1360{ 1361 dfa = d; 1362 lexstart = lexptr = s; 1363 lexleft = len; 1364 lasttok = END; 1365 laststart = 1; 1366 parens = 0; 1367 hard_LC_COLLATE = hard_locale (LC_COLLATE); 1368#ifdef MBS_SUPPORT 1369 if (MB_CUR_MAX > 1) 1370 { 1371 cur_mb_index = 0; 1372 cur_mb_len = 0; 1373 memset(&mbs, 0, sizeof(mbstate_t)); 1374 } 1375#endif /* MBS_SUPPORT */ 1376 1377 if (! syntax_bits_set) 1378 dfaerror(_("No syntax specified")); 1379 1380 tok = lex(); 1381 depth = d->depth; 1382 1383 regexp(1); 1384 1385 if (tok != END) 1386 dfaerror(_("Unbalanced )")); 1387 1388 addtok(END - d->nregexps); 1389 addtok(CAT); 1390 1391 if (d->nregexps) 1392 addtok(ORTOP); 1393 1394 ++d->nregexps; 1395} 1396 1397/* Some primitives for operating on sets of positions. */ 1398 1399/* Copy one set to another; the destination must be large enough. */ 1400static void 1401copy (position_set const *src, position_set *dst) 1402{ 1403 int i; 1404 1405 for (i = 0; i < src->nelem; ++i) 1406 dst->elems[i] = src->elems[i]; 1407 dst->nelem = src->nelem; 1408} 1409 1410/* Insert a position in a set. Position sets are maintained in sorted 1411 order according to index. If position already exists in the set with 1412 the same index then their constraints are logically or'd together. 1413 S->elems must point to an array large enough to hold the resulting set. */ 1414static void 1415insert (position p, position_set *s) 1416{ 1417 int i; 1418 position t1, t2; 1419 1420 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1421 continue; 1422 if (i < s->nelem && p.index == s->elems[i].index) 1423 s->elems[i].constraint |= p.constraint; 1424 else 1425 { 1426 t1 = p; 1427 ++s->nelem; 1428 while (i < s->nelem) 1429 { 1430 t2 = s->elems[i]; 1431 s->elems[i++] = t1; 1432 t1 = t2; 1433 } 1434 } 1435} 1436 1437/* Merge two sets of positions into a third. The result is exactly as if 1438 the positions of both sets were inserted into an initially empty set. */ 1439static void 1440merge (position_set const *s1, position_set const *s2, position_set *m) 1441{ 1442 int i = 0, j = 0; 1443 1444 m->nelem = 0; 1445 while (i < s1->nelem && j < s2->nelem) 1446 if (s1->elems[i].index > s2->elems[j].index) 1447 m->elems[m->nelem++] = s1->elems[i++]; 1448 else if (s1->elems[i].index < s2->elems[j].index) 1449 m->elems[m->nelem++] = s2->elems[j++]; 1450 else 1451 { 1452 m->elems[m->nelem] = s1->elems[i++]; 1453 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1454 } 1455 while (i < s1->nelem) 1456 m->elems[m->nelem++] = s1->elems[i++]; 1457 while (j < s2->nelem) 1458 m->elems[m->nelem++] = s2->elems[j++]; 1459} 1460 1461/* Delete a position from a set. */ 1462static void 1463delete (position p, position_set *s) 1464{ 1465 int i; 1466 1467 for (i = 0; i < s->nelem; ++i) 1468 if (p.index == s->elems[i].index) 1469 break; 1470 if (i < s->nelem) 1471 for (--s->nelem; i < s->nelem; ++i) 1472 s->elems[i] = s->elems[i + 1]; 1473} 1474 1475/* Find the index of the state corresponding to the given position set with 1476 the given preceding context, or create a new state if there is no such 1477 state. Newline and letter tell whether we got here on a newline or 1478 letter, respectively. */ 1479static int 1480state_index (struct dfa *d, position_set const *s, int newline, int letter) 1481{ 1482 int hash = 0; 1483 int constraint; 1484 int i, j; 1485 1486 newline = newline ? 1 : 0; 1487 letter = letter ? 1 : 0; 1488 1489 for (i = 0; i < s->nelem; ++i) 1490 hash ^= s->elems[i].index + s->elems[i].constraint; 1491 1492 /* Try to find a state that exactly matches the proposed one. */ 1493 for (i = 0; i < d->sindex; ++i) 1494 { 1495 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1496 || newline != d->states[i].newline || letter != d->states[i].letter) 1497 continue; 1498 for (j = 0; j < s->nelem; ++j) 1499 if (s->elems[j].constraint 1500 != d->states[i].elems.elems[j].constraint 1501 || s->elems[j].index != d->states[i].elems.elems[j].index) 1502 break; 1503 if (j == s->nelem) 1504 return i; 1505 } 1506 1507 /* We'll have to create a new state. */ 1508 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1509 d->states[i].hash = hash; 1510 MALLOC(d->states[i].elems.elems, position, s->nelem); 1511 copy(s, &d->states[i].elems); 1512 d->states[i].newline = newline; 1513 d->states[i].letter = letter; 1514 d->states[i].backref = 0; 1515 d->states[i].constraint = 0; 1516 d->states[i].first_end = 0; 1517#ifdef MBS_SUPPORT 1518 if (MB_CUR_MAX > 1) 1519 d->states[i].mbps.nelem = 0; 1520#endif 1521 for (j = 0; j < s->nelem; ++j) 1522 if (d->tokens[s->elems[j].index] < 0) 1523 { 1524 constraint = s->elems[j].constraint; 1525 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1526 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1527 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1528 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1529 d->states[i].constraint |= constraint; 1530 if (! d->states[i].first_end) 1531 d->states[i].first_end = d->tokens[s->elems[j].index]; 1532 } 1533 else if (d->tokens[s->elems[j].index] == BACKREF) 1534 { 1535 d->states[i].constraint = NO_CONSTRAINT; 1536 d->states[i].backref = 1; 1537 } 1538 1539 ++d->sindex; 1540 1541 return i; 1542} 1543 1544/* Find the epsilon closure of a set of positions. If any position of the set 1545 contains a symbol that matches the empty string in some context, replace 1546 that position with the elements of its follow labeled with an appropriate 1547 constraint. Repeat exhaustively until no funny positions are left. 1548 S->elems must be large enough to hold the result. */ 1549static void 1550epsclosure (position_set *s, struct dfa const *d) 1551{ 1552 int i, j; 1553 int *visited; 1554 position p, old; 1555 1556 MALLOC(visited, int, d->tindex); 1557 for (i = 0; i < d->tindex; ++i) 1558 visited[i] = 0; 1559 1560 for (i = 0; i < s->nelem; ++i) 1561 if (d->tokens[s->elems[i].index] >= NOTCHAR 1562 && d->tokens[s->elems[i].index] != BACKREF 1563#ifdef MBS_SUPPORT 1564 && d->tokens[s->elems[i].index] != ANYCHAR 1565 && d->tokens[s->elems[i].index] != MBCSET 1566#endif 1567 && d->tokens[s->elems[i].index] < CSET) 1568 { 1569 old = s->elems[i]; 1570 p.constraint = old.constraint; 1571 delete(s->elems[i], s); 1572 if (visited[old.index]) 1573 { 1574 --i; 1575 continue; 1576 } 1577 visited[old.index] = 1; 1578 switch (d->tokens[old.index]) 1579 { 1580 case BEGLINE: 1581 p.constraint &= BEGLINE_CONSTRAINT; 1582 break; 1583 case ENDLINE: 1584 p.constraint &= ENDLINE_CONSTRAINT; 1585 break; 1586 case BEGWORD: 1587 p.constraint &= BEGWORD_CONSTRAINT; 1588 break; 1589 case ENDWORD: 1590 p.constraint &= ENDWORD_CONSTRAINT; 1591 break; 1592 case LIMWORD: 1593 p.constraint &= LIMWORD_CONSTRAINT; 1594 break; 1595 case NOTLIMWORD: 1596 p.constraint &= NOTLIMWORD_CONSTRAINT; 1597 break; 1598 default: 1599 break; 1600 } 1601 for (j = 0; j < d->follows[old.index].nelem; ++j) 1602 { 1603 p.index = d->follows[old.index].elems[j].index; 1604 insert(p, s); 1605 } 1606 /* Force rescan to start at the beginning. */ 1607 i = -1; 1608 } 1609 1610 free(visited); 1611} 1612 1613/* Perform bottom-up analysis on the parse tree, computing various functions. 1614 Note that at this point, we're pretending constructs like \< are real 1615 characters rather than constraints on what can follow them. 1616 1617 Nullable: A node is nullable if it is at the root of a regexp that can 1618 match the empty string. 1619 * EMPTY leaves are nullable. 1620 * No other leaf is nullable. 1621 * A QMARK or STAR node is nullable. 1622 * A PLUS node is nullable if its argument is nullable. 1623 * A CAT node is nullable if both its arguments are nullable. 1624 * An OR node is nullable if either argument is nullable. 1625 1626 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1627 that could correspond to the first character of a string matching the 1628 regexp rooted at the given node. 1629 * EMPTY leaves have empty firstpos. 1630 * The firstpos of a nonempty leaf is that leaf itself. 1631 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1632 argument. 1633 * The firstpos of a CAT node is the firstpos of the left argument, union 1634 the firstpos of the right if the left argument is nullable. 1635 * The firstpos of an OR node is the union of firstpos of each argument. 1636 1637 Lastpos: The lastpos of a node is the set of positions that could 1638 correspond to the last character of a string matching the regexp at 1639 the given node. 1640 * EMPTY leaves have empty lastpos. 1641 * The lastpos of a nonempty leaf is that leaf itself. 1642 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1643 argument. 1644 * The lastpos of a CAT node is the lastpos of its right argument, union 1645 the lastpos of the left if the right argument is nullable. 1646 * The lastpos of an OR node is the union of the lastpos of each argument. 1647 1648 Follow: The follow of a position is the set of positions that could 1649 correspond to the character following a character matching the node in 1650 a string matching the regexp. At this point we consider special symbols 1651 that match the empty string in some context to be just normal characters. 1652 Later, if we find that a special symbol is in a follow set, we will 1653 replace it with the elements of its follow, labeled with an appropriate 1654 constraint. 1655 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1656 the follow of every node in the lastpos. 1657 * Every node in the firstpos of the second argument of a CAT node is in 1658 the follow of every node in the lastpos of the first argument. 1659 1660 Because of the postfix representation of the parse tree, the depth-first 1661 analysis is conveniently done by a linear scan with the aid of a stack. 1662 Sets are stored as arrays of the elements, obeying a stack-like allocation 1663 scheme; the number of elements in each set deeper in the stack can be 1664 used to determine the address of a particular set's array. */ 1665void 1666dfaanalyze (struct dfa *d, int searchflag) 1667{ 1668 int *nullable; /* Nullable stack. */ 1669 int *nfirstpos; /* Element count stack for firstpos sets. */ 1670 position *firstpos; /* Array where firstpos elements are stored. */ 1671 int *nlastpos; /* Element count stack for lastpos sets. */ 1672 position *lastpos; /* Array where lastpos elements are stored. */ 1673 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1674 position_set tmp; /* Temporary set for merging sets. */ 1675 position_set merged; /* Result of merging sets. */ 1676 int wants_newline; /* True if some position wants newline info. */ 1677 int *o_nullable; 1678 int *o_nfirst, *o_nlast; 1679 position *o_firstpos, *o_lastpos; 1680 int i, j; 1681 position *pos; 1682 1683#ifdef DEBUG 1684 fprintf(stderr, "dfaanalyze:\n"); 1685 for (i = 0; i < d->tindex; ++i) 1686 { 1687 fprintf(stderr, " %d:", i); 1688 prtok(d->tokens[i]); 1689 } 1690 putc('\n', stderr); 1691#endif 1692 1693 d->searchflag = searchflag; 1694 1695 MALLOC(nullable, int, d->depth); 1696 o_nullable = nullable; 1697 MALLOC(nfirstpos, int, d->depth); 1698 o_nfirst = nfirstpos; 1699 MALLOC(firstpos, position, d->nleaves); 1700 o_firstpos = firstpos, firstpos += d->nleaves; 1701 MALLOC(nlastpos, int, d->depth); 1702 o_nlast = nlastpos; 1703 MALLOC(lastpos, position, d->nleaves); 1704 o_lastpos = lastpos, lastpos += d->nleaves; 1705 MALLOC(nalloc, int, d->tindex); 1706 for (i = 0; i < d->tindex; ++i) 1707 nalloc[i] = 0; 1708 MALLOC(merged.elems, position, d->nleaves); 1709 1710 CALLOC(d->follows, position_set, d->tindex); 1711 1712 for (i = 0; i < d->tindex; ++i) 1713#ifdef DEBUG 1714 { /* Nonsyntactic #ifdef goo... */ 1715#endif 1716 switch (d->tokens[i]) 1717 { 1718 case EMPTY: 1719 /* The empty set is nullable. */ 1720 *nullable++ = 1; 1721 1722 /* The firstpos and lastpos of the empty leaf are both empty. */ 1723 *nfirstpos++ = *nlastpos++ = 0; 1724 break; 1725 1726 case STAR: 1727 case PLUS: 1728 /* Every element in the firstpos of the argument is in the follow 1729 of every element in the lastpos. */ 1730 tmp.nelem = nfirstpos[-1]; 1731 tmp.elems = firstpos; 1732 pos = lastpos; 1733 for (j = 0; j < nlastpos[-1]; ++j) 1734 { 1735 merge(&tmp, &d->follows[pos[j].index], &merged); 1736 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1737 nalloc[pos[j].index], merged.nelem - 1); 1738 copy(&merged, &d->follows[pos[j].index]); 1739 } 1740 1741 case QMARK: 1742 /* A QMARK or STAR node is automatically nullable. */ 1743 if (d->tokens[i] != PLUS) 1744 nullable[-1] = 1; 1745 break; 1746 1747 case CAT: 1748 /* Every element in the firstpos of the second argument is in the 1749 follow of every element in the lastpos of the first argument. */ 1750 tmp.nelem = nfirstpos[-1]; 1751 tmp.elems = firstpos; 1752 pos = lastpos + nlastpos[-1]; 1753 for (j = 0; j < nlastpos[-2]; ++j) 1754 { 1755 merge(&tmp, &d->follows[pos[j].index], &merged); 1756 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1757 nalloc[pos[j].index], merged.nelem - 1); 1758 copy(&merged, &d->follows[pos[j].index]); 1759 } 1760 1761 /* The firstpos of a CAT node is the firstpos of the first argument, 1762 union that of the second argument if the first is nullable. */ 1763 if (nullable[-2]) 1764 nfirstpos[-2] += nfirstpos[-1]; 1765 else 1766 firstpos += nfirstpos[-1]; 1767 --nfirstpos; 1768 1769 /* The lastpos of a CAT node is the lastpos of the second argument, 1770 union that of the first argument if the second is nullable. */ 1771 if (nullable[-1]) 1772 nlastpos[-2] += nlastpos[-1]; 1773 else 1774 { 1775 pos = lastpos + nlastpos[-2]; 1776 for (j = nlastpos[-1] - 1; j >= 0; --j) 1777 pos[j] = lastpos[j]; 1778 lastpos += nlastpos[-2]; 1779 nlastpos[-2] = nlastpos[-1]; 1780 } 1781 --nlastpos; 1782 1783 /* A CAT node is nullable if both arguments are nullable. */ 1784 nullable[-2] = nullable[-1] && nullable[-2]; 1785 --nullable; 1786 break; 1787 1788 case OR: 1789 case ORTOP: 1790 /* The firstpos is the union of the firstpos of each argument. */ 1791 nfirstpos[-2] += nfirstpos[-1]; 1792 --nfirstpos; 1793 1794 /* The lastpos is the union of the lastpos of each argument. */ 1795 nlastpos[-2] += nlastpos[-1]; 1796 --nlastpos; 1797 1798 /* An OR node is nullable if either argument is nullable. */ 1799 nullable[-2] = nullable[-1] || nullable[-2]; 1800 --nullable; 1801 break; 1802 1803 default: 1804 /* Anything else is a nonempty position. (Note that special 1805 constructs like \< are treated as nonempty strings here; 1806 an "epsilon closure" effectively makes them nullable later. 1807 Backreferences have to get a real position so we can detect 1808 transitions on them later. But they are nullable. */ 1809 *nullable++ = d->tokens[i] == BACKREF; 1810 1811 /* This position is in its own firstpos and lastpos. */ 1812 *nfirstpos++ = *nlastpos++ = 1; 1813 --firstpos, --lastpos; 1814 firstpos->index = lastpos->index = i; 1815 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1816 1817 /* Allocate the follow set for this position. */ 1818 nalloc[i] = 1; 1819 MALLOC(d->follows[i].elems, position, nalloc[i]); 1820 break; 1821 } 1822#ifdef DEBUG 1823 /* ... balance the above nonsyntactic #ifdef goo... */ 1824 fprintf(stderr, "node %d:", i); 1825 prtok(d->tokens[i]); 1826 putc('\n', stderr); 1827 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1828 fprintf(stderr, " firstpos:"); 1829 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1830 { 1831 fprintf(stderr, " %d:", firstpos[j].index); 1832 prtok(d->tokens[firstpos[j].index]); 1833 } 1834 fprintf(stderr, "\n lastpos:"); 1835 for (j = nlastpos[-1] - 1; j >= 0; --j) 1836 { 1837 fprintf(stderr, " %d:", lastpos[j].index); 1838 prtok(d->tokens[lastpos[j].index]); 1839 } 1840 putc('\n', stderr); 1841 } 1842#endif 1843 1844 /* For each follow set that is the follow set of a real position, replace 1845 it with its epsilon closure. */ 1846 for (i = 0; i < d->tindex; ++i) 1847 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1848#ifdef MBS_SUPPORT 1849 || d->tokens[i] == ANYCHAR 1850 || d->tokens[i] == MBCSET 1851#endif 1852 || d->tokens[i] >= CSET) 1853 { 1854#ifdef DEBUG 1855 fprintf(stderr, "follows(%d:", i); 1856 prtok(d->tokens[i]); 1857 fprintf(stderr, "):"); 1858 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1859 { 1860 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1861 prtok(d->tokens[d->follows[i].elems[j].index]); 1862 } 1863 putc('\n', stderr); 1864#endif 1865 copy(&d->follows[i], &merged); 1866 epsclosure(&merged, d); 1867 if (d->follows[i].nelem < merged.nelem) 1868 REALLOC(d->follows[i].elems, position, merged.nelem); 1869 copy(&merged, &d->follows[i]); 1870 } 1871 1872 /* Get the epsilon closure of the firstpos of the regexp. The result will 1873 be the set of positions of state 0. */ 1874 merged.nelem = 0; 1875 for (i = 0; i < nfirstpos[-1]; ++i) 1876 insert(firstpos[i], &merged); 1877 epsclosure(&merged, d); 1878 1879 /* Check if any of the positions of state 0 will want newline context. */ 1880 wants_newline = 0; 1881 for (i = 0; i < merged.nelem; ++i) 1882 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1883 wants_newline = 1; 1884 1885 /* Build the initial state. */ 1886 d->salloc = 1; 1887 d->sindex = 0; 1888 MALLOC(d->states, dfa_state, d->salloc); 1889 state_index(d, &merged, wants_newline, 0); 1890 1891 free(o_nullable); 1892 free(o_nfirst); 1893 free(o_firstpos); 1894 free(o_nlast); 1895 free(o_lastpos); 1896 free(nalloc); 1897 free(merged.elems); 1898} 1899 1900/* Find, for each character, the transition out of state s of d, and store 1901 it in the appropriate slot of trans. 1902 1903 We divide the positions of s into groups (positions can appear in more 1904 than one group). Each group is labeled with a set of characters that 1905 every position in the group matches (taking into account, if necessary, 1906 preceding context information of s). For each group, find the union 1907 of the its elements' follows. This set is the set of positions of the 1908 new state. For each character in the group's label, set the transition 1909 on this character to be to a state corresponding to the set's positions, 1910 and its associated backward context information, if necessary. 1911 1912 If we are building a searching matcher, we include the positions of state 1913 0 in every state. 1914 1915 The collection of groups is constructed by building an equivalence-class 1916 partition of the positions of s. 1917 1918 For each position, find the set of characters C that it matches. Eliminate 1919 any characters from C that fail on grounds of backward context. 1920 1921 Search through the groups, looking for a group whose label L has nonempty 1922 intersection with C. If L - C is nonempty, create a new group labeled 1923 L - C and having the same positions as the current group, and set L to 1924 the intersection of L and C. Insert the position in this group, set 1925 C = C - L, and resume scanning. 1926 1927 If after comparing with every group there are characters remaining in C, 1928 create a new group labeled with the characters of C and insert this 1929 position in that group. */ 1930void 1931dfastate (int s, struct dfa *d, int trans[]) 1932{ 1933 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1934 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1935 int ngrps = 0; /* Number of groups actually used. */ 1936 position pos; /* Current position being considered. */ 1937 charclass matches; /* Set of matching characters. */ 1938 int matchesf; /* True if matches is nonempty. */ 1939 charclass intersect; /* Intersection with some label set. */ 1940 int intersectf; /* True if intersect is nonempty. */ 1941 charclass leftovers; /* Stuff in the label that didn't match. */ 1942 int leftoversf; /* True if leftovers is nonempty. */ 1943 static charclass letters; /* Set of characters considered letters. */ 1944 static charclass newline; /* Set of characters that aren't newline. */ 1945 position_set follows; /* Union of the follows of some group. */ 1946 position_set tmp; /* Temporary space for merging sets. */ 1947 int state; /* New state. */ 1948 int wants_newline; /* New state wants to know newline context. */ 1949 int state_newline; /* New state on a newline transition. */ 1950 int wants_letter; /* New state wants to know letter context. */ 1951 int state_letter; /* New state on a letter transition. */ 1952 static int initialized; /* Flag for static initialization. */ 1953#ifdef MBS_SUPPORT 1954 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1955#endif 1956 int i, j, k; 1957 1958 /* Initialize the set of letters, if necessary. */ 1959 if (! initialized) 1960 { 1961 initialized = 1; 1962 for (i = 0; i < NOTCHAR; ++i) 1963 if (IS_WORD_CONSTITUENT(i)) 1964 setbit(i, letters); 1965 setbit(eolbyte, newline); 1966 } 1967 1968 zeroset(matches); 1969 1970 for (i = 0; i < d->states[s].elems.nelem; ++i) 1971 { 1972 pos = d->states[s].elems.elems[i]; 1973 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1974 setbit(d->tokens[pos.index], matches); 1975 else if (d->tokens[pos.index] >= CSET) 1976 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1977#ifdef MBS_SUPPORT 1978 else if (d->tokens[pos.index] == ANYCHAR 1979 || d->tokens[pos.index] == MBCSET) 1980 /* MB_CUR_MAX > 1 */ 1981 { 1982 /* ANYCHAR and MBCSET must match with a single character, so we 1983 must put it to d->states[s].mbps, which contains the positions 1984 which can match with a single character not a byte. */ 1985 if (d->states[s].mbps.nelem == 0) 1986 { 1987 MALLOC(d->states[s].mbps.elems, position, 1988 d->states[s].elems.nelem); 1989 } 1990 insert(pos, &(d->states[s].mbps)); 1991 continue; 1992 } 1993#endif /* MBS_SUPPORT */ 1994 else 1995 continue; 1996 1997 /* Some characters may need to be eliminated from matches because 1998 they fail in the current context. */ 1999 if (pos.constraint != 0xFF) 2000 { 2001 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2002 d->states[s].newline, 1)) 2003 clrbit(eolbyte, matches); 2004 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2005 d->states[s].newline, 0)) 2006 for (j = 0; j < CHARCLASS_INTS; ++j) 2007 matches[j] &= newline[j]; 2008 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2009 d->states[s].letter, 1)) 2010 for (j = 0; j < CHARCLASS_INTS; ++j) 2011 matches[j] &= ~letters[j]; 2012 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2013 d->states[s].letter, 0)) 2014 for (j = 0; j < CHARCLASS_INTS; ++j) 2015 matches[j] &= letters[j]; 2016 2017 /* If there are no characters left, there's no point in going on. */ 2018 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 2019 continue; 2020 if (j == CHARCLASS_INTS) 2021 continue; 2022 } 2023 2024 for (j = 0; j < ngrps; ++j) 2025 { 2026 /* If matches contains a single character only, and the current 2027 group's label doesn't contain that character, go on to the 2028 next group. */ 2029 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 2030 && !tstbit(d->tokens[pos.index], labels[j])) 2031 continue; 2032 2033 /* Check if this group's label has a nonempty intersection with 2034 matches. */ 2035 intersectf = 0; 2036 for (k = 0; k < CHARCLASS_INTS; ++k) 2037 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 2038 if (! intersectf) 2039 continue; 2040 2041 /* It does; now find the set differences both ways. */ 2042 leftoversf = matchesf = 0; 2043 for (k = 0; k < CHARCLASS_INTS; ++k) 2044 { 2045 /* Even an optimizing compiler can't know this for sure. */ 2046 int match = matches[k], label = labels[j][k]; 2047 2048 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 2049 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 2050 } 2051 2052 /* If there were leftovers, create a new group labeled with them. */ 2053 if (leftoversf) 2054 { 2055 copyset(leftovers, labels[ngrps]); 2056 copyset(intersect, labels[j]); 2057 MALLOC(grps[ngrps].elems, position, d->nleaves); 2058 copy(&grps[j], &grps[ngrps]); 2059 ++ngrps; 2060 } 2061 2062 /* Put the position in the current group. Note that there is no 2063 reason to call insert() here. */ 2064 grps[j].elems[grps[j].nelem++] = pos; 2065 2066 /* If every character matching the current position has been 2067 accounted for, we're done. */ 2068 if (! matchesf) 2069 break; 2070 } 2071 2072 /* If we've passed the last group, and there are still characters 2073 unaccounted for, then we'll have to create a new group. */ 2074 if (j == ngrps) 2075 { 2076 copyset(matches, labels[ngrps]); 2077 zeroset(matches); 2078 MALLOC(grps[ngrps].elems, position, d->nleaves); 2079 grps[ngrps].nelem = 1; 2080 grps[ngrps].elems[0] = pos; 2081 ++ngrps; 2082 } 2083 } 2084 2085 MALLOC(follows.elems, position, d->nleaves); 2086 MALLOC(tmp.elems, position, d->nleaves); 2087 2088 /* If we are a searching matcher, the default transition is to a state 2089 containing the positions of state 0, otherwise the default transition 2090 is to fail miserably. */ 2091 if (d->searchflag) 2092 { 2093 wants_newline = 0; 2094 wants_letter = 0; 2095 for (i = 0; i < d->states[0].elems.nelem; ++i) 2096 { 2097 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2098 wants_newline = 1; 2099 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2100 wants_letter = 1; 2101 } 2102 copy(&d->states[0].elems, &follows); 2103 state = state_index(d, &follows, 0, 0); 2104 if (wants_newline) 2105 state_newline = state_index(d, &follows, 1, 0); 2106 else 2107 state_newline = state; 2108 if (wants_letter) 2109 state_letter = state_index(d, &follows, 0, 1); 2110 else 2111 state_letter = state; 2112 for (i = 0; i < NOTCHAR; ++i) 2113 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 2114 trans[eolbyte] = state_newline; 2115 } 2116 else 2117 for (i = 0; i < NOTCHAR; ++i) 2118 trans[i] = -1; 2119 2120 for (i = 0; i < ngrps; ++i) 2121 { 2122 follows.nelem = 0; 2123 2124 /* Find the union of the follows of the positions of the group. 2125 This is a hideously inefficient loop. Fix it someday. */ 2126 for (j = 0; j < grps[i].nelem; ++j) 2127 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 2128 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 2129 2130#ifdef MBS_SUPPORT 2131 if (MB_CUR_MAX > 1) 2132 { 2133 /* If a token in follows.elems is not 1st byte of a multibyte 2134 character, or the states of follows must accept the bytes 2135 which are not 1st byte of the multibyte character. 2136 Then, if a state of follows encounter a byte, it must not be 2137 a 1st byte of a multibyte character nor singlebyte character. 2138 We cansel to add state[0].follows to next state, because 2139 state[0] must accept 1st-byte 2140 2141 For example, we assume <sb a> is a certain singlebyte 2142 character, <mb A> is a certain multibyte character, and the 2143 codepoint of <sb a> equals the 2nd byte of the codepoint of 2144 <mb A>. 2145 When state[0] accepts <sb a>, state[i] transit to state[i+1] 2146 by accepting accepts 1st byte of <mb A>, and state[i+1] 2147 accepts 2nd byte of <mb A>, if state[i+1] encounter the 2148 codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2149 <mb A>, so we can not add state[0]. */ 2150 2151 next_isnt_1st_byte = 0; 2152 for (j = 0; j < follows.nelem; ++j) 2153 { 2154 if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2155 { 2156 next_isnt_1st_byte = 1; 2157 break; 2158 } 2159 } 2160 } 2161#endif 2162 2163 /* If we are building a searching matcher, throw in the positions 2164 of state 0 as well. */ 2165#ifdef MBS_SUPPORT 2166 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2167#else 2168 if (d->searchflag) 2169#endif 2170 for (j = 0; j < d->states[0].elems.nelem; ++j) 2171 insert(d->states[0].elems.elems[j], &follows); 2172 2173 /* Find out if the new state will want any context information. */ 2174 wants_newline = 0; 2175 if (tstbit(eolbyte, labels[i])) 2176 for (j = 0; j < follows.nelem; ++j) 2177 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 2178 wants_newline = 1; 2179 2180 wants_letter = 0; 2181 for (j = 0; j < CHARCLASS_INTS; ++j) 2182 if (labels[i][j] & letters[j]) 2183 break; 2184 if (j < CHARCLASS_INTS) 2185 for (j = 0; j < follows.nelem; ++j) 2186 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 2187 wants_letter = 1; 2188 2189 /* Find the state(s) corresponding to the union of the follows. */ 2190 state = state_index(d, &follows, 0, 0); 2191 if (wants_newline) 2192 state_newline = state_index(d, &follows, 1, 0); 2193 else 2194 state_newline = state; 2195 if (wants_letter) 2196 state_letter = state_index(d, &follows, 0, 1); 2197 else 2198 state_letter = state; 2199 2200 /* Set the transitions for each character in the current label. */ 2201 for (j = 0; j < CHARCLASS_INTS; ++j) 2202 for (k = 0; k < INTBITS; ++k) 2203 if (labels[i][j] & 1 << k) 2204 { 2205 int c = j * INTBITS + k; 2206 2207 if (c == eolbyte) 2208 trans[c] = state_newline; 2209 else if (IS_WORD_CONSTITUENT(c)) 2210 trans[c] = state_letter; 2211 else if (c < NOTCHAR) 2212 trans[c] = state; 2213 } 2214 } 2215 2216 for (i = 0; i < ngrps; ++i) 2217 free(grps[i].elems); 2218 free(follows.elems); 2219 free(tmp.elems); 2220} 2221 2222/* Some routines for manipulating a compiled dfa's transition tables. 2223 Each state may or may not have a transition table; if it does, and it 2224 is a non-accepting state, then d->trans[state] points to its table. 2225 If it is an accepting state then d->fails[state] points to its table. 2226 If it has no table at all, then d->trans[state] is NULL. 2227 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 2228 2229static void 2230build_state (int s, struct dfa *d) 2231{ 2232 int *trans; /* The new transition table. */ 2233 int i; 2234 2235 /* Set an upper limit on the number of transition tables that will ever 2236 exist at once. 1024 is arbitrary. The idea is that the frequently 2237 used transition tables will be quickly rebuilt, whereas the ones that 2238 were only needed once or twice will be cleared away. */ 2239 if (d->trcount >= 1024) 2240 { 2241 for (i = 0; i < d->tralloc; ++i) 2242 if (d->trans[i]) 2243 { 2244 free((ptr_t) d->trans[i]); 2245 d->trans[i] = NULL; 2246 } 2247 else if (d->fails[i]) 2248 { 2249 free((ptr_t) d->fails[i]); 2250 d->fails[i] = NULL; 2251 } 2252 d->trcount = 0; 2253 } 2254 2255 ++d->trcount; 2256 2257 /* Set up the success bits for this state. */ 2258 d->success[s] = 0; 2259 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 2260 s, *d)) 2261 d->success[s] |= 4; 2262 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 2263 s, *d)) 2264 d->success[s] |= 2; 2265 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 2266 s, *d)) 2267 d->success[s] |= 1; 2268 2269 MALLOC(trans, int, NOTCHAR); 2270 dfastate(s, d, trans); 2271 2272 /* Now go through the new transition table, and make sure that the trans 2273 and fail arrays are allocated large enough to hold a pointer for the 2274 largest state mentioned in the table. */ 2275 for (i = 0; i < NOTCHAR; ++i) 2276 if (trans[i] >= d->tralloc) 2277 { 2278 int oldalloc = d->tralloc; 2279 2280 while (trans[i] >= d->tralloc) 2281 d->tralloc *= 2; 2282 REALLOC(d->realtrans, int *, d->tralloc + 1); 2283 d->trans = d->realtrans + 1; 2284 REALLOC(d->fails, int *, d->tralloc); 2285 REALLOC(d->success, int, d->tralloc); 2286 while (oldalloc < d->tralloc) 2287 { 2288 d->trans[oldalloc] = NULL; 2289 d->fails[oldalloc++] = NULL; 2290 } 2291 } 2292 2293 /* Newline is a sentinel. */ 2294 trans[eolbyte] = -1; 2295 2296 if (ACCEPTING(s, *d)) 2297 d->fails[s] = trans; 2298 else 2299 d->trans[s] = trans; 2300} 2301 2302static void 2303build_state_zero (struct dfa *d) 2304{ 2305 d->tralloc = 1; 2306 d->trcount = 0; 2307 CALLOC(d->realtrans, int *, d->tralloc + 1); 2308 d->trans = d->realtrans + 1; 2309 CALLOC(d->fails, int *, d->tralloc); 2310 MALLOC(d->success, int, d->tralloc); 2311 build_state(0, d); 2312} 2313 2314#ifdef MBS_SUPPORT 2315/* Multibyte character handling sub-routins for dfaexec. */ 2316 2317/* Initial state may encounter the byte which is not a singlebyte character 2318 nor 1st byte of a multibyte character. But it is incorrect for initial 2319 state to accept such a byte. 2320 For example, in sjis encoding the regular expression like "\\" accepts 2321 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2322 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2323 character nor 1st byte of a multibyte character. */ 2324#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2325 if (s == 0) \ 2326 { \ 2327 while (inputwcs[p - buf_begin] == 0 \ 2328 && mblen_buf[p - buf_begin] > 0 \ 2329 && p < buf_end) \ 2330 ++p; \ 2331 if (p >= end) \ 2332 { \ 2333 free(mblen_buf); \ 2334 free(inputwcs); \ 2335 return (size_t) -1; \ 2336 } \ 2337 } 2338 2339static void 2340realloc_trans_if_necessary(struct dfa *d, int new_state) 2341{ 2342 /* Make sure that the trans and fail arrays are allocated large enough 2343 to hold a pointer for the new state. */ 2344 if (new_state >= d->tralloc) 2345 { 2346 int oldalloc = d->tralloc; 2347 2348 while (new_state >= d->tralloc) 2349 d->tralloc *= 2; 2350 REALLOC(d->realtrans, int *, d->tralloc + 1); 2351 d->trans = d->realtrans + 1; 2352 REALLOC(d->fails, int *, d->tralloc); 2353 REALLOC(d->success, int, d->tralloc); 2354 while (oldalloc < d->tralloc) 2355 { 2356 d->trans[oldalloc] = NULL; 2357 d->fails[oldalloc++] = NULL; 2358 } 2359 } 2360} 2361 2362/* Return values of transit_state_singlebyte(), and 2363 transit_state_consume_1char. */ 2364typedef enum 2365{ 2366 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2367 TRANSIT_STATE_DONE, /* State transition has finished. */ 2368 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2369} status_transit_state; 2370 2371/* Consume a single byte and transit state from 's' to '*next_state'. 2372 This function is almost same as the state transition routin in dfaexec(). 2373 But state transition is done just once, otherwise matching succeed or 2374 reach the end of the buffer. */ 2375static status_transit_state 2376transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2377 int *next_state) 2378{ 2379 int *t; 2380 int works = s; 2381 2382 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2383 2384 while (rval == TRANSIT_STATE_IN_PROGRESS) 2385 { 2386 if ((t = d->trans[works]) != NULL) 2387 { 2388 works = t[*p]; 2389 rval = TRANSIT_STATE_DONE; 2390 if (works < 0) 2391 works = 0; 2392 } 2393 else if (works < 0) 2394 { 2395 if (p == buf_end) 2396 /* At the moment, it must not happen. */ 2397 return TRANSIT_STATE_END_BUFFER; 2398 works = 0; 2399 } 2400 else if (d->fails[works]) 2401 { 2402 works = d->fails[works][*p]; 2403 rval = TRANSIT_STATE_DONE; 2404 } 2405 else 2406 { 2407 build_state(works, d); 2408 } 2409 } 2410 *next_state = works; 2411 return rval; 2412} 2413 2414/* Check whether period can match or not in the current context. If it can, 2415 return the amount of the bytes with which period can match, otherwise 2416 return 0. 2417 `pos' is the position of the period. `index' is the index from the 2418 buf_begin, and it is the current position in the buffer. */ 2419static int 2420match_anychar (struct dfa *d, int s, position pos, int index) 2421{ 2422 int newline = 0; 2423 int letter = 0; 2424 wchar_t wc; 2425 int mbclen; 2426 2427 wc = inputwcs[index]; 2428 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2429 2430 /* Check context. */ 2431 if (wc == (wchar_t)eolbyte) 2432 { 2433 if (!(syntax_bits & RE_DOT_NEWLINE)) 2434 return 0; 2435 newline = 1; 2436 } 2437 else if (wc == (wchar_t)'\0') 2438 { 2439 if (syntax_bits & RE_DOT_NOT_NULL) 2440 return 0; 2441 newline = 1; 2442 } 2443 2444 if (iswalnum(wc) || wc == L'_') 2445 letter = 1; 2446 2447 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2448 newline, d->states[s].letter, letter)) 2449 return 0; 2450 2451 return mbclen; 2452} 2453 2454/* Check whether bracket expression can match or not in the current context. 2455 If it can, return the amount of the bytes with which expression can match, 2456 otherwise return 0. 2457 `pos' is the position of the bracket expression. `index' is the index 2458 from the buf_begin, and it is the current position in the buffer. */ 2459int 2460match_mb_charset (struct dfa *d, int s, position pos, int index) 2461{ 2462 int i; 2463 int match; /* Flag which represent that matching succeed. */ 2464 int match_len; /* Length of the character (or collating element) 2465 with which this operator match. */ 2466 int op_len; /* Length of the operator. */ 2467 char buffer[128]; 2468 wchar_t wcbuf[6]; 2469 2470 /* Pointer to the structure to which we are currently reffering. */ 2471 struct mb_char_classes *work_mbc; 2472 2473 int newline = 0; 2474 int letter = 0; 2475 wchar_t wc; /* Current reffering character. */ 2476 2477 wc = inputwcs[index]; 2478 2479 /* Check context. */ 2480 if (wc == (wchar_t)eolbyte) 2481 { 2482 if (!(syntax_bits & RE_DOT_NEWLINE)) 2483 return 0; 2484 newline = 1; 2485 } 2486 else if (wc == (wchar_t)'\0') 2487 { 2488 if (syntax_bits & RE_DOT_NOT_NULL) 2489 return 0; 2490 newline = 1; 2491 } 2492 if (iswalnum(wc) || wc == L'_') 2493 letter = 1; 2494 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2495 newline, d->states[s].letter, letter)) 2496 return 0; 2497 2498 /* Assign the current reffering operator to work_mbc. */ 2499 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2500 match = !work_mbc->invert; 2501 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2502 2503 /* match with a character class? */ 2504 for (i = 0; i<work_mbc->nch_classes; i++) 2505 { 2506 if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2507 goto charset_matched; 2508 } 2509 2510 strncpy(buffer, buf_begin + index, match_len); 2511 buffer[match_len] = '\0'; 2512 2513 /* match with an equivalent class? */ 2514 for (i = 0; i<work_mbc->nequivs; i++) 2515 { 2516 op_len = strlen(work_mbc->equivs[i]); 2517 strncpy(buffer, buf_begin + index, op_len); 2518 buffer[op_len] = '\0'; 2519 if (strcoll(work_mbc->equivs[i], buffer) == 0) 2520 { 2521 match_len = op_len; 2522 goto charset_matched; 2523 } 2524 } 2525 2526 /* match with a collating element? */ 2527 for (i = 0; i<work_mbc->ncoll_elems; i++) 2528 { 2529 op_len = strlen(work_mbc->coll_elems[i]); 2530 strncpy(buffer, buf_begin + index, op_len); 2531 buffer[op_len] = '\0'; 2532 2533 if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2534 { 2535 match_len = op_len; 2536 goto charset_matched; 2537 } 2538 } 2539 2540 wcbuf[0] = wc; 2541 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2542 2543 /* match with a range? */ 2544 for (i = 0; i<work_mbc->nranges; i++) 2545 { 2546 wcbuf[2] = work_mbc->range_sts[i]; 2547 wcbuf[4] = work_mbc->range_ends[i]; 2548 2549 if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2550 wcscoll(wcbuf+4, wcbuf) >= 0) 2551 goto charset_matched; 2552 } 2553 2554 /* match with a character? */ 2555 for (i = 0; i<work_mbc->nchars; i++) 2556 { 2557 if (wc == work_mbc->chars[i]) 2558 goto charset_matched; 2559 } 2560 2561 match = !match; 2562 2563 charset_matched: 2564 return match ? match_len : 0; 2565} 2566 2567/* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2568 array which corresponds to `d->states[s].mbps.elem' and each element of 2569 the array contains the amount of the bytes with which the element can 2570 match. 2571 `index' is the index from the buf_begin, and it is the current position 2572 in the buffer. 2573 Caller MUST free the array which this function return. */ 2574static int* 2575check_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2576{ 2577 int i; 2578 int* rarray; 2579 2580 MALLOC(rarray, int, d->states[s].mbps.nelem); 2581 for (i = 0; i < d->states[s].mbps.nelem; ++i) 2582 { 2583 position pos = d->states[s].mbps.elems[i]; 2584 switch(d->tokens[pos.index]) 2585 { 2586 case ANYCHAR: 2587 rarray[i] = match_anychar(d, s, pos, index); 2588 break; 2589 case MBCSET: 2590 rarray[i] = match_mb_charset(d, s, pos, index); 2591 break; 2592 default: 2593 break; /* can not happen. */ 2594 } 2595 } 2596 return rarray; 2597} 2598 2599/* Consume a single character and enumerate all of the positions which can 2600 be next position from the state `s'. 2601 `match_lens' is the input. It can be NULL, but it can also be the output 2602 of check_matching_with_multibyte_ops() for optimization. 2603 `mbclen' and `pps' are the output. `mbclen' is the length of the 2604 character consumed, and `pps' is the set this function enumerate. */ 2605static status_transit_state 2606transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2607 int *match_lens, int *mbclen, position_set *pps) 2608{ 2609 int i, j; 2610 int s1, s2; 2611 int* work_mbls; 2612 status_transit_state rs = TRANSIT_STATE_DONE; 2613 2614 /* Calculate the length of the (single/multi byte) character 2615 to which p points. */ 2616 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2617 : mblen_buf[*pp - buf_begin]; 2618 2619 /* Calculate the state which can be reached from the state `s' by 2620 consuming `*mbclen' single bytes from the buffer. */ 2621 s1 = s; 2622 for (i = 0; i < *mbclen; i++) 2623 { 2624 s2 = s1; 2625 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2626 } 2627 /* Copy the positions contained by `s1' to the set `pps'. */ 2628 copy(&(d->states[s1].elems), pps); 2629 2630 /* Check (inputed)match_lens, and initialize if it is NULL. */ 2631 if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2632 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2633 else 2634 work_mbls = match_lens; 2635 2636 /* Add all of the positions which can be reached from `s' by consuming 2637 a single character. */ 2638 for (i = 0; i < d->states[s].mbps.nelem ; i++) 2639 { 2640 if (work_mbls[i] == *mbclen) 2641 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2642 j++) 2643 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2644 pps); 2645 } 2646 2647 if (match_lens == NULL && work_mbls != NULL) 2648 free(work_mbls); 2649 return rs; 2650} 2651 2652/* Transit state from s, then return new state and update the pointer of the 2653 buffer. This function is for some operator which can match with a multi- 2654 byte character or a collating element(which may be multi characters). */ 2655static int 2656transit_state (struct dfa *d, int s, unsigned char const **pp) 2657{ 2658 int s1; 2659 int mbclen; /* The length of current input multibyte character. */ 2660 int maxlen = 0; 2661 int i, j; 2662 int *match_lens = NULL; 2663 int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2664 position_set follows; 2665 unsigned char const *p1 = *pp; 2666 status_transit_state rs; 2667 wchar_t wc; 2668 2669 if (nelem > 0) 2670 /* This state has (a) multibyte operator(s). 2671 We check whether each of them can match or not. */ 2672 { 2673 /* Note: caller must free the return value of this function. */ 2674 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2675 2676 for (i = 0; i < nelem; i++) 2677 /* Search the operator which match the longest string, 2678 in this state. */ 2679 { 2680 if (match_lens[i] > maxlen) 2681 maxlen = match_lens[i]; 2682 } 2683 } 2684 2685 if (nelem == 0 || maxlen == 0) 2686 /* This state has no multibyte operator which can match. 2687 We need to check only one singlebyte character. */ 2688 { 2689 status_transit_state rs; 2690 rs = transit_state_singlebyte(d, s, *pp, &s1); 2691 2692 /* We must update the pointer if state transition succeeded. */ 2693 if (rs == TRANSIT_STATE_DONE) 2694 ++*pp; 2695 2696 if (match_lens != NULL) 2697 free(match_lens); 2698 return s1; 2699 } 2700 2701 /* This state has some operators which can match a multibyte character. */ 2702 follows.nelem = 0; 2703 MALLOC(follows.elems, position, d->nleaves); 2704 2705 /* `maxlen' may be longer than the length of a character, because it may 2706 not be a character but a (multi character) collating element. 2707 We enumerate all of the positions which `s' can reach by consuming 2708 `maxlen' bytes. */ 2709 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2710 2711 wc = inputwcs[*pp - mbclen - buf_begin]; 2712 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2713 realloc_trans_if_necessary(d, s1); 2714 2715 while (*pp - p1 < maxlen) 2716 { 2717 follows.nelem = 0; 2718 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2719 2720 for (i = 0; i < nelem ; i++) 2721 { 2722 if (match_lens[i] == *pp - p1) 2723 for (j = 0; 2724 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2725 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2726 &follows); 2727 } 2728 2729 wc = inputwcs[*pp - mbclen - buf_begin]; 2730 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2731 realloc_trans_if_necessary(d, s1); 2732 } 2733 free(match_lens); 2734 free(follows.elems); 2735 return s1; 2736} 2737 2738#endif 2739 2740/* Search through a buffer looking for a match to the given struct dfa. 2741 Find the first occurrence of a string matching the regexp in the buffer, 2742 and the shortest possible version thereof. Return the offset of the first 2743 character after the match, or (size_t) -1 if none is found. BEGIN points to 2744 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2745 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 2746 where we're supposed to store a 1 if backreferencing happened and the 2747 match needs to be verified by a backtracking matcher. Otherwise 2748 we store a 0 in *backref. */ 2749size_t 2750dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 2751{ 2752 register int s; /* Current state. */ 2753 register unsigned char const *p; /* Current input character. */ 2754 register unsigned char const *end; /* One past the last input character. */ 2755 register int **trans, *t; /* Copy of d->trans so it can be optimized 2756 into a register. */ 2757 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 2758 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 2759 static int sbit_init; 2760 2761 if (! sbit_init) 2762 { 2763 int i; 2764 2765 sbit_init = 1; 2766 for (i = 0; i < NOTCHAR; ++i) 2767 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 2768 sbit[eol] = 4; 2769 } 2770 2771 if (! d->tralloc) 2772 build_state_zero(d); 2773 2774 s = 0; 2775 p = (unsigned char const *) begin; 2776 end = p + size; 2777 trans = d->trans; 2778 2779#ifdef MBS_SUPPORT 2780 if (MB_CUR_MAX > 1) 2781 { 2782 int remain_bytes, i; 2783 buf_begin = begin; 2784 buf_end = end; 2785 2786 /* initialize mblen_buf, and inputwcs. */ 2787 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2788 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2789 memset(&mbs, 0, sizeof(mbstate_t)); 2790 remain_bytes = 0; 2791 for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2792 { 2793 if (remain_bytes == 0) 2794 { 2795 remain_bytes 2796 = mbrtowc(inputwcs + i, begin + i, 2797 end - (unsigned char const *)begin - i + 1, &mbs); 2798 if (remain_bytes <= 1) 2799 { 2800 remain_bytes = 0; 2801 inputwcs[i] = (wchar_t)begin[i]; 2802 mblen_buf[i] = 0; 2803 } 2804 else 2805 { 2806 mblen_buf[i] = remain_bytes; 2807 remain_bytes--; 2808 } 2809 } 2810 else 2811 { 2812 mblen_buf[i] = remain_bytes; 2813 inputwcs[i] = 0; 2814 remain_bytes--; 2815 } 2816 } 2817 mblen_buf[i] = 0; 2818 inputwcs[i] = 0; /* sentinel */ 2819 } 2820#endif /* MBS_SUPPORT */ 2821 2822 for (;;) 2823 { 2824#ifdef MBS_SUPPORT 2825 if (MB_CUR_MAX > 1) 2826 while ((t = trans[s])) 2827 { 2828 if (d->states[s].mbps.nelem != 0) 2829 { 2830 /* Can match with a multibyte character( and multi character 2831 collating element). */ 2832 unsigned char const *nextp; 2833 2834 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2835 2836 nextp = p; 2837 s = transit_state(d, s, &nextp); 2838 p = nextp; 2839 2840 /* Trans table might be updated. */ 2841 trans = d->trans; 2842 } 2843 else 2844 { 2845 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2846 s = t[*p++]; 2847 } 2848 } 2849 else 2850#endif /* MBS_SUPPORT */ 2851 while ((t = trans[s])) 2852 s = t[*p++]; 2853 2854 if (s < 0) 2855 { 2856 if (p == end) 2857 { 2858#ifdef MBS_SUPPORT 2859 if (MB_CUR_MAX > 1) 2860 { 2861 free(mblen_buf); 2862 free(inputwcs); 2863 } 2864#endif /* MBS_SUPPORT */ 2865 return (size_t) -1; 2866 } 2867 s = 0; 2868 } 2869 else if ((t = d->fails[s])) 2870 { 2871 if (d->success[s] & sbit[*p]) 2872 { 2873 if (backref) 2874 *backref = (d->states[s].backref != 0); 2875#ifdef MBS_SUPPORT 2876 if (MB_CUR_MAX > 1) 2877 { 2878 free(mblen_buf); 2879 free(inputwcs); 2880 } 2881#endif /* MBS_SUPPORT */ 2882 return (char const *) p - begin; 2883 } 2884 2885#ifdef MBS_SUPPORT 2886 if (MB_CUR_MAX > 1) 2887 { 2888 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2889 if (d->states[s].mbps.nelem != 0) 2890 { 2891 /* Can match with a multibyte character( and multi 2892 character collating element). */ 2893 unsigned char const *nextp; 2894 nextp = p; 2895 s = transit_state(d, s, &nextp); 2896 p = nextp; 2897 2898 /* Trans table might be updated. */ 2899 trans = d->trans; 2900 } 2901 else 2902 s = t[*p++]; 2903 } 2904 else 2905#endif /* MBS_SUPPORT */ 2906 s = t[*p++]; 2907 } 2908 else 2909 { 2910 build_state(s, d); 2911 trans = d->trans; 2912 } 2913 } 2914} 2915 2916/* Initialize the components of a dfa that the other routines don't 2917 initialize for themselves. */ 2918void 2919dfainit (struct dfa *d) 2920{ 2921 d->calloc = 1; 2922 MALLOC(d->charclasses, charclass, d->calloc); 2923 d->cindex = 0; 2924 2925 d->talloc = 1; 2926 MALLOC(d->tokens, token, d->talloc); 2927 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2928#ifdef MBS_SUPPORT 2929 if (MB_CUR_MAX > 1) 2930 { 2931 d->nmultibyte_prop = 1; 2932 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2933 d->nmbcsets = 0; 2934 d->mbcsets_alloc = 1; 2935 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2936 } 2937#endif 2938 2939 d->searchflag = 0; 2940 d->tralloc = 0; 2941 2942 d->musts = 0; 2943} 2944 2945/* Parse and analyze a single string of the given length. */ 2946void 2947dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 2948{ 2949 if (case_fold) /* dummy folding in service of dfamust() */ 2950 { 2951 char *lcopy; 2952 int i; 2953 2954 lcopy = malloc(len); 2955 if (!lcopy) 2956 dfaerror(_("out of memory")); 2957 2958 /* This is a kludge. */ 2959 case_fold = 0; 2960 for (i = 0; i < len; ++i) 2961 if (ISUPPER ((unsigned char) s[i])) 2962 lcopy[i] = tolower ((unsigned char) s[i]); 2963 else 2964 lcopy[i] = s[i]; 2965 2966 dfainit(d); 2967 dfaparse(lcopy, len, d); 2968 free(lcopy); 2969 dfamust(d); 2970 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2971 case_fold = 1; 2972 dfaparse(s, len, d); 2973 dfaanalyze(d, searchflag); 2974 } 2975 else 2976 { 2977 dfainit(d); 2978 dfaparse(s, len, d); 2979 dfamust(d); 2980 dfaanalyze(d, searchflag); 2981 } 2982} 2983 2984/* Free the storage held by the components of a dfa. */ 2985void 2986dfafree (struct dfa *d) 2987{ 2988 int i; 2989 struct dfamust *dm, *ndm; 2990 2991 free((ptr_t) d->charclasses); 2992 free((ptr_t) d->tokens); 2993 2994#ifdef MBS_SUPPORT 2995 if (MB_CUR_MAX > 1) 2996 { 2997 free((ptr_t) d->multibyte_prop); 2998 for (i = 0; i < d->nmbcsets; ++i) 2999 { 3000 int j; 3001 struct mb_char_classes *p = &(d->mbcsets[i]); 3002 if (p->chars != NULL) 3003 free(p->chars); 3004 if (p->ch_classes != NULL) 3005 free(p->ch_classes); 3006 if (p->range_sts != NULL) 3007 free(p->range_sts); 3008 if (p->range_ends != NULL) 3009 free(p->range_ends); 3010 3011 for (j = 0; j < p->nequivs; ++j) 3012 free(p->equivs[j]); 3013 if (p->equivs != NULL) 3014 free(p->equivs); 3015 3016 for (j = 0; j < p->ncoll_elems; ++j) 3017 free(p->coll_elems[j]); 3018 if (p->coll_elems != NULL) 3019 free(p->coll_elems); 3020 } 3021 free((ptr_t) d->mbcsets); 3022 } 3023#endif /* MBS_SUPPORT */ 3024 3025 for (i = 0; i < d->sindex; ++i) 3026 free((ptr_t) d->states[i].elems.elems); 3027 free((ptr_t) d->states); 3028 for (i = 0; i < d->tindex; ++i) 3029 if (d->follows[i].elems) 3030 free((ptr_t) d->follows[i].elems); 3031 free((ptr_t) d->follows); 3032 for (i = 0; i < d->tralloc; ++i) 3033 if (d->trans[i]) 3034 free((ptr_t) d->trans[i]); 3035 else if (d->fails[i]) 3036 free((ptr_t) d->fails[i]); 3037 if (d->realtrans) free((ptr_t) d->realtrans); 3038 if (d->fails) free((ptr_t) d->fails); 3039 if (d->success) free((ptr_t) d->success); 3040 for (dm = d->musts; dm; dm = ndm) 3041 { 3042 ndm = dm->next; 3043 free(dm->must); 3044 free((ptr_t) dm); 3045 } 3046} 3047 3048/* Having found the postfix representation of the regular expression, 3049 try to find a long sequence of characters that must appear in any line 3050 containing the r.e. 3051 Finding a "longest" sequence is beyond the scope here; 3052 we take an easy way out and hope for the best. 3053 (Take "(ab|a)b"--please.) 3054 3055 We do a bottom-up calculation of sequences of characters that must appear 3056 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3057 representation: 3058 sequences that must appear at the left of the match ("left") 3059 sequences that must appear at the right of the match ("right") 3060 lists of sequences that must appear somewhere in the match ("in") 3061 sequences that must constitute the match ("is") 3062 3063 When we get to the root of the tree, we use one of the longest of its 3064 calculated "in" sequences as our answer. The sequence we find is returned in 3065 d->must (where "d" is the single argument passed to "dfamust"); 3066 the length of the sequence is returned in d->mustn. 3067 3068 The sequences calculated for the various types of node (in pseudo ANSI c) 3069 are shown below. "p" is the operand of unary operators (and the left-hand 3070 operand of binary operators); "q" is the right-hand operand of binary 3071 operators. 3072 3073 "ZERO" means "a zero-length sequence" below. 3074 3075 Type left right is in 3076 ---- ---- ----- -- -- 3077 char c # c # c # c # c 3078 3079 ANYCHAR ZERO ZERO ZERO ZERO 3080 3081 MBCSET ZERO ZERO ZERO ZERO 3082 3083 CSET ZERO ZERO ZERO ZERO 3084 3085 STAR ZERO ZERO ZERO ZERO 3086 3087 QMARK ZERO ZERO ZERO ZERO 3088 3089 PLUS p->left p->right ZERO p->in 3090 3091 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3092 p->left : q->right : q->is!=ZERO) ? q->in plus 3093 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3094 ZERO 3095 3096 OR longest common longest common (do p->is and substrings common to 3097 leading trailing q->is have same p->in and q->in 3098 (sub)sequence (sub)sequence length and 3099 of p->left of p->right content) ? 3100 and q->left and q->right p->is : NULL 3101 3102 If there's anything else we recognize in the tree, all four sequences get set 3103 to zero-length sequences. If there's something we don't recognize in the tree, 3104 we just return a zero-length sequence. 3105 3106 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3107 'aaa')? 3108 3109 And. . .is it here or someplace that we might ponder "optimizations" such as 3110 egrep 'psi|epsilon' -> egrep 'psi' 3111 egrep 'pepsi|epsilon' -> egrep 'epsi' 3112 (Yes, we now find "epsi" as a "string 3113 that must occur", but we might also 3114 simplify the *entire* r.e. being sought) 3115 grep '[c]' -> grep 'c' 3116 grep '(ab|a)b' -> grep 'ab' 3117 grep 'ab*' -> grep 'a' 3118 grep 'a*b' -> grep 'b' 3119 3120 There are several issues: 3121 3122 Is optimization easy (enough)? 3123 3124 Does optimization actually accomplish anything, 3125 or is the automaton you get from "psi|epsilon" (for example) 3126 the same as the one you get from "psi" (for example)? 3127 3128 Are optimizable r.e.'s likely to be used in real-life situations 3129 (something like 'ab*' is probably unlikely; something like is 3130 'psi|epsilon' is likelier)? */ 3131 3132static char * 3133icatalloc (char *old, char *new) 3134{ 3135 char *result; 3136 size_t oldsize, newsize; 3137 3138 newsize = (new == NULL) ? 0 : strlen(new); 3139 if (old == NULL) 3140 oldsize = 0; 3141 else if (newsize == 0) 3142 return old; 3143 else oldsize = strlen(old); 3144 if (old == NULL) 3145 result = (char *) malloc(newsize + 1); 3146 else 3147 result = (char *) realloc((void *) old, oldsize + newsize + 1); 3148 if (result != NULL && new != NULL) 3149 (void) strcpy(result + oldsize, new); 3150 return result; 3151} 3152 3153static char * 3154icpyalloc (char *string) 3155{ 3156 return icatalloc((char *) NULL, string); 3157} 3158 3159static char * 3160istrstr (char *lookin, char *lookfor) 3161{ 3162 char *cp; 3163 size_t len; 3164 3165 len = strlen(lookfor); 3166 for (cp = lookin; *cp != '\0'; ++cp) 3167 if (strncmp(cp, lookfor, len) == 0) 3168 return cp; 3169 return NULL; 3170} 3171 3172static void 3173ifree (char *cp) 3174{ 3175 if (cp != NULL) 3176 free(cp); 3177} 3178 3179static void 3180freelist (char **cpp) 3181{ 3182 int i; 3183 3184 if (cpp == NULL) 3185 return; 3186 for (i = 0; cpp[i] != NULL; ++i) 3187 { 3188 free(cpp[i]); 3189 cpp[i] = NULL; 3190 } 3191} 3192 3193static char ** 3194enlist (char **cpp, char *new, size_t len) 3195{ 3196 int i, j; 3197 3198 if (cpp == NULL) 3199 return NULL; 3200 if ((new = icpyalloc(new)) == NULL) 3201 { 3202 freelist(cpp); 3203 return NULL; 3204 } 3205 new[len] = '\0'; 3206 /* Is there already something in the list that's new (or longer)? */ 3207 for (i = 0; cpp[i] != NULL; ++i) 3208 if (istrstr(cpp[i], new) != NULL) 3209 { 3210 free(new); 3211 return cpp; 3212 } 3213 /* Eliminate any obsoleted strings. */ 3214 j = 0; 3215 while (cpp[j] != NULL) 3216 if (istrstr(new, cpp[j]) == NULL) 3217 ++j; 3218 else 3219 { 3220 free(cpp[j]); 3221 if (--i == j) 3222 break; 3223 cpp[j] = cpp[i]; 3224 cpp[i] = NULL; 3225 } 3226 /* Add the new string. */ 3227 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 3228 if (cpp == NULL) 3229 return NULL; 3230 cpp[i] = new; 3231 cpp[i + 1] = NULL; 3232 return cpp; 3233} 3234 3235/* Given pointers to two strings, return a pointer to an allocated 3236 list of their distinct common substrings. Return NULL if something 3237 seems wild. */ 3238static char ** 3239comsubs (char *left, char *right) 3240{ 3241 char **cpp; 3242 char *lcp; 3243 char *rcp; 3244 size_t i, len; 3245 3246 if (left == NULL || right == NULL) 3247 return NULL; 3248 cpp = (char **) malloc(sizeof *cpp); 3249 if (cpp == NULL) 3250 return NULL; 3251 cpp[0] = NULL; 3252 for (lcp = left; *lcp != '\0'; ++lcp) 3253 { 3254 len = 0; 3255 rcp = strchr (right, *lcp); 3256 while (rcp != NULL) 3257 { 3258 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3259 continue; 3260 if (i > len) 3261 len = i; 3262 rcp = strchr (rcp + 1, *lcp); 3263 } 3264 if (len == 0) 3265 continue; 3266 if ((cpp = enlist(cpp, lcp, len)) == NULL) 3267 break; 3268 } 3269 return cpp; 3270} 3271 3272static char ** 3273addlists (char **old, char **new) 3274{ 3275 int i; 3276 3277 if (old == NULL || new == NULL) 3278 return NULL; 3279 for (i = 0; new[i] != NULL; ++i) 3280 { 3281 old = enlist(old, new[i], strlen(new[i])); 3282 if (old == NULL) 3283 break; 3284 } 3285 return old; 3286} 3287 3288/* Given two lists of substrings, return a new list giving substrings 3289 common to both. */ 3290static char ** 3291inboth (char **left, char **right) 3292{ 3293 char **both; 3294 char **temp; 3295 int lnum, rnum; 3296 3297 if (left == NULL || right == NULL) 3298 return NULL; 3299 both = (char **) malloc(sizeof *both); 3300 if (both == NULL) 3301 return NULL; 3302 both[0] = NULL; 3303 for (lnum = 0; left[lnum] != NULL; ++lnum) 3304 { 3305 for (rnum = 0; right[rnum] != NULL; ++rnum) 3306 { 3307 temp = comsubs(left[lnum], right[rnum]); 3308 if (temp == NULL) 3309 { 3310 freelist(both); 3311 return NULL; 3312 } 3313 both = addlists(both, temp); 3314 freelist(temp); 3315 free(temp); 3316 if (both == NULL) 3317 return NULL; 3318 } 3319 } 3320 return both; 3321} 3322 3323typedef struct 3324{ 3325 char **in; 3326 char *left; 3327 char *right; 3328 char *is; 3329} must; 3330 3331static void 3332resetmust (must *mp) 3333{ 3334 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 3335 freelist(mp->in); 3336} 3337 3338static void 3339dfamust (struct dfa *dfa) 3340{ 3341 must *musts; 3342 must *mp; 3343 char *result; 3344 int ri; 3345 int i; 3346 int exact; 3347 token t; 3348 static must must0; 3349 struct dfamust *dm; 3350 static char empty_string[] = ""; 3351 3352 result = empty_string; 3353 exact = 0; 3354 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 3355 if (musts == NULL) 3356 return; 3357 mp = musts; 3358 for (i = 0; i <= dfa->tindex; ++i) 3359 mp[i] = must0; 3360 for (i = 0; i <= dfa->tindex; ++i) 3361 { 3362 mp[i].in = (char **) malloc(sizeof *mp[i].in); 3363 mp[i].left = malloc(2); 3364 mp[i].right = malloc(2); 3365 mp[i].is = malloc(2); 3366 if (mp[i].in == NULL || mp[i].left == NULL || 3367 mp[i].right == NULL || mp[i].is == NULL) 3368 goto done; 3369 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 3370 mp[i].in[0] = NULL; 3371 } 3372#ifdef DEBUG 3373 fprintf(stderr, "dfamust:\n"); 3374 for (i = 0; i < dfa->tindex; ++i) 3375 { 3376 fprintf(stderr, " %d:", i); 3377 prtok(dfa->tokens[i]); 3378 } 3379 putc('\n', stderr); 3380#endif 3381 for (ri = 0; ri < dfa->tindex; ++ri) 3382 { 3383 switch (t = dfa->tokens[ri]) 3384 { 3385 case LPAREN: 3386 case RPAREN: 3387 goto done; /* "cannot happen" */ 3388 case EMPTY: 3389 case BEGLINE: 3390 case ENDLINE: 3391 case BEGWORD: 3392 case ENDWORD: 3393 case LIMWORD: 3394 case NOTLIMWORD: 3395 case BACKREF: 3396 resetmust(mp); 3397 break; 3398 case STAR: 3399 case QMARK: 3400 if (mp <= musts) 3401 goto done; /* "cannot happen" */ 3402 --mp; 3403 resetmust(mp); 3404 break; 3405 case OR: 3406 case ORTOP: 3407 if (mp < &musts[2]) 3408 goto done; /* "cannot happen" */ 3409 { 3410 char **new; 3411 must *lmp; 3412 must *rmp; 3413 int j, ln, rn, n; 3414 3415 rmp = --mp; 3416 lmp = --mp; 3417 /* Guaranteed to be. Unlikely, but. . . */ 3418 if (strcmp(lmp->is, rmp->is) != 0) 3419 lmp->is[0] = '\0'; 3420 /* Left side--easy */ 3421 i = 0; 3422 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 3423 ++i; 3424 lmp->left[i] = '\0'; 3425 /* Right side */ 3426 ln = strlen(lmp->right); 3427 rn = strlen(rmp->right); 3428 n = ln; 3429 if (n > rn) 3430 n = rn; 3431 for (i = 0; i < n; ++i) 3432 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 3433 break; 3434 for (j = 0; j < i; ++j) 3435 lmp->right[j] = lmp->right[(ln - i) + j]; 3436 lmp->right[j] = '\0'; 3437 new = inboth(lmp->in, rmp->in); 3438 if (new == NULL) 3439 goto done; 3440 freelist(lmp->in); 3441 free((char *) lmp->in); 3442 lmp->in = new; 3443 } 3444 break; 3445 case PLUS: 3446 if (mp <= musts) 3447 goto done; /* "cannot happen" */ 3448 --mp; 3449 mp->is[0] = '\0'; 3450 break; 3451 case END: 3452 if (mp != &musts[1]) 3453 goto done; /* "cannot happen" */ 3454 for (i = 0; musts[0].in[i] != NULL; ++i) 3455 if (strlen(musts[0].in[i]) > strlen(result)) 3456 result = musts[0].in[i]; 3457 if (strcmp(result, musts[0].is) == 0) 3458 exact = 1; 3459 goto done; 3460 case CAT: 3461 if (mp < &musts[2]) 3462 goto done; /* "cannot happen" */ 3463 { 3464 must *lmp; 3465 must *rmp; 3466 3467 rmp = --mp; 3468 lmp = --mp; 3469 /* In. Everything in left, plus everything in 3470 right, plus catenation of 3471 left's right and right's left. */ 3472 lmp->in = addlists(lmp->in, rmp->in); 3473 if (lmp->in == NULL) 3474 goto done; 3475 if (lmp->right[0] != '\0' && 3476 rmp->left[0] != '\0') 3477 { 3478 char *tp; 3479 3480 tp = icpyalloc(lmp->right); 3481 if (tp == NULL) 3482 goto done; 3483 tp = icatalloc(tp, rmp->left); 3484 if (tp == NULL) 3485 goto done; 3486 lmp->in = enlist(lmp->in, tp, 3487 strlen(tp)); 3488 free(tp); 3489 if (lmp->in == NULL) 3490 goto done; 3491 } 3492 /* Left-hand */ 3493 if (lmp->is[0] != '\0') 3494 { 3495 lmp->left = icatalloc(lmp->left, 3496 rmp->left); 3497 if (lmp->left == NULL) 3498 goto done; 3499 } 3500 /* Right-hand */ 3501 if (rmp->is[0] == '\0') 3502 lmp->right[0] = '\0'; 3503 lmp->right = icatalloc(lmp->right, rmp->right); 3504 if (lmp->right == NULL) 3505 goto done; 3506 /* Guaranteed to be */ 3507 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 3508 { 3509 lmp->is = icatalloc(lmp->is, rmp->is); 3510 if (lmp->is == NULL) 3511 goto done; 3512 } 3513 else 3514 lmp->is[0] = '\0'; 3515 } 3516 break; 3517 default: 3518 if (t < END) 3519 { 3520 /* "cannot happen" */ 3521 goto done; 3522 } 3523 else if (t == '\0') 3524 { 3525 /* not on *my* shift */ 3526 goto done; 3527 } 3528 else if (t >= CSET 3529#ifdef MBS_SUPPORT 3530 || t == ANYCHAR 3531 || t == MBCSET 3532#endif /* MBS_SUPPORT */ 3533 ) 3534 { 3535 /* easy enough */ 3536 resetmust(mp); 3537 } 3538 else 3539 { 3540 /* plain character */ 3541 resetmust(mp); 3542 mp->is[0] = mp->left[0] = mp->right[0] = t; 3543 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 3544 mp->in = enlist(mp->in, mp->is, (size_t)1); 3545 if (mp->in == NULL) 3546 goto done; 3547 } 3548 break; 3549 } 3550#ifdef DEBUG 3551 fprintf(stderr, " node: %d:", ri); 3552 prtok(dfa->tokens[ri]); 3553 fprintf(stderr, "\n in:"); 3554 for (i = 0; mp->in[i]; ++i) 3555 fprintf(stderr, " \"%s\"", mp->in[i]); 3556 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 3557 fprintf(stderr, " left: \"%s\"\n", mp->left); 3558 fprintf(stderr, " right: \"%s\"\n", mp->right); 3559#endif 3560 ++mp; 3561 } 3562 done: 3563 if (strlen(result)) 3564 { 3565 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 3566 dm->exact = exact; 3567 dm->must = malloc(strlen(result) + 1); 3568 strcpy(dm->must, result); 3569 dm->next = dfa->musts; 3570 dfa->musts = dm; 3571 } 3572 mp = musts; 3573 for (i = 0; i <= dfa->tindex; ++i) 3574 { 3575 freelist(mp[i].in); 3576 ifree((char *) mp[i].in); 3577 ifree(mp[i].left); 3578 ifree(mp[i].right); 3579 ifree(mp[i].is); 3580 } 3581 free((char *) mp); 3582} 3583/* vim:set shiftwidth=2: */ 3584