dfa.c revision 131557
1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: head/gnu/usr.bin/grep/dfa.c 131557 2004-07-04 10:02:03Z tjr $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#else 42#include <strings.h> 43#endif 44 45#if HAVE_SETLOCALE 46# include <locale.h> 47#endif 48 49#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 50/* We can handle multibyte string. */ 51# define MBS_SUPPORT 52#endif 53 54#ifdef MBS_SUPPORT 55# include <wchar.h> 56# include <wctype.h> 57#endif 58 59#ifndef DEBUG /* use the same approach as regex.c */ 60#undef assert 61#define assert(e) 62#endif /* DEBUG */ 63 64#ifndef isgraph 65#define isgraph(C) (isprint(C) && !isspace(C)) 66#endif 67 68#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 69#define ISALPHA(C) isalpha(C) 70#define ISUPPER(C) isupper(C) 71#define ISLOWER(C) islower(C) 72#define ISDIGIT(C) isdigit(C) 73#define ISXDIGIT(C) isxdigit(C) 74#define ISSPACE(C) isspace(C) 75#define ISPUNCT(C) ispunct(C) 76#define ISALNUM(C) isalnum(C) 77#define ISPRINT(C) isprint(C) 78#define ISGRAPH(C) isgraph(C) 79#define ISCNTRL(C) iscntrl(C) 80#else 81#define ISALPHA(C) (isascii(C) && isalpha(C)) 82#define ISUPPER(C) (isascii(C) && isupper(C)) 83#define ISLOWER(C) (isascii(C) && islower(C)) 84#define ISDIGIT(C) (isascii(C) && isdigit(C)) 85#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 86#define ISSPACE(C) (isascii(C) && isspace(C)) 87#define ISPUNCT(C) (isascii(C) && ispunct(C)) 88#define ISALNUM(C) (isascii(C) && isalnum(C)) 89#define ISPRINT(C) (isascii(C) && isprint(C)) 90#define ISGRAPH(C) (isascii(C) && isgraph(C)) 91#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 92#endif 93 94/* ISASCIIDIGIT differs from ISDIGIT, as follows: 95 - Its arg may be any int or unsigned int; it need not be an unsigned char. 96 - It's guaranteed to evaluate its argument exactly once. 97 - It's typically faster. 98 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 99 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 100 it's important to use the locale's definition of `digit' even when the 101 host does not conform to Posix. */ 102#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 103 104/* If we (don't) have I18N. */ 105/* glibc defines _ */ 106#ifndef _ 107# ifdef HAVE_LIBINTL_H 108# include <libintl.h> 109# ifndef _ 110# define _(Str) gettext (Str) 111# endif 112# else 113# define _(Str) (Str) 114# endif 115#endif 116 117#include "regex.h" 118#include "dfa.h" 119#include "hard-locale.h" 120 121/* HPUX, define those as macros in sys/param.h */ 122#ifdef setbit 123# undef setbit 124#endif 125#ifdef clrbit 126# undef clrbit 127#endif 128 129static void dfamust PARAMS ((struct dfa *dfa)); 130static void regexp PARAMS ((int toplevel)); 131 132static ptr_t 133xcalloc (size_t n, size_t s) 134{ 135 ptr_t r = calloc(n, s); 136 137 if (!r) 138 dfaerror(_("Memory exhausted")); 139 return r; 140} 141 142static ptr_t 143xmalloc (size_t n) 144{ 145 ptr_t r = malloc(n); 146 147 assert(n != 0); 148 if (!r) 149 dfaerror(_("Memory exhausted")); 150 return r; 151} 152 153static ptr_t 154xrealloc (ptr_t p, size_t n) 155{ 156 ptr_t r = realloc(p, n); 157 158 assert(n != 0); 159 if (!r) 160 dfaerror(_("Memory exhausted")); 161 return r; 162} 163 164#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 165#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 166#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 167 168/* Reallocate an array of type t if nalloc is too small for index. */ 169#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 170 if ((index) >= (nalloc)) \ 171 { \ 172 do \ 173 (nalloc) *= 2; \ 174 while ((index) >= (nalloc)); \ 175 REALLOC(p, t, nalloc); \ 176 } 177 178#ifdef DEBUG 179 180static void 181prtok (token t) 182{ 183 char const *s; 184 185 if (t < 0) 186 fprintf(stderr, "END"); 187 else if (t < NOTCHAR) 188 fprintf(stderr, "%c", t); 189 else 190 { 191 switch (t) 192 { 193 case EMPTY: s = "EMPTY"; break; 194 case BACKREF: s = "BACKREF"; break; 195 case BEGLINE: s = "BEGLINE"; break; 196 case ENDLINE: s = "ENDLINE"; break; 197 case BEGWORD: s = "BEGWORD"; break; 198 case ENDWORD: s = "ENDWORD"; break; 199 case LIMWORD: s = "LIMWORD"; break; 200 case NOTLIMWORD: s = "NOTLIMWORD"; break; 201 case QMARK: s = "QMARK"; break; 202 case STAR: s = "STAR"; break; 203 case PLUS: s = "PLUS"; break; 204 case CAT: s = "CAT"; break; 205 case OR: s = "OR"; break; 206 case ORTOP: s = "ORTOP"; break; 207 case LPAREN: s = "LPAREN"; break; 208 case RPAREN: s = "RPAREN"; break; 209 case CRANGE: s = "CRANGE"; break; 210#ifdef MBS_SUPPORT 211 case ANYCHAR: s = "ANYCHAR"; break; 212 case MBCSET: s = "MBCSET"; break; 213#endif /* MBS_SUPPORT */ 214 default: s = "CSET"; break; 215 } 216 fprintf(stderr, "%s", s); 217 } 218} 219#endif /* DEBUG */ 220 221/* Stuff pertaining to charclasses. */ 222 223static int 224tstbit (unsigned b, charclass c) 225{ 226 return c[b / INTBITS] & 1 << b % INTBITS; 227} 228 229static void 230setbit (unsigned b, charclass c) 231{ 232 c[b / INTBITS] |= 1 << b % INTBITS; 233} 234 235static void 236clrbit (unsigned b, charclass c) 237{ 238 c[b / INTBITS] &= ~(1 << b % INTBITS); 239} 240 241static void 242copyset (charclass src, charclass dst) 243{ 244 memcpy (dst, src, sizeof (charclass)); 245} 246 247static void 248zeroset (charclass s) 249{ 250 memset (s, 0, sizeof (charclass)); 251} 252 253static void 254notset (charclass s) 255{ 256 int i; 257 258 for (i = 0; i < CHARCLASS_INTS; ++i) 259 s[i] = ~s[i]; 260} 261 262static int 263equal (charclass s1, charclass s2) 264{ 265 return memcmp (s1, s2, sizeof (charclass)) == 0; 266} 267 268/* A pointer to the current dfa is kept here during parsing. */ 269static struct dfa *dfa; 270 271/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 272static int 273charclass_index (charclass s) 274{ 275 int i; 276 277 for (i = 0; i < dfa->cindex; ++i) 278 if (equal(s, dfa->charclasses[i])) 279 return i; 280 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 281 ++dfa->cindex; 282 copyset(s, dfa->charclasses[i]); 283 return i; 284} 285 286/* Syntax bits controlling the behavior of the lexical analyzer. */ 287static reg_syntax_t syntax_bits, syntax_bits_set; 288 289/* Flag for case-folding letters into sets. */ 290static int case_fold; 291 292/* End-of-line byte in data. */ 293static unsigned char eolbyte; 294 295/* Entry point to set syntax options. */ 296void 297dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 298{ 299 syntax_bits_set = 1; 300 syntax_bits = bits; 301 case_fold = fold; 302 eolbyte = eol; 303} 304 305/* Like setbit, but if case is folded, set both cases of a letter. */ 306static void 307setbit_case_fold (unsigned b, charclass c) 308{ 309 setbit (b, c); 310 if (case_fold) 311 { 312 if (ISUPPER (b)) 313 setbit (tolower (b), c); 314 else if (ISLOWER (b)) 315 setbit (toupper (b), c); 316 } 317} 318 319/* Lexical analyzer. All the dross that deals with the obnoxious 320 GNU Regex syntax bits is located here. The poor, suffering 321 reader is referred to the GNU Regex documentation for the 322 meaning of the @#%!@#%^!@ syntax bits. */ 323 324static char const *lexstart; /* Pointer to beginning of input string. */ 325static char const *lexptr; /* Pointer to next input character. */ 326static int lexleft; /* Number of characters remaining. */ 327static token lasttok; /* Previous token returned; initially END. */ 328static int laststart; /* True if we're separated from beginning or (, | 329 only by zero-width characters. */ 330static int parens; /* Count of outstanding left parens. */ 331static int minrep, maxrep; /* Repeat counts for {m,n}. */ 332static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 333 334#ifdef MBS_SUPPORT 335/* These variables are used only if (MB_CUR_MAX > 1). */ 336static mbstate_t mbs; /* Mbstate for mbrlen(). */ 337static int cur_mb_len; /* Byte length of the current scanning 338 multibyte character. */ 339static int cur_mb_index; /* Byte index of the current scanning multibyte 340 character. 341 342 singlebyte character : cur_mb_index = 0 343 multibyte character 344 1st byte : cur_mb_index = 1 345 2nd byte : cur_mb_index = 2 346 ... 347 nth byte : cur_mb_index = n */ 348static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 349 Each element store the amount of remain 350 byte of corresponding multibyte character 351 in the input string. A element's value 352 is 0 if corresponding character is a 353 singlebyte chracter. 354 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 355 mblen_buf : 0, 3, 2, 1 356 */ 357static wchar_t *inputwcs; /* Wide character representation of input 358 string in dfaexec(). 359 The length of this array is same as 360 the length of input string(char array). 361 inputstring[i] is a single-byte char, 362 or 1st byte of a multibyte char. 363 And inputwcs[i] is the codepoint. */ 364static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 365static unsigned char const *buf_end; /* refference to end in dfaexec(). */ 366#endif /* MBS_SUPPORT */ 367 368#ifdef MBS_SUPPORT 369/* This function update cur_mb_len, and cur_mb_index. 370 p points current lexptr, len is the remaining buffer length. */ 371static void 372update_mb_len_index (unsigned char const *p, int len) 373{ 374 /* If last character is a part of a multibyte character, 375 we update cur_mb_index. */ 376 if (cur_mb_index) 377 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 378 : cur_mb_index + 1; 379 380 /* If last character is a single byte character, or the 381 last portion of a multibyte character, we check whether 382 next character is a multibyte character or not. */ 383 if (! cur_mb_index) 384 { 385 cur_mb_len = mbrlen(p, len, &mbs); 386 if (cur_mb_len > 1) 387 /* It is a multibyte character. 388 cur_mb_len was already set by mbrlen(). */ 389 cur_mb_index = 1; 390 else if (cur_mb_len < 1) 391 /* Invalid sequence. We treat it as a singlebyte character. 392 cur_mb_index is aleady 0. */ 393 cur_mb_len = 1; 394 /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 395 cur_mb_index is aleady 0. */ 396 } 397} 398#endif /* MBS_SUPPORT */ 399 400#ifdef MBS_SUPPORT 401/* Note that characters become unsigned here. */ 402# define FETCH(c, eoferr) \ 403 { \ 404 if (! lexleft) \ 405 { \ 406 if (eoferr != 0) \ 407 dfaerror (eoferr); \ 408 else \ 409 return lasttok = END; \ 410 } \ 411 if (MB_CUR_MAX > 1) \ 412 update_mb_len_index(lexptr, lexleft); \ 413 (c) = (unsigned char) *lexptr++; \ 414 --lexleft; \ 415 } 416 417/* This function fetch a wide character, and update cur_mb_len, 418 used only if the current locale is a multibyte environment. */ 419static wchar_t 420fetch_wc (char const *eoferr) 421{ 422 wchar_t wc; 423 if (! lexleft) 424 { 425 if (eoferr != 0) 426 dfaerror (eoferr); 427 else 428 return -1; 429 } 430 431 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 432 if (cur_mb_len <= 0) 433 { 434 cur_mb_len = 1; 435 wc = *lexptr; 436 } 437 lexptr += cur_mb_len; 438 lexleft -= cur_mb_len; 439 return wc; 440} 441#else 442/* Note that characters become unsigned here. */ 443# define FETCH(c, eoferr) \ 444 { \ 445 if (! lexleft) \ 446 { \ 447 if (eoferr != 0) \ 448 dfaerror (eoferr); \ 449 else \ 450 return lasttok = END; \ 451 } \ 452 (c) = (unsigned char) *lexptr++; \ 453 --lexleft; \ 454 } 455#endif /* MBS_SUPPORT */ 456 457#ifdef MBS_SUPPORT 458/* Multibyte character handling sub-routin for lex. 459 This function parse a bracket expression and build a struct 460 mb_char_classes. */ 461static void 462parse_bracket_exp_mb () 463{ 464 wchar_t wc, wc1, wc2; 465 466 /* Work area to build a mb_char_classes. */ 467 struct mb_char_classes *work_mbc; 468 int chars_al, range_sts_al, range_ends_al, ch_classes_al, 469 equivs_al, coll_elems_al; 470 471 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 472 dfa->mbcsets_alloc, dfa->nmbcsets + 1); 473 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 474 We will update dfa->multibyte_prop in addtok(), because we can't 475 decide the index in dfa->tokens[]. */ 476 477 /* Initialize work are */ 478 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 479 480 chars_al = 1; 481 range_sts_al = range_ends_al = 0; 482 ch_classes_al = equivs_al = coll_elems_al = 0; 483 MALLOC(work_mbc->chars, wchar_t, chars_al); 484 485 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 486 work_mbc->nequivs = work_mbc->ncoll_elems = 0; 487 work_mbc->chars = work_mbc->ch_classes = NULL; 488 work_mbc->range_sts = work_mbc->range_ends = NULL; 489 work_mbc->equivs = work_mbc->coll_elems = NULL; 490 491 wc = fetch_wc(_("Unbalanced [")); 492 if (wc == L'^') 493 { 494 wc = fetch_wc(_("Unbalanced [")); 495 work_mbc->invert = 1; 496 } 497 else 498 work_mbc->invert = 0; 499 do 500 { 501 wc1 = -1; /* mark wc1 is not initialized". */ 502 503 /* Note that if we're looking at some other [:...:] construct, 504 we just treat it as a bunch of ordinary characters. We can do 505 this because we assume regex has checked for syntax errors before 506 dfa is ever called. */ 507 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 508 { 509#define BRACKET_BUFFER_SIZE 128 510 char str[BRACKET_BUFFER_SIZE]; 511 wc1 = wc; 512 wc = fetch_wc(_("Unbalanced [")); 513 514 /* If pattern contains `[[:', `[[.', or `[[='. */ 515 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 516 { 517 unsigned char c; 518 unsigned char delim = (unsigned char)wc; 519 int len = 0; 520 for (;;) 521 { 522 if (! lexleft) 523 dfaerror (_("Unbalanced [")); 524 c = (unsigned char) *lexptr++; 525 --lexleft; 526 527 if ((c == delim && *lexptr == ']') || lexleft == 0) 528 break; 529 if (len < BRACKET_BUFFER_SIZE) 530 str[len++] = c; 531 else 532 /* This is in any case an invalid class name. */ 533 str[0] = '\0'; 534 } 535 str[len] = '\0'; 536 537 if (lexleft == 0) 538 { 539 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 540 work_mbc->nchars + 2); 541 work_mbc->chars[work_mbc->nchars++] = L'['; 542 work_mbc->chars[work_mbc->nchars++] = delim; 543 break; 544 } 545 546 if (--lexleft, *lexptr++ != ']') 547 dfaerror (_("Unbalanced [")); 548 if (delim == ':') 549 /* build character class. */ 550 { 551 wctype_t wt; 552 /* Query the character class as wctype_t. */ 553 wt = wctype (str); 554 555 if (ch_classes_al == 0) 556 MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); 557 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 558 ch_classes_al, 559 work_mbc->nch_classes + 1); 560 work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 561 562 } 563 else if (delim == '=' || delim == '.') 564 { 565 char *elem; 566 MALLOC(elem, char, len + 1); 567 strncpy(elem, str, len + 1); 568 569 if (delim == '=') 570 /* build equivalent class. */ 571 { 572 if (equivs_al == 0) 573 MALLOC(work_mbc->equivs, char*, ++equivs_al); 574 REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 575 equivs_al, 576 work_mbc->nequivs + 1); 577 work_mbc->equivs[work_mbc->nequivs++] = elem; 578 } 579 580 if (delim == '.') 581 /* build collating element. */ 582 { 583 if (coll_elems_al == 0) 584 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 585 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 586 coll_elems_al, 587 work_mbc->ncoll_elems + 1); 588 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 589 } 590 } 591 wc = -1; 592 } 593 else 594 /* We treat '[' as a normal character here. */ 595 { 596 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 597 } 598 } 599 else 600 { 601 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 602 wc = fetch_wc(("Unbalanced [")); 603 } 604 605 if (wc1 == -1) 606 wc1 = fetch_wc(_("Unbalanced [")); 607 608 if (wc1 == L'-') 609 /* build range characters. */ 610 { 611 wc2 = fetch_wc(_("Unbalanced [")); 612 if (wc2 == L']') 613 { 614 /* In the case [x-], the - is an ordinary hyphen, 615 which is left in c1, the lookahead character. */ 616 lexptr -= cur_mb_len; 617 lexleft += cur_mb_len; 618 wc2 = wc; 619 } 620 else 621 { 622 if (wc2 == L'\\' 623 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 624 wc2 = fetch_wc(_("Unbalanced [")); 625 wc1 = fetch_wc(_("Unbalanced [")); 626 } 627 628 if (range_sts_al == 0) 629 { 630 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 631 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 632 } 633 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 634 range_sts_al, work_mbc->nranges + 1); 635 work_mbc->range_sts[work_mbc->nranges] = wc; 636 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 637 range_ends_al, work_mbc->nranges + 1); 638 work_mbc->range_ends[work_mbc->nranges++] = wc2; 639 } 640 else if (wc != -1) 641 /* build normal characters. */ 642 { 643 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 644 work_mbc->nchars + 1); 645 work_mbc->chars[work_mbc->nchars++] = wc; 646 } 647 } 648 while ((wc = wc1) != L']'); 649} 650#endif /* MBS_SUPPORT */ 651 652#ifdef __STDC__ 653#define FUNC(F, P) static int F(int c) { return P(c); } 654#else 655#define FUNC(F, P) static int F(c) int c; { return P(c); } 656#endif 657 658FUNC(is_alpha, ISALPHA) 659FUNC(is_upper, ISUPPER) 660FUNC(is_lower, ISLOWER) 661FUNC(is_digit, ISDIGIT) 662FUNC(is_xdigit, ISXDIGIT) 663FUNC(is_space, ISSPACE) 664FUNC(is_punct, ISPUNCT) 665FUNC(is_alnum, ISALNUM) 666FUNC(is_print, ISPRINT) 667FUNC(is_graph, ISGRAPH) 668FUNC(is_cntrl, ISCNTRL) 669 670static int 671is_blank (int c) 672{ 673 return (c == ' ' || c == '\t'); 674} 675 676/* The following list maps the names of the Posix named character classes 677 to predicate functions that determine whether a given character is in 678 the class. The leading [ has already been eaten by the lexical analyzer. */ 679static struct { 680 const char *name; 681 int (*pred) PARAMS ((int)); 682} const prednames[] = { 683 { ":alpha:]", is_alpha }, 684 { ":upper:]", is_upper }, 685 { ":lower:]", is_lower }, 686 { ":digit:]", is_digit }, 687 { ":xdigit:]", is_xdigit }, 688 { ":space:]", is_space }, 689 { ":punct:]", is_punct }, 690 { ":alnum:]", is_alnum }, 691 { ":print:]", is_print }, 692 { ":graph:]", is_graph }, 693 { ":cntrl:]", is_cntrl }, 694 { ":blank:]", is_blank }, 695 { 0 } 696}; 697 698/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 699#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 700 701static int 702looking_at (char const *s) 703{ 704 size_t len; 705 706 len = strlen(s); 707 if (lexleft < len) 708 return 0; 709 return strncmp(s, lexptr, len) == 0; 710} 711 712static token 713lex (void) 714{ 715 unsigned c, c1, c2; 716 int backslash = 0, invert; 717 charclass ccl; 718 int i; 719 720 /* Basic plan: We fetch a character. If it's a backslash, 721 we set the backslash flag and go through the loop again. 722 On the plus side, this avoids having a duplicate of the 723 main switch inside the backslash case. On the minus side, 724 it means that just about every case begins with 725 "if (backslash) ...". */ 726 for (i = 0; i < 2; ++i) 727 { 728 FETCH(c, 0); 729#ifdef MBS_SUPPORT 730 if (MB_CUR_MAX > 1 && cur_mb_index) 731 /* If this is a part of a multi-byte character, we must treat 732 this byte data as a normal character. 733 e.g. In case of SJIS encoding, some character contains '\', 734 but they must not be backslash. */ 735 goto normal_char; 736#endif /* MBS_SUPPORT */ 737 switch (c) 738 { 739 case '\\': 740 if (backslash) 741 goto normal_char; 742 if (lexleft == 0) 743 dfaerror(_("Unfinished \\ escape")); 744 backslash = 1; 745 break; 746 747 case '^': 748 if (backslash) 749 goto normal_char; 750 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 751 || lasttok == END 752 || lasttok == LPAREN 753 || lasttok == OR) 754 return lasttok = BEGLINE; 755 goto normal_char; 756 757 case '$': 758 if (backslash) 759 goto normal_char; 760 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 761 || lexleft == 0 762 || (syntax_bits & RE_NO_BK_PARENS 763 ? lexleft > 0 && *lexptr == ')' 764 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 765 || (syntax_bits & RE_NO_BK_VBAR 766 ? lexleft > 0 && *lexptr == '|' 767 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 768 || ((syntax_bits & RE_NEWLINE_ALT) 769 && lexleft > 0 && *lexptr == '\n')) 770 return lasttok = ENDLINE; 771 goto normal_char; 772 773 case '1': 774 case '2': 775 case '3': 776 case '4': 777 case '5': 778 case '6': 779 case '7': 780 case '8': 781 case '9': 782 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 783 { 784 laststart = 0; 785 return lasttok = BACKREF; 786 } 787 goto normal_char; 788 789 case '`': 790 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 791 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 792 goto normal_char; 793 794 case '\'': 795 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 796 return lasttok = ENDLINE; /* FIXME: should be end of string */ 797 goto normal_char; 798 799 case '<': 800 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 801 return lasttok = BEGWORD; 802 goto normal_char; 803 804 case '>': 805 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 806 return lasttok = ENDWORD; 807 goto normal_char; 808 809 case 'b': 810 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 811 return lasttok = LIMWORD; 812 goto normal_char; 813 814 case 'B': 815 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 816 return lasttok = NOTLIMWORD; 817 goto normal_char; 818 819 case '?': 820 if (syntax_bits & RE_LIMITED_OPS) 821 goto normal_char; 822 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 823 goto normal_char; 824 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 825 goto normal_char; 826 return lasttok = QMARK; 827 828 case '*': 829 if (backslash) 830 goto normal_char; 831 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 832 goto normal_char; 833 return lasttok = STAR; 834 835 case '+': 836 if (syntax_bits & RE_LIMITED_OPS) 837 goto normal_char; 838 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 839 goto normal_char; 840 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 841 goto normal_char; 842 return lasttok = PLUS; 843 844 case '{': 845 if (!(syntax_bits & RE_INTERVALS)) 846 goto normal_char; 847 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 848 goto normal_char; 849 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 850 goto normal_char; 851 852 if (syntax_bits & RE_NO_BK_BRACES) 853 { 854 /* Scan ahead for a valid interval; if it's not valid, 855 treat it as a literal '{'. */ 856 int lo = -1, hi = -1; 857 char const *p = lexptr; 858 char const *lim = p + lexleft; 859 for (; p != lim && ISASCIIDIGIT (*p); p++) 860 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 861 if (p != lim && *p == ',') 862 while (++p != lim && ISASCIIDIGIT (*p)) 863 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 864 else 865 hi = lo; 866 if (p == lim || *p != '}' 867 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 868 goto normal_char; 869 } 870 871 minrep = 0; 872 /* Cases: 873 {M} - exact count 874 {M,} - minimum count, maximum is infinity 875 {M,N} - M through N */ 876 FETCH(c, _("unfinished repeat count")); 877 if (ISASCIIDIGIT (c)) 878 { 879 minrep = c - '0'; 880 for (;;) 881 { 882 FETCH(c, _("unfinished repeat count")); 883 if (! ISASCIIDIGIT (c)) 884 break; 885 minrep = 10 * minrep + c - '0'; 886 } 887 } 888 else 889 dfaerror(_("malformed repeat count")); 890 if (c == ',') 891 { 892 FETCH (c, _("unfinished repeat count")); 893 if (! ISASCIIDIGIT (c)) 894 maxrep = -1; 895 else 896 { 897 maxrep = c - '0'; 898 for (;;) 899 { 900 FETCH (c, _("unfinished repeat count")); 901 if (! ISASCIIDIGIT (c)) 902 break; 903 maxrep = 10 * maxrep + c - '0'; 904 } 905 if (0 <= maxrep && maxrep < minrep) 906 dfaerror (_("malformed repeat count")); 907 } 908 } 909 else 910 maxrep = minrep; 911 if (!(syntax_bits & RE_NO_BK_BRACES)) 912 { 913 if (c != '\\') 914 dfaerror(_("malformed repeat count")); 915 FETCH(c, _("unfinished repeat count")); 916 } 917 if (c != '}') 918 dfaerror(_("malformed repeat count")); 919 laststart = 0; 920 return lasttok = REPMN; 921 922 case '|': 923 if (syntax_bits & RE_LIMITED_OPS) 924 goto normal_char; 925 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 926 goto normal_char; 927 laststart = 1; 928 return lasttok = OR; 929 930 case '\n': 931 if (syntax_bits & RE_LIMITED_OPS 932 || backslash 933 || !(syntax_bits & RE_NEWLINE_ALT)) 934 goto normal_char; 935 laststart = 1; 936 return lasttok = OR; 937 938 case '(': 939 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 940 goto normal_char; 941 ++parens; 942 laststart = 1; 943 return lasttok = LPAREN; 944 945 case ')': 946 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 947 goto normal_char; 948 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 949 goto normal_char; 950 --parens; 951 laststart = 0; 952 return lasttok = RPAREN; 953 954 case '.': 955 if (backslash) 956 goto normal_char; 957#ifdef MBS_SUPPORT 958 if (MB_CUR_MAX > 1) 959 { 960 /* In multibyte environment period must match with a single 961 character not a byte. So we use ANYCHAR. */ 962 laststart = 0; 963 return lasttok = ANYCHAR; 964 } 965#endif /* MBS_SUPPORT */ 966 zeroset(ccl); 967 notset(ccl); 968 if (!(syntax_bits & RE_DOT_NEWLINE)) 969 clrbit(eolbyte, ccl); 970 if (syntax_bits & RE_DOT_NOT_NULL) 971 clrbit('\0', ccl); 972 laststart = 0; 973 return lasttok = CSET + charclass_index(ccl); 974 975 case 'w': 976 case 'W': 977 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 978 goto normal_char; 979 zeroset(ccl); 980 for (c2 = 0; c2 < NOTCHAR; ++c2) 981 if (IS_WORD_CONSTITUENT(c2)) 982 setbit(c2, ccl); 983 if (c == 'W') 984 notset(ccl); 985 laststart = 0; 986 return lasttok = CSET + charclass_index(ccl); 987 988 case '[': 989 if (backslash) 990 goto normal_char; 991 laststart = 0; 992#ifdef MBS_SUPPORT 993 if (MB_CUR_MAX > 1) 994 { 995 /* In multibyte environment a bracket expression may contain 996 multibyte characters, which must be treated as characters 997 (not bytes). So we parse it by parse_bracket_exp_mb(). */ 998 parse_bracket_exp_mb(); 999 return lasttok = MBCSET; 1000 } 1001#endif 1002 zeroset(ccl); 1003 FETCH(c, _("Unbalanced [")); 1004 if (c == '^') 1005 { 1006 FETCH(c, _("Unbalanced [")); 1007 invert = 1; 1008 } 1009 else 1010 invert = 0; 1011 do 1012 { 1013 /* Nobody ever said this had to be fast. :-) 1014 Note that if we're looking at some other [:...:] 1015 construct, we just treat it as a bunch of ordinary 1016 characters. We can do this because we assume 1017 regex has checked for syntax errors before 1018 dfa is ever called. */ 1019 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 1020 for (c1 = 0; prednames[c1].name; ++c1) 1021 if (looking_at(prednames[c1].name)) 1022 { 1023 int (*pred) PARAMS ((int)) = prednames[c1].pred; 1024 1025 for (c2 = 0; c2 < NOTCHAR; ++c2) 1026 if ((*pred)(c2)) 1027 setbit_case_fold (c2, ccl); 1028 lexptr += strlen(prednames[c1].name); 1029 lexleft -= strlen(prednames[c1].name); 1030 FETCH(c1, _("Unbalanced [")); 1031 goto skip; 1032 } 1033 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1034 FETCH(c, _("Unbalanced [")); 1035 FETCH(c1, _("Unbalanced [")); 1036 if (c1 == '-') 1037 { 1038 FETCH(c2, _("Unbalanced [")); 1039 if (c2 == ']') 1040 { 1041 /* In the case [x-], the - is an ordinary hyphen, 1042 which is left in c1, the lookahead character. */ 1043 --lexptr; 1044 ++lexleft; 1045 } 1046 else 1047 { 1048 if (c2 == '\\' 1049 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1050 FETCH(c2, _("Unbalanced [")); 1051 FETCH(c1, _("Unbalanced [")); 1052 if (!hard_LC_COLLATE) { 1053 for (; c <= c2; c++) 1054 setbit_case_fold (c, ccl); 1055 } else { 1056 /* POSIX locales are painful - leave the decision to libc */ 1057 char expr[6] = { '[', c, '-', c2, ']', '\0' }; 1058 regex_t re; 1059 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1060 for (c = 0; c < NOTCHAR; ++c) { 1061 char buf[2] = { c, '\0' }; 1062 regmatch_t mat; 1063 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1064 && mat.rm_so == 0 && mat.rm_eo == 1) 1065 setbit_case_fold (c, ccl); 1066 } 1067 regfree (&re); 1068 } 1069 } 1070 continue; 1071 } 1072 } 1073 1074 setbit_case_fold (c, ccl); 1075 1076 skip: 1077 ; 1078 } 1079 while ((c = c1) != ']'); 1080 if (invert) 1081 { 1082 notset(ccl); 1083 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1084 clrbit(eolbyte, ccl); 1085 } 1086 return lasttok = CSET + charclass_index(ccl); 1087 1088 default: 1089 normal_char: 1090 laststart = 0; 1091 if (case_fold && ISALPHA(c)) 1092 { 1093 zeroset(ccl); 1094 setbit_case_fold (c, ccl); 1095 return lasttok = CSET + charclass_index(ccl); 1096 } 1097 return c; 1098 } 1099 } 1100 1101 /* The above loop should consume at most a backslash 1102 and some other character. */ 1103 abort(); 1104 return END; /* keeps pedantic compilers happy. */ 1105} 1106 1107/* Recursive descent parser for regular expressions. */ 1108 1109static token tok; /* Lookahead token. */ 1110static int depth; /* Current depth of a hypothetical stack 1111 holding deferred productions. This is 1112 used to determine the depth that will be 1113 required of the real stack later on in 1114 dfaanalyze(). */ 1115 1116/* Add the given token to the parse tree, maintaining the depth count and 1117 updating the maximum depth if necessary. */ 1118static void 1119addtok (token t) 1120{ 1121#ifdef MBS_SUPPORT 1122 if (MB_CUR_MAX > 1) 1123 { 1124 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1125 dfa->tindex); 1126 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1127 if (t == MBCSET) 1128 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1129 else if (t < NOTCHAR) 1130 dfa->multibyte_prop[dfa->tindex] 1131 = (cur_mb_len == 1)? 3 /* single-byte char */ 1132 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1133 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1134 else 1135 /* It may be unnecesssary, but it is safer to treat other 1136 symbols as singlebyte characters. */ 1137 dfa->multibyte_prop[dfa->tindex] = 3; 1138 } 1139#endif 1140 1141 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 1142 dfa->tokens[dfa->tindex++] = t; 1143 1144 switch (t) 1145 { 1146 case QMARK: 1147 case STAR: 1148 case PLUS: 1149 break; 1150 1151 case CAT: 1152 case OR: 1153 case ORTOP: 1154 --depth; 1155 break; 1156 1157 default: 1158 ++dfa->nleaves; 1159 case EMPTY: 1160 ++depth; 1161 break; 1162 } 1163 if (depth > dfa->depth) 1164 dfa->depth = depth; 1165} 1166 1167/* The grammar understood by the parser is as follows. 1168 1169 regexp: 1170 regexp OR branch 1171 branch 1172 1173 branch: 1174 branch closure 1175 closure 1176 1177 closure: 1178 closure QMARK 1179 closure STAR 1180 closure PLUS 1181 closure REPMN 1182 atom 1183 1184 atom: 1185 <normal character> 1186 <multibyte character> 1187 ANYCHAR 1188 MBCSET 1189 CSET 1190 BACKREF 1191 BEGLINE 1192 ENDLINE 1193 BEGWORD 1194 ENDWORD 1195 LIMWORD 1196 NOTLIMWORD 1197 CRANGE 1198 LPAREN regexp RPAREN 1199 <empty> 1200 1201 The parser builds a parse tree in postfix form in an array of tokens. */ 1202 1203static void 1204atom (void) 1205{ 1206 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 1207 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1208#ifdef MBS_SUPPORT 1209 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1210#endif /* MBS_SUPPORT */ 1211 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 1212 { 1213 addtok(tok); 1214 tok = lex(); 1215#ifdef MBS_SUPPORT 1216 /* We treat a multibyte character as a single atom, so that DFA 1217 can treat a multibyte character as a single expression. 1218 1219 e.g. We construct following tree from "<mb1><mb2>". 1220 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1221 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1222 */ 1223 if (MB_CUR_MAX > 1) 1224 { 1225 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1226 { 1227 addtok(tok); 1228 addtok(CAT); 1229 tok = lex(); 1230 } 1231 } 1232#endif /* MBS_SUPPORT */ 1233 } 1234 else if (tok == CRANGE) 1235 { 1236 /* A character range like "[a-z]" in a locale other than "C" or 1237 "POSIX". This range might any sequence of one or more 1238 characters. Unfortunately the POSIX locale primitives give 1239 us no practical way to find what character sequences might be 1240 matched. Treat this approximately like "(.\1)" -- i.e. match 1241 one character, and then punt to the full matcher. */ 1242 charclass ccl; 1243 zeroset (ccl); 1244 notset (ccl); 1245 addtok (CSET + charclass_index (ccl)); 1246 addtok (BACKREF); 1247 addtok (CAT); 1248 tok = lex (); 1249 } 1250 else if (tok == LPAREN) 1251 { 1252 tok = lex(); 1253 regexp(0); 1254 if (tok != RPAREN) 1255 dfaerror(_("Unbalanced (")); 1256 tok = lex(); 1257 } 1258 else 1259 addtok(EMPTY); 1260} 1261 1262/* Return the number of tokens in the given subexpression. */ 1263static int 1264nsubtoks (int tindex) 1265{ 1266 int ntoks1; 1267 1268 switch (dfa->tokens[tindex - 1]) 1269 { 1270 default: 1271 return 1; 1272 case QMARK: 1273 case STAR: 1274 case PLUS: 1275 return 1 + nsubtoks(tindex - 1); 1276 case CAT: 1277 case OR: 1278 case ORTOP: 1279 ntoks1 = nsubtoks(tindex - 1); 1280 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 1281 } 1282} 1283 1284/* Copy the given subexpression to the top of the tree. */ 1285static void 1286copytoks (int tindex, int ntokens) 1287{ 1288 int i; 1289 1290 for (i = 0; i < ntokens; ++i) 1291 addtok(dfa->tokens[tindex + i]); 1292} 1293 1294static void 1295closure (void) 1296{ 1297 int tindex, ntokens, i; 1298 1299 atom(); 1300 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 1301 if (tok == REPMN) 1302 { 1303 ntokens = nsubtoks(dfa->tindex); 1304 tindex = dfa->tindex - ntokens; 1305 if (maxrep < 0) 1306 addtok(PLUS); 1307 if (minrep == 0) 1308 addtok(QMARK); 1309 for (i = 1; i < minrep; ++i) 1310 { 1311 copytoks(tindex, ntokens); 1312 addtok(CAT); 1313 } 1314 for (; i < maxrep; ++i) 1315 { 1316 copytoks(tindex, ntokens); 1317 addtok(QMARK); 1318 addtok(CAT); 1319 } 1320 tok = lex(); 1321 } 1322 else 1323 { 1324 addtok(tok); 1325 tok = lex(); 1326 } 1327} 1328 1329static void 1330branch (void) 1331{ 1332 closure(); 1333 while (tok != RPAREN && tok != OR && tok >= 0) 1334 { 1335 closure(); 1336 addtok(CAT); 1337 } 1338} 1339 1340static void 1341regexp (int toplevel) 1342{ 1343 branch(); 1344 while (tok == OR) 1345 { 1346 tok = lex(); 1347 branch(); 1348 if (toplevel) 1349 addtok(ORTOP); 1350 else 1351 addtok(OR); 1352 } 1353} 1354 1355/* Main entry point for the parser. S is a string to be parsed, len is the 1356 length of the string, so s can include NUL characters. D is a pointer to 1357 the struct dfa to parse into. */ 1358void 1359dfaparse (char const *s, size_t len, struct dfa *d) 1360{ 1361 dfa = d; 1362 lexstart = lexptr = s; 1363 lexleft = len; 1364 lasttok = END; 1365 laststart = 1; 1366 parens = 0; 1367#if ENABLE_NLS 1368 hard_LC_COLLATE = hard_locale (LC_COLLATE); 1369#endif 1370#ifdef MBS_SUPPORT 1371 if (MB_CUR_MAX > 1) 1372 { 1373 cur_mb_index = 0; 1374 cur_mb_len = 0; 1375 memset(&mbs, 0, sizeof(mbstate_t)); 1376 } 1377#endif /* MBS_SUPPORT */ 1378 1379 if (! syntax_bits_set) 1380 dfaerror(_("No syntax specified")); 1381 1382 tok = lex(); 1383 depth = d->depth; 1384 1385 regexp(1); 1386 1387 if (tok != END) 1388 dfaerror(_("Unbalanced )")); 1389 1390 addtok(END - d->nregexps); 1391 addtok(CAT); 1392 1393 if (d->nregexps) 1394 addtok(ORTOP); 1395 1396 ++d->nregexps; 1397} 1398 1399/* Some primitives for operating on sets of positions. */ 1400 1401/* Copy one set to another; the destination must be large enough. */ 1402static void 1403copy (position_set const *src, position_set *dst) 1404{ 1405 int i; 1406 1407 for (i = 0; i < src->nelem; ++i) 1408 dst->elems[i] = src->elems[i]; 1409 dst->nelem = src->nelem; 1410} 1411 1412/* Insert a position in a set. Position sets are maintained in sorted 1413 order according to index. If position already exists in the set with 1414 the same index then their constraints are logically or'd together. 1415 S->elems must point to an array large enough to hold the resulting set. */ 1416static void 1417insert (position p, position_set *s) 1418{ 1419 int i; 1420 position t1, t2; 1421 1422 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1423 continue; 1424 if (i < s->nelem && p.index == s->elems[i].index) 1425 s->elems[i].constraint |= p.constraint; 1426 else 1427 { 1428 t1 = p; 1429 ++s->nelem; 1430 while (i < s->nelem) 1431 { 1432 t2 = s->elems[i]; 1433 s->elems[i++] = t1; 1434 t1 = t2; 1435 } 1436 } 1437} 1438 1439/* Merge two sets of positions into a third. The result is exactly as if 1440 the positions of both sets were inserted into an initially empty set. */ 1441static void 1442merge (position_set const *s1, position_set const *s2, position_set *m) 1443{ 1444 int i = 0, j = 0; 1445 1446 m->nelem = 0; 1447 while (i < s1->nelem && j < s2->nelem) 1448 if (s1->elems[i].index > s2->elems[j].index) 1449 m->elems[m->nelem++] = s1->elems[i++]; 1450 else if (s1->elems[i].index < s2->elems[j].index) 1451 m->elems[m->nelem++] = s2->elems[j++]; 1452 else 1453 { 1454 m->elems[m->nelem] = s1->elems[i++]; 1455 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1456 } 1457 while (i < s1->nelem) 1458 m->elems[m->nelem++] = s1->elems[i++]; 1459 while (j < s2->nelem) 1460 m->elems[m->nelem++] = s2->elems[j++]; 1461} 1462 1463/* Delete a position from a set. */ 1464static void 1465delete (position p, position_set *s) 1466{ 1467 int i; 1468 1469 for (i = 0; i < s->nelem; ++i) 1470 if (p.index == s->elems[i].index) 1471 break; 1472 if (i < s->nelem) 1473 for (--s->nelem; i < s->nelem; ++i) 1474 s->elems[i] = s->elems[i + 1]; 1475} 1476 1477/* Find the index of the state corresponding to the given position set with 1478 the given preceding context, or create a new state if there is no such 1479 state. Newline and letter tell whether we got here on a newline or 1480 letter, respectively. */ 1481static int 1482state_index (struct dfa *d, position_set const *s, int newline, int letter) 1483{ 1484 int hash = 0; 1485 int constraint; 1486 int i, j; 1487 1488 newline = newline ? 1 : 0; 1489 letter = letter ? 1 : 0; 1490 1491 for (i = 0; i < s->nelem; ++i) 1492 hash ^= s->elems[i].index + s->elems[i].constraint; 1493 1494 /* Try to find a state that exactly matches the proposed one. */ 1495 for (i = 0; i < d->sindex; ++i) 1496 { 1497 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1498 || newline != d->states[i].newline || letter != d->states[i].letter) 1499 continue; 1500 for (j = 0; j < s->nelem; ++j) 1501 if (s->elems[j].constraint 1502 != d->states[i].elems.elems[j].constraint 1503 || s->elems[j].index != d->states[i].elems.elems[j].index) 1504 break; 1505 if (j == s->nelem) 1506 return i; 1507 } 1508 1509 /* We'll have to create a new state. */ 1510 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1511 d->states[i].hash = hash; 1512 MALLOC(d->states[i].elems.elems, position, s->nelem); 1513 copy(s, &d->states[i].elems); 1514 d->states[i].newline = newline; 1515 d->states[i].letter = letter; 1516 d->states[i].backref = 0; 1517 d->states[i].constraint = 0; 1518 d->states[i].first_end = 0; 1519#ifdef MBS_SUPPORT 1520 if (MB_CUR_MAX > 1) 1521 d->states[i].mbps.nelem = 0; 1522#endif 1523 for (j = 0; j < s->nelem; ++j) 1524 if (d->tokens[s->elems[j].index] < 0) 1525 { 1526 constraint = s->elems[j].constraint; 1527 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1528 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1529 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1530 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1531 d->states[i].constraint |= constraint; 1532 if (! d->states[i].first_end) 1533 d->states[i].first_end = d->tokens[s->elems[j].index]; 1534 } 1535 else if (d->tokens[s->elems[j].index] == BACKREF) 1536 { 1537 d->states[i].constraint = NO_CONSTRAINT; 1538 d->states[i].backref = 1; 1539 } 1540 1541 ++d->sindex; 1542 1543 return i; 1544} 1545 1546/* Find the epsilon closure of a set of positions. If any position of the set 1547 contains a symbol that matches the empty string in some context, replace 1548 that position with the elements of its follow labeled with an appropriate 1549 constraint. Repeat exhaustively until no funny positions are left. 1550 S->elems must be large enough to hold the result. */ 1551static void 1552epsclosure (position_set *s, struct dfa const *d) 1553{ 1554 int i, j; 1555 int *visited; 1556 position p, old; 1557 1558 MALLOC(visited, int, d->tindex); 1559 for (i = 0; i < d->tindex; ++i) 1560 visited[i] = 0; 1561 1562 for (i = 0; i < s->nelem; ++i) 1563 if (d->tokens[s->elems[i].index] >= NOTCHAR 1564 && d->tokens[s->elems[i].index] != BACKREF 1565#ifdef MBS_SUPPORT 1566 && d->tokens[s->elems[i].index] != ANYCHAR 1567 && d->tokens[s->elems[i].index] != MBCSET 1568#endif 1569 && d->tokens[s->elems[i].index] < CSET) 1570 { 1571 old = s->elems[i]; 1572 p.constraint = old.constraint; 1573 delete(s->elems[i], s); 1574 if (visited[old.index]) 1575 { 1576 --i; 1577 continue; 1578 } 1579 visited[old.index] = 1; 1580 switch (d->tokens[old.index]) 1581 { 1582 case BEGLINE: 1583 p.constraint &= BEGLINE_CONSTRAINT; 1584 break; 1585 case ENDLINE: 1586 p.constraint &= ENDLINE_CONSTRAINT; 1587 break; 1588 case BEGWORD: 1589 p.constraint &= BEGWORD_CONSTRAINT; 1590 break; 1591 case ENDWORD: 1592 p.constraint &= ENDWORD_CONSTRAINT; 1593 break; 1594 case LIMWORD: 1595 p.constraint &= LIMWORD_CONSTRAINT; 1596 break; 1597 case NOTLIMWORD: 1598 p.constraint &= NOTLIMWORD_CONSTRAINT; 1599 break; 1600 default: 1601 break; 1602 } 1603 for (j = 0; j < d->follows[old.index].nelem; ++j) 1604 { 1605 p.index = d->follows[old.index].elems[j].index; 1606 insert(p, s); 1607 } 1608 /* Force rescan to start at the beginning. */ 1609 i = -1; 1610 } 1611 1612 free(visited); 1613} 1614 1615/* Perform bottom-up analysis on the parse tree, computing various functions. 1616 Note that at this point, we're pretending constructs like \< are real 1617 characters rather than constraints on what can follow them. 1618 1619 Nullable: A node is nullable if it is at the root of a regexp that can 1620 match the empty string. 1621 * EMPTY leaves are nullable. 1622 * No other leaf is nullable. 1623 * A QMARK or STAR node is nullable. 1624 * A PLUS node is nullable if its argument is nullable. 1625 * A CAT node is nullable if both its arguments are nullable. 1626 * An OR node is nullable if either argument is nullable. 1627 1628 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1629 that could correspond to the first character of a string matching the 1630 regexp rooted at the given node. 1631 * EMPTY leaves have empty firstpos. 1632 * The firstpos of a nonempty leaf is that leaf itself. 1633 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1634 argument. 1635 * The firstpos of a CAT node is the firstpos of the left argument, union 1636 the firstpos of the right if the left argument is nullable. 1637 * The firstpos of an OR node is the union of firstpos of each argument. 1638 1639 Lastpos: The lastpos of a node is the set of positions that could 1640 correspond to the last character of a string matching the regexp at 1641 the given node. 1642 * EMPTY leaves have empty lastpos. 1643 * The lastpos of a nonempty leaf is that leaf itself. 1644 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1645 argument. 1646 * The lastpos of a CAT node is the lastpos of its right argument, union 1647 the lastpos of the left if the right argument is nullable. 1648 * The lastpos of an OR node is the union of the lastpos of each argument. 1649 1650 Follow: The follow of a position is the set of positions that could 1651 correspond to the character following a character matching the node in 1652 a string matching the regexp. At this point we consider special symbols 1653 that match the empty string in some context to be just normal characters. 1654 Later, if we find that a special symbol is in a follow set, we will 1655 replace it with the elements of its follow, labeled with an appropriate 1656 constraint. 1657 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1658 the follow of every node in the lastpos. 1659 * Every node in the firstpos of the second argument of a CAT node is in 1660 the follow of every node in the lastpos of the first argument. 1661 1662 Because of the postfix representation of the parse tree, the depth-first 1663 analysis is conveniently done by a linear scan with the aid of a stack. 1664 Sets are stored as arrays of the elements, obeying a stack-like allocation 1665 scheme; the number of elements in each set deeper in the stack can be 1666 used to determine the address of a particular set's array. */ 1667void 1668dfaanalyze (struct dfa *d, int searchflag) 1669{ 1670 int *nullable; /* Nullable stack. */ 1671 int *nfirstpos; /* Element count stack for firstpos sets. */ 1672 position *firstpos; /* Array where firstpos elements are stored. */ 1673 int *nlastpos; /* Element count stack for lastpos sets. */ 1674 position *lastpos; /* Array where lastpos elements are stored. */ 1675 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1676 position_set tmp; /* Temporary set for merging sets. */ 1677 position_set merged; /* Result of merging sets. */ 1678 int wants_newline; /* True if some position wants newline info. */ 1679 int *o_nullable; 1680 int *o_nfirst, *o_nlast; 1681 position *o_firstpos, *o_lastpos; 1682 int i, j; 1683 position *pos; 1684 1685#ifdef DEBUG 1686 fprintf(stderr, "dfaanalyze:\n"); 1687 for (i = 0; i < d->tindex; ++i) 1688 { 1689 fprintf(stderr, " %d:", i); 1690 prtok(d->tokens[i]); 1691 } 1692 putc('\n', stderr); 1693#endif 1694 1695 d->searchflag = searchflag; 1696 1697 MALLOC(nullable, int, d->depth); 1698 o_nullable = nullable; 1699 MALLOC(nfirstpos, int, d->depth); 1700 o_nfirst = nfirstpos; 1701 MALLOC(firstpos, position, d->nleaves); 1702 o_firstpos = firstpos, firstpos += d->nleaves; 1703 MALLOC(nlastpos, int, d->depth); 1704 o_nlast = nlastpos; 1705 MALLOC(lastpos, position, d->nleaves); 1706 o_lastpos = lastpos, lastpos += d->nleaves; 1707 MALLOC(nalloc, int, d->tindex); 1708 for (i = 0; i < d->tindex; ++i) 1709 nalloc[i] = 0; 1710 MALLOC(merged.elems, position, d->nleaves); 1711 1712 CALLOC(d->follows, position_set, d->tindex); 1713 1714 for (i = 0; i < d->tindex; ++i) 1715#ifdef DEBUG 1716 { /* Nonsyntactic #ifdef goo... */ 1717#endif 1718 switch (d->tokens[i]) 1719 { 1720 case EMPTY: 1721 /* The empty set is nullable. */ 1722 *nullable++ = 1; 1723 1724 /* The firstpos and lastpos of the empty leaf are both empty. */ 1725 *nfirstpos++ = *nlastpos++ = 0; 1726 break; 1727 1728 case STAR: 1729 case PLUS: 1730 /* Every element in the firstpos of the argument is in the follow 1731 of every element in the lastpos. */ 1732 tmp.nelem = nfirstpos[-1]; 1733 tmp.elems = firstpos; 1734 pos = lastpos; 1735 for (j = 0; j < nlastpos[-1]; ++j) 1736 { 1737 merge(&tmp, &d->follows[pos[j].index], &merged); 1738 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1739 nalloc[pos[j].index], merged.nelem - 1); 1740 copy(&merged, &d->follows[pos[j].index]); 1741 } 1742 1743 case QMARK: 1744 /* A QMARK or STAR node is automatically nullable. */ 1745 if (d->tokens[i] != PLUS) 1746 nullable[-1] = 1; 1747 break; 1748 1749 case CAT: 1750 /* Every element in the firstpos of the second argument is in the 1751 follow of every element in the lastpos of the first argument. */ 1752 tmp.nelem = nfirstpos[-1]; 1753 tmp.elems = firstpos; 1754 pos = lastpos + nlastpos[-1]; 1755 for (j = 0; j < nlastpos[-2]; ++j) 1756 { 1757 merge(&tmp, &d->follows[pos[j].index], &merged); 1758 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1759 nalloc[pos[j].index], merged.nelem - 1); 1760 copy(&merged, &d->follows[pos[j].index]); 1761 } 1762 1763 /* The firstpos of a CAT node is the firstpos of the first argument, 1764 union that of the second argument if the first is nullable. */ 1765 if (nullable[-2]) 1766 nfirstpos[-2] += nfirstpos[-1]; 1767 else 1768 firstpos += nfirstpos[-1]; 1769 --nfirstpos; 1770 1771 /* The lastpos of a CAT node is the lastpos of the second argument, 1772 union that of the first argument if the second is nullable. */ 1773 if (nullable[-1]) 1774 nlastpos[-2] += nlastpos[-1]; 1775 else 1776 { 1777 pos = lastpos + nlastpos[-2]; 1778 for (j = nlastpos[-1] - 1; j >= 0; --j) 1779 pos[j] = lastpos[j]; 1780 lastpos += nlastpos[-2]; 1781 nlastpos[-2] = nlastpos[-1]; 1782 } 1783 --nlastpos; 1784 1785 /* A CAT node is nullable if both arguments are nullable. */ 1786 nullable[-2] = nullable[-1] && nullable[-2]; 1787 --nullable; 1788 break; 1789 1790 case OR: 1791 case ORTOP: 1792 /* The firstpos is the union of the firstpos of each argument. */ 1793 nfirstpos[-2] += nfirstpos[-1]; 1794 --nfirstpos; 1795 1796 /* The lastpos is the union of the lastpos of each argument. */ 1797 nlastpos[-2] += nlastpos[-1]; 1798 --nlastpos; 1799 1800 /* An OR node is nullable if either argument is nullable. */ 1801 nullable[-2] = nullable[-1] || nullable[-2]; 1802 --nullable; 1803 break; 1804 1805 default: 1806 /* Anything else is a nonempty position. (Note that special 1807 constructs like \< are treated as nonempty strings here; 1808 an "epsilon closure" effectively makes them nullable later. 1809 Backreferences have to get a real position so we can detect 1810 transitions on them later. But they are nullable. */ 1811 *nullable++ = d->tokens[i] == BACKREF; 1812 1813 /* This position is in its own firstpos and lastpos. */ 1814 *nfirstpos++ = *nlastpos++ = 1; 1815 --firstpos, --lastpos; 1816 firstpos->index = lastpos->index = i; 1817 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1818 1819 /* Allocate the follow set for this position. */ 1820 nalloc[i] = 1; 1821 MALLOC(d->follows[i].elems, position, nalloc[i]); 1822 break; 1823 } 1824#ifdef DEBUG 1825 /* ... balance the above nonsyntactic #ifdef goo... */ 1826 fprintf(stderr, "node %d:", i); 1827 prtok(d->tokens[i]); 1828 putc('\n', stderr); 1829 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1830 fprintf(stderr, " firstpos:"); 1831 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1832 { 1833 fprintf(stderr, " %d:", firstpos[j].index); 1834 prtok(d->tokens[firstpos[j].index]); 1835 } 1836 fprintf(stderr, "\n lastpos:"); 1837 for (j = nlastpos[-1] - 1; j >= 0; --j) 1838 { 1839 fprintf(stderr, " %d:", lastpos[j].index); 1840 prtok(d->tokens[lastpos[j].index]); 1841 } 1842 putc('\n', stderr); 1843 } 1844#endif 1845 1846 /* For each follow set that is the follow set of a real position, replace 1847 it with its epsilon closure. */ 1848 for (i = 0; i < d->tindex; ++i) 1849 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1850#ifdef MBS_SUPPORT 1851 || d->tokens[i] == ANYCHAR 1852 || d->tokens[i] == MBCSET 1853#endif 1854 || d->tokens[i] >= CSET) 1855 { 1856#ifdef DEBUG 1857 fprintf(stderr, "follows(%d:", i); 1858 prtok(d->tokens[i]); 1859 fprintf(stderr, "):"); 1860 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1861 { 1862 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1863 prtok(d->tokens[d->follows[i].elems[j].index]); 1864 } 1865 putc('\n', stderr); 1866#endif 1867 copy(&d->follows[i], &merged); 1868 epsclosure(&merged, d); 1869 if (d->follows[i].nelem < merged.nelem) 1870 REALLOC(d->follows[i].elems, position, merged.nelem); 1871 copy(&merged, &d->follows[i]); 1872 } 1873 1874 /* Get the epsilon closure of the firstpos of the regexp. The result will 1875 be the set of positions of state 0. */ 1876 merged.nelem = 0; 1877 for (i = 0; i < nfirstpos[-1]; ++i) 1878 insert(firstpos[i], &merged); 1879 epsclosure(&merged, d); 1880 1881 /* Check if any of the positions of state 0 will want newline context. */ 1882 wants_newline = 0; 1883 for (i = 0; i < merged.nelem; ++i) 1884 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1885 wants_newline = 1; 1886 1887 /* Build the initial state. */ 1888 d->salloc = 1; 1889 d->sindex = 0; 1890 MALLOC(d->states, dfa_state, d->salloc); 1891 state_index(d, &merged, wants_newline, 0); 1892 1893 free(o_nullable); 1894 free(o_nfirst); 1895 free(o_firstpos); 1896 free(o_nlast); 1897 free(o_lastpos); 1898 free(nalloc); 1899 free(merged.elems); 1900} 1901 1902/* Find, for each character, the transition out of state s of d, and store 1903 it in the appropriate slot of trans. 1904 1905 We divide the positions of s into groups (positions can appear in more 1906 than one group). Each group is labeled with a set of characters that 1907 every position in the group matches (taking into account, if necessary, 1908 preceding context information of s). For each group, find the union 1909 of the its elements' follows. This set is the set of positions of the 1910 new state. For each character in the group's label, set the transition 1911 on this character to be to a state corresponding to the set's positions, 1912 and its associated backward context information, if necessary. 1913 1914 If we are building a searching matcher, we include the positions of state 1915 0 in every state. 1916 1917 The collection of groups is constructed by building an equivalence-class 1918 partition of the positions of s. 1919 1920 For each position, find the set of characters C that it matches. Eliminate 1921 any characters from C that fail on grounds of backward context. 1922 1923 Search through the groups, looking for a group whose label L has nonempty 1924 intersection with C. If L - C is nonempty, create a new group labeled 1925 L - C and having the same positions as the current group, and set L to 1926 the intersection of L and C. Insert the position in this group, set 1927 C = C - L, and resume scanning. 1928 1929 If after comparing with every group there are characters remaining in C, 1930 create a new group labeled with the characters of C and insert this 1931 position in that group. */ 1932void 1933dfastate (int s, struct dfa *d, int trans[]) 1934{ 1935 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1936 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1937 int ngrps = 0; /* Number of groups actually used. */ 1938 position pos; /* Current position being considered. */ 1939 charclass matches; /* Set of matching characters. */ 1940 int matchesf; /* True if matches is nonempty. */ 1941 charclass intersect; /* Intersection with some label set. */ 1942 int intersectf; /* True if intersect is nonempty. */ 1943 charclass leftovers; /* Stuff in the label that didn't match. */ 1944 int leftoversf; /* True if leftovers is nonempty. */ 1945 static charclass letters; /* Set of characters considered letters. */ 1946 static charclass newline; /* Set of characters that aren't newline. */ 1947 position_set follows; /* Union of the follows of some group. */ 1948 position_set tmp; /* Temporary space for merging sets. */ 1949 int state; /* New state. */ 1950 int wants_newline; /* New state wants to know newline context. */ 1951 int state_newline; /* New state on a newline transition. */ 1952 int wants_letter; /* New state wants to know letter context. */ 1953 int state_letter; /* New state on a letter transition. */ 1954 static int initialized; /* Flag for static initialization. */ 1955#ifdef MBS_SUPPORT 1956 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1957#endif 1958 int i, j, k; 1959 1960 /* Initialize the set of letters, if necessary. */ 1961 if (! initialized) 1962 { 1963 initialized = 1; 1964 for (i = 0; i < NOTCHAR; ++i) 1965 if (IS_WORD_CONSTITUENT(i)) 1966 setbit(i, letters); 1967 setbit(eolbyte, newline); 1968 } 1969 1970 zeroset(matches); 1971 1972 for (i = 0; i < d->states[s].elems.nelem; ++i) 1973 { 1974 pos = d->states[s].elems.elems[i]; 1975 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1976 setbit(d->tokens[pos.index], matches); 1977 else if (d->tokens[pos.index] >= CSET) 1978 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1979#ifdef MBS_SUPPORT 1980 else if (d->tokens[pos.index] == ANYCHAR 1981 || d->tokens[pos.index] == MBCSET) 1982 /* MB_CUR_MAX > 1 */ 1983 { 1984 /* ANYCHAR and MBCSET must match with a single character, so we 1985 must put it to d->states[s].mbps, which contains the positions 1986 which can match with a single character not a byte. */ 1987 if (d->states[s].mbps.nelem == 0) 1988 { 1989 MALLOC(d->states[s].mbps.elems, position, 1990 d->states[s].elems.nelem); 1991 } 1992 insert(pos, &(d->states[s].mbps)); 1993 continue; 1994 } 1995#endif /* MBS_SUPPORT */ 1996 else 1997 continue; 1998 1999 /* Some characters may need to be eliminated from matches because 2000 they fail in the current context. */ 2001 if (pos.constraint != 0xFF) 2002 { 2003 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2004 d->states[s].newline, 1)) 2005 clrbit(eolbyte, matches); 2006 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2007 d->states[s].newline, 0)) 2008 for (j = 0; j < CHARCLASS_INTS; ++j) 2009 matches[j] &= newline[j]; 2010 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2011 d->states[s].letter, 1)) 2012 for (j = 0; j < CHARCLASS_INTS; ++j) 2013 matches[j] &= ~letters[j]; 2014 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2015 d->states[s].letter, 0)) 2016 for (j = 0; j < CHARCLASS_INTS; ++j) 2017 matches[j] &= letters[j]; 2018 2019 /* If there are no characters left, there's no point in going on. */ 2020 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 2021 continue; 2022 if (j == CHARCLASS_INTS) 2023 continue; 2024 } 2025 2026 for (j = 0; j < ngrps; ++j) 2027 { 2028 /* If matches contains a single character only, and the current 2029 group's label doesn't contain that character, go on to the 2030 next group. */ 2031 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 2032 && !tstbit(d->tokens[pos.index], labels[j])) 2033 continue; 2034 2035 /* Check if this group's label has a nonempty intersection with 2036 matches. */ 2037 intersectf = 0; 2038 for (k = 0; k < CHARCLASS_INTS; ++k) 2039 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 2040 if (! intersectf) 2041 continue; 2042 2043 /* It does; now find the set differences both ways. */ 2044 leftoversf = matchesf = 0; 2045 for (k = 0; k < CHARCLASS_INTS; ++k) 2046 { 2047 /* Even an optimizing compiler can't know this for sure. */ 2048 int match = matches[k], label = labels[j][k]; 2049 2050 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 2051 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 2052 } 2053 2054 /* If there were leftovers, create a new group labeled with them. */ 2055 if (leftoversf) 2056 { 2057 copyset(leftovers, labels[ngrps]); 2058 copyset(intersect, labels[j]); 2059 MALLOC(grps[ngrps].elems, position, d->nleaves); 2060 copy(&grps[j], &grps[ngrps]); 2061 ++ngrps; 2062 } 2063 2064 /* Put the position in the current group. Note that there is no 2065 reason to call insert() here. */ 2066 grps[j].elems[grps[j].nelem++] = pos; 2067 2068 /* If every character matching the current position has been 2069 accounted for, we're done. */ 2070 if (! matchesf) 2071 break; 2072 } 2073 2074 /* If we've passed the last group, and there are still characters 2075 unaccounted for, then we'll have to create a new group. */ 2076 if (j == ngrps) 2077 { 2078 copyset(matches, labels[ngrps]); 2079 zeroset(matches); 2080 MALLOC(grps[ngrps].elems, position, d->nleaves); 2081 grps[ngrps].nelem = 1; 2082 grps[ngrps].elems[0] = pos; 2083 ++ngrps; 2084 } 2085 } 2086 2087 MALLOC(follows.elems, position, d->nleaves); 2088 MALLOC(tmp.elems, position, d->nleaves); 2089 2090 /* If we are a searching matcher, the default transition is to a state 2091 containing the positions of state 0, otherwise the default transition 2092 is to fail miserably. */ 2093 if (d->searchflag) 2094 { 2095 wants_newline = 0; 2096 wants_letter = 0; 2097 for (i = 0; i < d->states[0].elems.nelem; ++i) 2098 { 2099 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2100 wants_newline = 1; 2101 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2102 wants_letter = 1; 2103 } 2104 copy(&d->states[0].elems, &follows); 2105 state = state_index(d, &follows, 0, 0); 2106 if (wants_newline) 2107 state_newline = state_index(d, &follows, 1, 0); 2108 else 2109 state_newline = state; 2110 if (wants_letter) 2111 state_letter = state_index(d, &follows, 0, 1); 2112 else 2113 state_letter = state; 2114 for (i = 0; i < NOTCHAR; ++i) 2115 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 2116 trans[eolbyte] = state_newline; 2117 } 2118 else 2119 for (i = 0; i < NOTCHAR; ++i) 2120 trans[i] = -1; 2121 2122 for (i = 0; i < ngrps; ++i) 2123 { 2124 follows.nelem = 0; 2125 2126 /* Find the union of the follows of the positions of the group. 2127 This is a hideously inefficient loop. Fix it someday. */ 2128 for (j = 0; j < grps[i].nelem; ++j) 2129 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 2130 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 2131 2132#ifdef MBS_SUPPORT 2133 if (MB_CUR_MAX > 1) 2134 { 2135 /* If a token in follows.elems is not 1st byte of a multibyte 2136 character, or the states of follows must accept the bytes 2137 which are not 1st byte of the multibyte character. 2138 Then, if a state of follows encounter a byte, it must not be 2139 a 1st byte of a multibyte character nor singlebyte character. 2140 We cansel to add state[0].follows to next state, because 2141 state[0] must accept 1st-byte 2142 2143 For example, we assume <sb a> is a certain singlebyte 2144 character, <mb A> is a certain multibyte character, and the 2145 codepoint of <sb a> equals the 2nd byte of the codepoint of 2146 <mb A>. 2147 When state[0] accepts <sb a>, state[i] transit to state[i+1] 2148 by accepting accepts 1st byte of <mb A>, and state[i+1] 2149 accepts 2nd byte of <mb A>, if state[i+1] encounter the 2150 codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2151 <mb A>, so we can not add state[0]. */ 2152 2153 next_isnt_1st_byte = 0; 2154 for (j = 0; j < follows.nelem; ++j) 2155 { 2156 if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2157 { 2158 next_isnt_1st_byte = 1; 2159 break; 2160 } 2161 } 2162 } 2163#endif 2164 2165 /* If we are building a searching matcher, throw in the positions 2166 of state 0 as well. */ 2167#ifdef MBS_SUPPORT 2168 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2169#else 2170 if (d->searchflag) 2171#endif 2172 for (j = 0; j < d->states[0].elems.nelem; ++j) 2173 insert(d->states[0].elems.elems[j], &follows); 2174 2175 /* Find out if the new state will want any context information. */ 2176 wants_newline = 0; 2177 if (tstbit(eolbyte, labels[i])) 2178 for (j = 0; j < follows.nelem; ++j) 2179 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 2180 wants_newline = 1; 2181 2182 wants_letter = 0; 2183 for (j = 0; j < CHARCLASS_INTS; ++j) 2184 if (labels[i][j] & letters[j]) 2185 break; 2186 if (j < CHARCLASS_INTS) 2187 for (j = 0; j < follows.nelem; ++j) 2188 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 2189 wants_letter = 1; 2190 2191 /* Find the state(s) corresponding to the union of the follows. */ 2192 state = state_index(d, &follows, 0, 0); 2193 if (wants_newline) 2194 state_newline = state_index(d, &follows, 1, 0); 2195 else 2196 state_newline = state; 2197 if (wants_letter) 2198 state_letter = state_index(d, &follows, 0, 1); 2199 else 2200 state_letter = state; 2201 2202 /* Set the transitions for each character in the current label. */ 2203 for (j = 0; j < CHARCLASS_INTS; ++j) 2204 for (k = 0; k < INTBITS; ++k) 2205 if (labels[i][j] & 1 << k) 2206 { 2207 int c = j * INTBITS + k; 2208 2209 if (c == eolbyte) 2210 trans[c] = state_newline; 2211 else if (IS_WORD_CONSTITUENT(c)) 2212 trans[c] = state_letter; 2213 else if (c < NOTCHAR) 2214 trans[c] = state; 2215 } 2216 } 2217 2218 for (i = 0; i < ngrps; ++i) 2219 free(grps[i].elems); 2220 free(follows.elems); 2221 free(tmp.elems); 2222} 2223 2224/* Some routines for manipulating a compiled dfa's transition tables. 2225 Each state may or may not have a transition table; if it does, and it 2226 is a non-accepting state, then d->trans[state] points to its table. 2227 If it is an accepting state then d->fails[state] points to its table. 2228 If it has no table at all, then d->trans[state] is NULL. 2229 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 2230 2231static void 2232build_state (int s, struct dfa *d) 2233{ 2234 int *trans; /* The new transition table. */ 2235 int i; 2236 2237 /* Set an upper limit on the number of transition tables that will ever 2238 exist at once. 1024 is arbitrary. The idea is that the frequently 2239 used transition tables will be quickly rebuilt, whereas the ones that 2240 were only needed once or twice will be cleared away. */ 2241 if (d->trcount >= 1024) 2242 { 2243 for (i = 0; i < d->tralloc; ++i) 2244 if (d->trans[i]) 2245 { 2246 free((ptr_t) d->trans[i]); 2247 d->trans[i] = NULL; 2248 } 2249 else if (d->fails[i]) 2250 { 2251 free((ptr_t) d->fails[i]); 2252 d->fails[i] = NULL; 2253 } 2254 d->trcount = 0; 2255 } 2256 2257 ++d->trcount; 2258 2259 /* Set up the success bits for this state. */ 2260 d->success[s] = 0; 2261 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 2262 s, *d)) 2263 d->success[s] |= 4; 2264 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 2265 s, *d)) 2266 d->success[s] |= 2; 2267 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 2268 s, *d)) 2269 d->success[s] |= 1; 2270 2271 MALLOC(trans, int, NOTCHAR); 2272 dfastate(s, d, trans); 2273 2274 /* Now go through the new transition table, and make sure that the trans 2275 and fail arrays are allocated large enough to hold a pointer for the 2276 largest state mentioned in the table. */ 2277 for (i = 0; i < NOTCHAR; ++i) 2278 if (trans[i] >= d->tralloc) 2279 { 2280 int oldalloc = d->tralloc; 2281 2282 while (trans[i] >= d->tralloc) 2283 d->tralloc *= 2; 2284 REALLOC(d->realtrans, int *, d->tralloc + 1); 2285 d->trans = d->realtrans + 1; 2286 REALLOC(d->fails, int *, d->tralloc); 2287 REALLOC(d->success, int, d->tralloc); 2288 while (oldalloc < d->tralloc) 2289 { 2290 d->trans[oldalloc] = NULL; 2291 d->fails[oldalloc++] = NULL; 2292 } 2293 } 2294 2295 /* Newline is a sentinel. */ 2296 trans[eolbyte] = -1; 2297 2298 if (ACCEPTING(s, *d)) 2299 d->fails[s] = trans; 2300 else 2301 d->trans[s] = trans; 2302} 2303 2304static void 2305build_state_zero (struct dfa *d) 2306{ 2307 d->tralloc = 1; 2308 d->trcount = 0; 2309 CALLOC(d->realtrans, int *, d->tralloc + 1); 2310 d->trans = d->realtrans + 1; 2311 CALLOC(d->fails, int *, d->tralloc); 2312 MALLOC(d->success, int, d->tralloc); 2313 build_state(0, d); 2314} 2315 2316#ifdef MBS_SUPPORT 2317/* Multibyte character handling sub-routins for dfaexec. */ 2318 2319/* Initial state may encounter the byte which is not a singlebyte character 2320 nor 1st byte of a multibyte character. But it is incorrect for initial 2321 state to accept such a byte. 2322 For example, in sjis encoding the regular expression like "\\" accepts 2323 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2324 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2325 character nor 1st byte of a multibyte character. */ 2326#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2327 if (s == 0) \ 2328 { \ 2329 while (inputwcs[p - buf_begin] == 0 \ 2330 && mblen_buf[p - buf_begin] > 0 \ 2331 && p < buf_end) \ 2332 ++p; \ 2333 if (p >= end) \ 2334 { \ 2335 free(mblen_buf); \ 2336 free(inputwcs); \ 2337 return (size_t) -1; \ 2338 } \ 2339 } 2340 2341static void 2342realloc_trans_if_necessary(struct dfa *d, int new_state) 2343{ 2344 /* Make sure that the trans and fail arrays are allocated large enough 2345 to hold a pointer for the new state. */ 2346 if (new_state >= d->tralloc) 2347 { 2348 int oldalloc = d->tralloc; 2349 2350 while (new_state >= d->tralloc) 2351 d->tralloc *= 2; 2352 REALLOC(d->realtrans, int *, d->tralloc + 1); 2353 d->trans = d->realtrans + 1; 2354 REALLOC(d->fails, int *, d->tralloc); 2355 REALLOC(d->success, int, d->tralloc); 2356 while (oldalloc < d->tralloc) 2357 { 2358 d->trans[oldalloc] = NULL; 2359 d->fails[oldalloc++] = NULL; 2360 } 2361 } 2362} 2363 2364/* Return values of transit_state_singlebyte(), and 2365 transit_state_consume_1char. */ 2366typedef enum 2367{ 2368 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2369 TRANSIT_STATE_DONE, /* State transition has finished. */ 2370 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2371} status_transit_state; 2372 2373/* Consume a single byte and transit state from 's' to '*next_state'. 2374 This function is almost same as the state transition routin in dfaexec(). 2375 But state transition is done just once, otherwise matching succeed or 2376 reach the end of the buffer. */ 2377static status_transit_state 2378transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2379 int *next_state) 2380{ 2381 int *t; 2382 int works = s; 2383 2384 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2385 2386 while (rval == TRANSIT_STATE_IN_PROGRESS) 2387 { 2388 if ((t = d->trans[works]) != NULL) 2389 { 2390 works = t[*p]; 2391 rval = TRANSIT_STATE_DONE; 2392 if (works < 0) 2393 works = 0; 2394 } 2395 else if (works < 0) 2396 { 2397 if (p == buf_end) 2398 /* At the moment, it must not happen. */ 2399 return TRANSIT_STATE_END_BUFFER; 2400 works = 0; 2401 } 2402 else if (d->fails[works]) 2403 { 2404 works = d->fails[works][*p]; 2405 rval = TRANSIT_STATE_DONE; 2406 } 2407 else 2408 { 2409 build_state(works, d); 2410 } 2411 } 2412 *next_state = works; 2413 return rval; 2414} 2415 2416/* Check whether period can match or not in the current context. If it can, 2417 return the amount of the bytes with which period can match, otherwise 2418 return 0. 2419 `pos' is the position of the period. `index' is the index from the 2420 buf_begin, and it is the current position in the buffer. */ 2421static int 2422match_anychar (struct dfa *d, int s, position pos, int index) 2423{ 2424 int newline = 0; 2425 int letter = 0; 2426 wchar_t wc; 2427 int mbclen; 2428 2429 wc = inputwcs[index]; 2430 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2431 2432 /* Check context. */ 2433 if (wc == (wchar_t)eolbyte) 2434 { 2435 if (!(syntax_bits & RE_DOT_NEWLINE)) 2436 return 0; 2437 newline = 1; 2438 } 2439 else if (wc == (wchar_t)'\0') 2440 { 2441 if (syntax_bits & RE_DOT_NOT_NULL) 2442 return 0; 2443 newline = 1; 2444 } 2445 2446 if (iswalnum(wc) || wc == L'_') 2447 letter = 1; 2448 2449 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2450 newline, d->states[s].letter, letter)) 2451 return 0; 2452 2453 return mbclen; 2454} 2455 2456/* Check whether bracket expression can match or not in the current context. 2457 If it can, return the amount of the bytes with which expression can match, 2458 otherwise return 0. 2459 `pos' is the position of the bracket expression. `index' is the index 2460 from the buf_begin, and it is the current position in the buffer. */ 2461int 2462match_mb_charset (struct dfa *d, int s, position pos, int index) 2463{ 2464 int i; 2465 int match; /* Flag which represent that matching succeed. */ 2466 int match_len; /* Length of the character (or collating element) 2467 with which this operator match. */ 2468 int op_len; /* Length of the operator. */ 2469 char buffer[128]; 2470 wchar_t wcbuf[6]; 2471 2472 /* Pointer to the structure to which we are currently reffering. */ 2473 struct mb_char_classes *work_mbc; 2474 2475 int newline = 0; 2476 int letter = 0; 2477 wchar_t wc; /* Current reffering character. */ 2478 2479 wc = inputwcs[index]; 2480 2481 /* Check context. */ 2482 if (wc == (wchar_t)eolbyte) 2483 { 2484 if (!(syntax_bits & RE_DOT_NEWLINE)) 2485 return 0; 2486 newline = 1; 2487 } 2488 else if (wc == (wchar_t)'\0') 2489 { 2490 if (syntax_bits & RE_DOT_NOT_NULL) 2491 return 0; 2492 newline = 1; 2493 } 2494 if (iswalnum(wc) || wc == L'_') 2495 letter = 1; 2496 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2497 newline, d->states[s].letter, letter)) 2498 return 0; 2499 2500 /* Assign the current reffering operator to work_mbc. */ 2501 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2502 match = !work_mbc->invert; 2503 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2504 2505 /* match with a character class? */ 2506 for (i = 0; i<work_mbc->nch_classes; i++) 2507 { 2508 if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2509 goto charset_matched; 2510 } 2511 2512 strncpy(buffer, buf_begin + index, match_len); 2513 buffer[match_len] = '\0'; 2514 2515 /* match with an equivalent class? */ 2516 for (i = 0; i<work_mbc->nequivs; i++) 2517 { 2518 op_len = strlen(work_mbc->equivs[i]); 2519 strncpy(buffer, buf_begin + index, op_len); 2520 buffer[op_len] = '\0'; 2521 if (strcoll(work_mbc->equivs[i], buffer) == 0) 2522 { 2523 match_len = op_len; 2524 goto charset_matched; 2525 } 2526 } 2527 2528 /* match with a collating element? */ 2529 for (i = 0; i<work_mbc->ncoll_elems; i++) 2530 { 2531 op_len = strlen(work_mbc->coll_elems[i]); 2532 strncpy(buffer, buf_begin + index, op_len); 2533 buffer[op_len] = '\0'; 2534 2535 if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2536 { 2537 match_len = op_len; 2538 goto charset_matched; 2539 } 2540 } 2541 2542 wcbuf[0] = wc; 2543 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2544 2545 /* match with a range? */ 2546 for (i = 0; i<work_mbc->nranges; i++) 2547 { 2548 wcbuf[2] = work_mbc->range_sts[i]; 2549 wcbuf[4] = work_mbc->range_ends[i]; 2550 2551 if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2552 wcscoll(wcbuf+4, wcbuf) >= 0) 2553 goto charset_matched; 2554 } 2555 2556 /* match with a character? */ 2557 for (i = 0; i<work_mbc->nchars; i++) 2558 { 2559 if (wc == work_mbc->chars[i]) 2560 goto charset_matched; 2561 } 2562 2563 match = !match; 2564 2565 charset_matched: 2566 return match ? match_len : 0; 2567} 2568 2569/* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2570 array which corresponds to `d->states[s].mbps.elem' and each element of 2571 the array contains the amount of the bytes with which the element can 2572 match. 2573 `index' is the index from the buf_begin, and it is the current position 2574 in the buffer. 2575 Caller MUST free the array which this function return. */ 2576static int* 2577check_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2578{ 2579 int i; 2580 int* rarray; 2581 2582 MALLOC(rarray, int, d->states[s].mbps.nelem); 2583 for (i = 0; i < d->states[s].mbps.nelem; ++i) 2584 { 2585 position pos = d->states[s].mbps.elems[i]; 2586 switch(d->tokens[pos.index]) 2587 { 2588 case ANYCHAR: 2589 rarray[i] = match_anychar(d, s, pos, index); 2590 break; 2591 case MBCSET: 2592 rarray[i] = match_mb_charset(d, s, pos, index); 2593 break; 2594 default: 2595 break; /* can not happen. */ 2596 } 2597 } 2598 return rarray; 2599} 2600 2601/* Consume a single character and enumerate all of the positions which can 2602 be next position from the state `s'. 2603 `match_lens' is the input. It can be NULL, but it can also be the output 2604 of check_matching_with_multibyte_ops() for optimization. 2605 `mbclen' and `pps' are the output. `mbclen' is the length of the 2606 character consumed, and `pps' is the set this function enumerate. */ 2607static status_transit_state 2608transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2609 int *match_lens, int *mbclen, position_set *pps) 2610{ 2611 int i, j; 2612 int s1, s2; 2613 int* work_mbls; 2614 status_transit_state rs = TRANSIT_STATE_DONE; 2615 2616 /* Calculate the length of the (single/multi byte) character 2617 to which p points. */ 2618 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2619 : mblen_buf[*pp - buf_begin]; 2620 2621 /* Calculate the state which can be reached from the state `s' by 2622 consuming `*mbclen' single bytes from the buffer. */ 2623 s1 = s; 2624 for (i = 0; i < *mbclen; i++) 2625 { 2626 s2 = s1; 2627 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2628 } 2629 /* Copy the positions contained by `s1' to the set `pps'. */ 2630 copy(&(d->states[s1].elems), pps); 2631 2632 /* Check (inputed)match_lens, and initialize if it is NULL. */ 2633 if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2634 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2635 else 2636 work_mbls = match_lens; 2637 2638 /* Add all of the positions which can be reached from `s' by consuming 2639 a single character. */ 2640 for (i = 0; i < d->states[s].mbps.nelem ; i++) 2641 { 2642 if (work_mbls[i] == *mbclen) 2643 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2644 j++) 2645 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2646 pps); 2647 } 2648 2649 if (match_lens == NULL && work_mbls != NULL) 2650 free(work_mbls); 2651 return rs; 2652} 2653 2654/* Transit state from s, then return new state and update the pointer of the 2655 buffer. This function is for some operator which can match with a multi- 2656 byte character or a collating element(which may be multi characters). */ 2657static int 2658transit_state (struct dfa *d, int s, unsigned char const **pp) 2659{ 2660 int s1; 2661 int mbclen; /* The length of current input multibyte character. */ 2662 int maxlen = 0; 2663 int i, j; 2664 int *match_lens = NULL; 2665 int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2666 position_set follows; 2667 unsigned char const *p1 = *pp; 2668 status_transit_state rs; 2669 wchar_t wc; 2670 2671 if (nelem > 0) 2672 /* This state has (a) multibyte operator(s). 2673 We check whether each of them can match or not. */ 2674 { 2675 /* Note: caller must free the return value of this function. */ 2676 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2677 2678 for (i = 0; i < nelem; i++) 2679 /* Search the operator which match the longest string, 2680 in this state. */ 2681 { 2682 if (match_lens[i] > maxlen) 2683 maxlen = match_lens[i]; 2684 } 2685 } 2686 2687 if (nelem == 0 || maxlen == 0) 2688 /* This state has no multibyte operator which can match. 2689 We need to check only one singlebyte character. */ 2690 { 2691 status_transit_state rs; 2692 rs = transit_state_singlebyte(d, s, *pp, &s1); 2693 2694 /* We must update the pointer if state transition succeeded. */ 2695 if (rs == TRANSIT_STATE_DONE) 2696 ++*pp; 2697 2698 if (match_lens != NULL) 2699 free(match_lens); 2700 return s1; 2701 } 2702 2703 /* This state has some operators which can match a multibyte character. */ 2704 follows.nelem = 0; 2705 MALLOC(follows.elems, position, d->nleaves); 2706 2707 /* `maxlen' may be longer than the length of a character, because it may 2708 not be a character but a (multi character) collating element. 2709 We enumerate all of the positions which `s' can reach by consuming 2710 `maxlen' bytes. */ 2711 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2712 2713 wc = inputwcs[*pp - mbclen - buf_begin]; 2714 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2715 realloc_trans_if_necessary(d, s1); 2716 2717 while (*pp - p1 < maxlen) 2718 { 2719 follows.nelem = 0; 2720 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2721 2722 for (i = 0; i < nelem ; i++) 2723 { 2724 if (match_lens[i] == *pp - p1) 2725 for (j = 0; 2726 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2727 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2728 &follows); 2729 } 2730 2731 wc = inputwcs[*pp - mbclen - buf_begin]; 2732 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2733 realloc_trans_if_necessary(d, s1); 2734 } 2735 free(match_lens); 2736 free(follows.elems); 2737 return s1; 2738} 2739 2740#endif 2741 2742/* Search through a buffer looking for a match to the given struct dfa. 2743 Find the first occurrence of a string matching the regexp in the buffer, 2744 and the shortest possible version thereof. Return the offset of the first 2745 character after the match, or (size_t) -1 if none is found. BEGIN points to 2746 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2747 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 2748 where we're supposed to store a 1 if backreferencing happened and the 2749 match needs to be verified by a backtracking matcher. Otherwise 2750 we store a 0 in *backref. */ 2751size_t 2752dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 2753{ 2754 register int s; /* Current state. */ 2755 register unsigned char const *p; /* Current input character. */ 2756 register unsigned char const *end; /* One past the last input character. */ 2757 register int **trans, *t; /* Copy of d->trans so it can be optimized 2758 into a register. */ 2759 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 2760 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 2761 static int sbit_init; 2762 2763 if (! sbit_init) 2764 { 2765 int i; 2766 2767 sbit_init = 1; 2768 for (i = 0; i < NOTCHAR; ++i) 2769 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 2770 sbit[eol] = 4; 2771 } 2772 2773 if (! d->tralloc) 2774 build_state_zero(d); 2775 2776 s = 0; 2777 p = (unsigned char const *) begin; 2778 end = p + size; 2779 trans = d->trans; 2780 2781#ifdef MBS_SUPPORT 2782 if (MB_CUR_MAX > 1) 2783 { 2784 int remain_bytes, i; 2785 buf_begin = begin; 2786 buf_end = end; 2787 2788 /* initialize mblen_buf, and inputwcs. */ 2789 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2790 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2791 memset(&mbs, 0, sizeof(mbstate_t)); 2792 remain_bytes = 0; 2793 for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2794 { 2795 if (remain_bytes == 0) 2796 { 2797 remain_bytes 2798 = mbrtowc(inputwcs + i, begin + i, 2799 end - (unsigned char const *)begin - i + 1, &mbs); 2800 if (remain_bytes <= 1) 2801 { 2802 remain_bytes = 0; 2803 inputwcs[i] = (wchar_t)begin[i]; 2804 mblen_buf[i] = 0; 2805 } 2806 else 2807 { 2808 mblen_buf[i] = remain_bytes; 2809 remain_bytes--; 2810 } 2811 } 2812 else 2813 { 2814 mblen_buf[i] = remain_bytes; 2815 inputwcs[i] = 0; 2816 remain_bytes--; 2817 } 2818 } 2819 mblen_buf[i] = 0; 2820 inputwcs[i] = 0; /* sentinel */ 2821 } 2822#endif /* MBS_SUPPORT */ 2823 2824 for (;;) 2825 { 2826#ifdef MBS_SUPPORT 2827 if (MB_CUR_MAX > 1) 2828 while ((t = trans[s])) 2829 { 2830 if (d->states[s].mbps.nelem != 0) 2831 { 2832 /* Can match with a multibyte character( and multi character 2833 collating element). */ 2834 unsigned char const *nextp; 2835 2836 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2837 2838 nextp = p; 2839 s = transit_state(d, s, &nextp); 2840 p = nextp; 2841 2842 /* Trans table might be updated. */ 2843 trans = d->trans; 2844 } 2845 else 2846 { 2847 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2848 s = t[*p++]; 2849 } 2850 } 2851 else 2852#endif /* MBS_SUPPORT */ 2853 while ((t = trans[s])) 2854 s = t[*p++]; 2855 2856 if (s < 0) 2857 { 2858 if (p == end) 2859 { 2860#ifdef MBS_SUPPORT 2861 if (MB_CUR_MAX > 1) 2862 { 2863 free(mblen_buf); 2864 free(inputwcs); 2865 } 2866#endif /* MBS_SUPPORT */ 2867 return (size_t) -1; 2868 } 2869 s = 0; 2870 } 2871 else if ((t = d->fails[s])) 2872 { 2873 if (d->success[s] & sbit[*p]) 2874 { 2875 if (backref) 2876 *backref = (d->states[s].backref != 0); 2877#ifdef MBS_SUPPORT 2878 if (MB_CUR_MAX > 1) 2879 { 2880 free(mblen_buf); 2881 free(inputwcs); 2882 } 2883#endif /* MBS_SUPPORT */ 2884 return (char const *) p - begin; 2885 } 2886 2887#ifdef MBS_SUPPORT 2888 if (MB_CUR_MAX > 1) 2889 { 2890 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2891 if (d->states[s].mbps.nelem != 0) 2892 { 2893 /* Can match with a multibyte character( and multi 2894 character collating element). */ 2895 unsigned char const *nextp; 2896 nextp = p; 2897 s = transit_state(d, s, &nextp); 2898 p = nextp; 2899 2900 /* Trans table might be updated. */ 2901 trans = d->trans; 2902 } 2903 else 2904 s = t[*p++]; 2905 } 2906 else 2907#endif /* MBS_SUPPORT */ 2908 s = t[*p++]; 2909 } 2910 else 2911 { 2912 build_state(s, d); 2913 trans = d->trans; 2914 } 2915 } 2916} 2917 2918/* Initialize the components of a dfa that the other routines don't 2919 initialize for themselves. */ 2920void 2921dfainit (struct dfa *d) 2922{ 2923 d->calloc = 1; 2924 MALLOC(d->charclasses, charclass, d->calloc); 2925 d->cindex = 0; 2926 2927 d->talloc = 1; 2928 MALLOC(d->tokens, token, d->talloc); 2929 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2930#ifdef MBS_SUPPORT 2931 if (MB_CUR_MAX > 1) 2932 { 2933 d->nmultibyte_prop = 1; 2934 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2935 d->nmbcsets = 0; 2936 d->mbcsets_alloc = 1; 2937 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2938 } 2939#endif 2940 2941 d->searchflag = 0; 2942 d->tralloc = 0; 2943 2944 d->musts = 0; 2945} 2946 2947/* Parse and analyze a single string of the given length. */ 2948void 2949dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 2950{ 2951 if (case_fold) /* dummy folding in service of dfamust() */ 2952 { 2953 char *lcopy; 2954 int i; 2955 2956 lcopy = malloc(len); 2957 if (!lcopy) 2958 dfaerror(_("out of memory")); 2959 2960 /* This is a kludge. */ 2961 case_fold = 0; 2962 for (i = 0; i < len; ++i) 2963 if (ISUPPER ((unsigned char) s[i])) 2964 lcopy[i] = tolower ((unsigned char) s[i]); 2965 else 2966 lcopy[i] = s[i]; 2967 2968 dfainit(d); 2969 dfaparse(lcopy, len, d); 2970 free(lcopy); 2971 dfamust(d); 2972 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2973 case_fold = 1; 2974 dfaparse(s, len, d); 2975 dfaanalyze(d, searchflag); 2976 } 2977 else 2978 { 2979 dfainit(d); 2980 dfaparse(s, len, d); 2981 dfamust(d); 2982 dfaanalyze(d, searchflag); 2983 } 2984} 2985 2986/* Free the storage held by the components of a dfa. */ 2987void 2988dfafree (struct dfa *d) 2989{ 2990 int i; 2991 struct dfamust *dm, *ndm; 2992 2993 free((ptr_t) d->charclasses); 2994 free((ptr_t) d->tokens); 2995 2996#ifdef MBS_SUPPORT 2997 if (MB_CUR_MAX > 1) 2998 { 2999 free((ptr_t) d->multibyte_prop); 3000 for (i = 0; i < d->nmbcsets; ++i) 3001 { 3002 int j; 3003 struct mb_char_classes *p = &(d->mbcsets[i]); 3004 if (p->chars != NULL) 3005 free(p->chars); 3006 if (p->ch_classes != NULL) 3007 free(p->ch_classes); 3008 if (p->range_sts != NULL) 3009 free(p->range_sts); 3010 if (p->range_ends != NULL) 3011 free(p->range_ends); 3012 3013 for (j = 0; j < p->nequivs; ++j) 3014 free(p->equivs[j]); 3015 if (p->equivs != NULL) 3016 free(p->equivs); 3017 3018 for (j = 0; j < p->ncoll_elems; ++j) 3019 free(p->coll_elems[j]); 3020 if (p->coll_elems != NULL) 3021 free(p->coll_elems); 3022 } 3023 free((ptr_t) d->mbcsets); 3024 } 3025#endif /* MBS_SUPPORT */ 3026 3027 for (i = 0; i < d->sindex; ++i) 3028 free((ptr_t) d->states[i].elems.elems); 3029 free((ptr_t) d->states); 3030 for (i = 0; i < d->tindex; ++i) 3031 if (d->follows[i].elems) 3032 free((ptr_t) d->follows[i].elems); 3033 free((ptr_t) d->follows); 3034 for (i = 0; i < d->tralloc; ++i) 3035 if (d->trans[i]) 3036 free((ptr_t) d->trans[i]); 3037 else if (d->fails[i]) 3038 free((ptr_t) d->fails[i]); 3039 if (d->realtrans) free((ptr_t) d->realtrans); 3040 if (d->fails) free((ptr_t) d->fails); 3041 if (d->success) free((ptr_t) d->success); 3042 for (dm = d->musts; dm; dm = ndm) 3043 { 3044 ndm = dm->next; 3045 free(dm->must); 3046 free((ptr_t) dm); 3047 } 3048} 3049 3050/* Having found the postfix representation of the regular expression, 3051 try to find a long sequence of characters that must appear in any line 3052 containing the r.e. 3053 Finding a "longest" sequence is beyond the scope here; 3054 we take an easy way out and hope for the best. 3055 (Take "(ab|a)b"--please.) 3056 3057 We do a bottom-up calculation of sequences of characters that must appear 3058 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3059 representation: 3060 sequences that must appear at the left of the match ("left") 3061 sequences that must appear at the right of the match ("right") 3062 lists of sequences that must appear somewhere in the match ("in") 3063 sequences that must constitute the match ("is") 3064 3065 When we get to the root of the tree, we use one of the longest of its 3066 calculated "in" sequences as our answer. The sequence we find is returned in 3067 d->must (where "d" is the single argument passed to "dfamust"); 3068 the length of the sequence is returned in d->mustn. 3069 3070 The sequences calculated for the various types of node (in pseudo ANSI c) 3071 are shown below. "p" is the operand of unary operators (and the left-hand 3072 operand of binary operators); "q" is the right-hand operand of binary 3073 operators. 3074 3075 "ZERO" means "a zero-length sequence" below. 3076 3077 Type left right is in 3078 ---- ---- ----- -- -- 3079 char c # c # c # c # c 3080 3081 ANYCHAR ZERO ZERO ZERO ZERO 3082 3083 MBCSET ZERO ZERO ZERO ZERO 3084 3085 CSET ZERO ZERO ZERO ZERO 3086 3087 STAR ZERO ZERO ZERO ZERO 3088 3089 QMARK ZERO ZERO ZERO ZERO 3090 3091 PLUS p->left p->right ZERO p->in 3092 3093 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3094 p->left : q->right : q->is!=ZERO) ? q->in plus 3095 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3096 ZERO 3097 3098 OR longest common longest common (do p->is and substrings common to 3099 leading trailing q->is have same p->in and q->in 3100 (sub)sequence (sub)sequence length and 3101 of p->left of p->right content) ? 3102 and q->left and q->right p->is : NULL 3103 3104 If there's anything else we recognize in the tree, all four sequences get set 3105 to zero-length sequences. If there's something we don't recognize in the tree, 3106 we just return a zero-length sequence. 3107 3108 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3109 'aaa')? 3110 3111 And. . .is it here or someplace that we might ponder "optimizations" such as 3112 egrep 'psi|epsilon' -> egrep 'psi' 3113 egrep 'pepsi|epsilon' -> egrep 'epsi' 3114 (Yes, we now find "epsi" as a "string 3115 that must occur", but we might also 3116 simplify the *entire* r.e. being sought) 3117 grep '[c]' -> grep 'c' 3118 grep '(ab|a)b' -> grep 'ab' 3119 grep 'ab*' -> grep 'a' 3120 grep 'a*b' -> grep 'b' 3121 3122 There are several issues: 3123 3124 Is optimization easy (enough)? 3125 3126 Does optimization actually accomplish anything, 3127 or is the automaton you get from "psi|epsilon" (for example) 3128 the same as the one you get from "psi" (for example)? 3129 3130 Are optimizable r.e.'s likely to be used in real-life situations 3131 (something like 'ab*' is probably unlikely; something like is 3132 'psi|epsilon' is likelier)? */ 3133 3134static char * 3135icatalloc (char *old, char *new) 3136{ 3137 char *result; 3138 size_t oldsize, newsize; 3139 3140 newsize = (new == NULL) ? 0 : strlen(new); 3141 if (old == NULL) 3142 oldsize = 0; 3143 else if (newsize == 0) 3144 return old; 3145 else oldsize = strlen(old); 3146 if (old == NULL) 3147 result = (char *) malloc(newsize + 1); 3148 else 3149 result = (char *) realloc((void *) old, oldsize + newsize + 1); 3150 if (result != NULL && new != NULL) 3151 (void) strcpy(result + oldsize, new); 3152 return result; 3153} 3154 3155static char * 3156icpyalloc (char *string) 3157{ 3158 return icatalloc((char *) NULL, string); 3159} 3160 3161static char * 3162istrstr (char *lookin, char *lookfor) 3163{ 3164 char *cp; 3165 size_t len; 3166 3167 len = strlen(lookfor); 3168 for (cp = lookin; *cp != '\0'; ++cp) 3169 if (strncmp(cp, lookfor, len) == 0) 3170 return cp; 3171 return NULL; 3172} 3173 3174static void 3175ifree (char *cp) 3176{ 3177 if (cp != NULL) 3178 free(cp); 3179} 3180 3181static void 3182freelist (char **cpp) 3183{ 3184 int i; 3185 3186 if (cpp == NULL) 3187 return; 3188 for (i = 0; cpp[i] != NULL; ++i) 3189 { 3190 free(cpp[i]); 3191 cpp[i] = NULL; 3192 } 3193} 3194 3195static char ** 3196enlist (char **cpp, char *new, size_t len) 3197{ 3198 int i, j; 3199 3200 if (cpp == NULL) 3201 return NULL; 3202 if ((new = icpyalloc(new)) == NULL) 3203 { 3204 freelist(cpp); 3205 return NULL; 3206 } 3207 new[len] = '\0'; 3208 /* Is there already something in the list that's new (or longer)? */ 3209 for (i = 0; cpp[i] != NULL; ++i) 3210 if (istrstr(cpp[i], new) != NULL) 3211 { 3212 free(new); 3213 return cpp; 3214 } 3215 /* Eliminate any obsoleted strings. */ 3216 j = 0; 3217 while (cpp[j] != NULL) 3218 if (istrstr(new, cpp[j]) == NULL) 3219 ++j; 3220 else 3221 { 3222 free(cpp[j]); 3223 if (--i == j) 3224 break; 3225 cpp[j] = cpp[i]; 3226 cpp[i] = NULL; 3227 } 3228 /* Add the new string. */ 3229 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 3230 if (cpp == NULL) 3231 return NULL; 3232 cpp[i] = new; 3233 cpp[i + 1] = NULL; 3234 return cpp; 3235} 3236 3237/* Given pointers to two strings, return a pointer to an allocated 3238 list of their distinct common substrings. Return NULL if something 3239 seems wild. */ 3240static char ** 3241comsubs (char *left, char *right) 3242{ 3243 char **cpp; 3244 char *lcp; 3245 char *rcp; 3246 size_t i, len; 3247 3248 if (left == NULL || right == NULL) 3249 return NULL; 3250 cpp = (char **) malloc(sizeof *cpp); 3251 if (cpp == NULL) 3252 return NULL; 3253 cpp[0] = NULL; 3254 for (lcp = left; *lcp != '\0'; ++lcp) 3255 { 3256 len = 0; 3257 rcp = strchr (right, *lcp); 3258 while (rcp != NULL) 3259 { 3260 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3261 continue; 3262 if (i > len) 3263 len = i; 3264 rcp = strchr (rcp + 1, *lcp); 3265 } 3266 if (len == 0) 3267 continue; 3268 if ((cpp = enlist(cpp, lcp, len)) == NULL) 3269 break; 3270 } 3271 return cpp; 3272} 3273 3274static char ** 3275addlists (char **old, char **new) 3276{ 3277 int i; 3278 3279 if (old == NULL || new == NULL) 3280 return NULL; 3281 for (i = 0; new[i] != NULL; ++i) 3282 { 3283 old = enlist(old, new[i], strlen(new[i])); 3284 if (old == NULL) 3285 break; 3286 } 3287 return old; 3288} 3289 3290/* Given two lists of substrings, return a new list giving substrings 3291 common to both. */ 3292static char ** 3293inboth (char **left, char **right) 3294{ 3295 char **both; 3296 char **temp; 3297 int lnum, rnum; 3298 3299 if (left == NULL || right == NULL) 3300 return NULL; 3301 both = (char **) malloc(sizeof *both); 3302 if (both == NULL) 3303 return NULL; 3304 both[0] = NULL; 3305 for (lnum = 0; left[lnum] != NULL; ++lnum) 3306 { 3307 for (rnum = 0; right[rnum] != NULL; ++rnum) 3308 { 3309 temp = comsubs(left[lnum], right[rnum]); 3310 if (temp == NULL) 3311 { 3312 freelist(both); 3313 return NULL; 3314 } 3315 both = addlists(both, temp); 3316 freelist(temp); 3317 free(temp); 3318 if (both == NULL) 3319 return NULL; 3320 } 3321 } 3322 return both; 3323} 3324 3325typedef struct 3326{ 3327 char **in; 3328 char *left; 3329 char *right; 3330 char *is; 3331} must; 3332 3333static void 3334resetmust (must *mp) 3335{ 3336 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 3337 freelist(mp->in); 3338} 3339 3340static void 3341dfamust (struct dfa *dfa) 3342{ 3343 must *musts; 3344 must *mp; 3345 char *result; 3346 int ri; 3347 int i; 3348 int exact; 3349 token t; 3350 static must must0; 3351 struct dfamust *dm; 3352 static char empty_string[] = ""; 3353 3354 result = empty_string; 3355 exact = 0; 3356 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 3357 if (musts == NULL) 3358 return; 3359 mp = musts; 3360 for (i = 0; i <= dfa->tindex; ++i) 3361 mp[i] = must0; 3362 for (i = 0; i <= dfa->tindex; ++i) 3363 { 3364 mp[i].in = (char **) malloc(sizeof *mp[i].in); 3365 mp[i].left = malloc(2); 3366 mp[i].right = malloc(2); 3367 mp[i].is = malloc(2); 3368 if (mp[i].in == NULL || mp[i].left == NULL || 3369 mp[i].right == NULL || mp[i].is == NULL) 3370 goto done; 3371 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 3372 mp[i].in[0] = NULL; 3373 } 3374#ifdef DEBUG 3375 fprintf(stderr, "dfamust:\n"); 3376 for (i = 0; i < dfa->tindex; ++i) 3377 { 3378 fprintf(stderr, " %d:", i); 3379 prtok(dfa->tokens[i]); 3380 } 3381 putc('\n', stderr); 3382#endif 3383 for (ri = 0; ri < dfa->tindex; ++ri) 3384 { 3385 switch (t = dfa->tokens[ri]) 3386 { 3387 case LPAREN: 3388 case RPAREN: 3389 goto done; /* "cannot happen" */ 3390 case EMPTY: 3391 case BEGLINE: 3392 case ENDLINE: 3393 case BEGWORD: 3394 case ENDWORD: 3395 case LIMWORD: 3396 case NOTLIMWORD: 3397 case BACKREF: 3398 resetmust(mp); 3399 break; 3400 case STAR: 3401 case QMARK: 3402 if (mp <= musts) 3403 goto done; /* "cannot happen" */ 3404 --mp; 3405 resetmust(mp); 3406 break; 3407 case OR: 3408 case ORTOP: 3409 if (mp < &musts[2]) 3410 goto done; /* "cannot happen" */ 3411 { 3412 char **new; 3413 must *lmp; 3414 must *rmp; 3415 int j, ln, rn, n; 3416 3417 rmp = --mp; 3418 lmp = --mp; 3419 /* Guaranteed to be. Unlikely, but. . . */ 3420 if (strcmp(lmp->is, rmp->is) != 0) 3421 lmp->is[0] = '\0'; 3422 /* Left side--easy */ 3423 i = 0; 3424 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 3425 ++i; 3426 lmp->left[i] = '\0'; 3427 /* Right side */ 3428 ln = strlen(lmp->right); 3429 rn = strlen(rmp->right); 3430 n = ln; 3431 if (n > rn) 3432 n = rn; 3433 for (i = 0; i < n; ++i) 3434 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 3435 break; 3436 for (j = 0; j < i; ++j) 3437 lmp->right[j] = lmp->right[(ln - i) + j]; 3438 lmp->right[j] = '\0'; 3439 new = inboth(lmp->in, rmp->in); 3440 if (new == NULL) 3441 goto done; 3442 freelist(lmp->in); 3443 free((char *) lmp->in); 3444 lmp->in = new; 3445 } 3446 break; 3447 case PLUS: 3448 if (mp <= musts) 3449 goto done; /* "cannot happen" */ 3450 --mp; 3451 mp->is[0] = '\0'; 3452 break; 3453 case END: 3454 if (mp != &musts[1]) 3455 goto done; /* "cannot happen" */ 3456 for (i = 0; musts[0].in[i] != NULL; ++i) 3457 if (strlen(musts[0].in[i]) > strlen(result)) 3458 result = musts[0].in[i]; 3459 if (strcmp(result, musts[0].is) == 0) 3460 exact = 1; 3461 goto done; 3462 case CAT: 3463 if (mp < &musts[2]) 3464 goto done; /* "cannot happen" */ 3465 { 3466 must *lmp; 3467 must *rmp; 3468 3469 rmp = --mp; 3470 lmp = --mp; 3471 /* In. Everything in left, plus everything in 3472 right, plus catenation of 3473 left's right and right's left. */ 3474 lmp->in = addlists(lmp->in, rmp->in); 3475 if (lmp->in == NULL) 3476 goto done; 3477 if (lmp->right[0] != '\0' && 3478 rmp->left[0] != '\0') 3479 { 3480 char *tp; 3481 3482 tp = icpyalloc(lmp->right); 3483 if (tp == NULL) 3484 goto done; 3485 tp = icatalloc(tp, rmp->left); 3486 if (tp == NULL) 3487 goto done; 3488 lmp->in = enlist(lmp->in, tp, 3489 strlen(tp)); 3490 free(tp); 3491 if (lmp->in == NULL) 3492 goto done; 3493 } 3494 /* Left-hand */ 3495 if (lmp->is[0] != '\0') 3496 { 3497 lmp->left = icatalloc(lmp->left, 3498 rmp->left); 3499 if (lmp->left == NULL) 3500 goto done; 3501 } 3502 /* Right-hand */ 3503 if (rmp->is[0] == '\0') 3504 lmp->right[0] = '\0'; 3505 lmp->right = icatalloc(lmp->right, rmp->right); 3506 if (lmp->right == NULL) 3507 goto done; 3508 /* Guaranteed to be */ 3509 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 3510 { 3511 lmp->is = icatalloc(lmp->is, rmp->is); 3512 if (lmp->is == NULL) 3513 goto done; 3514 } 3515 else 3516 lmp->is[0] = '\0'; 3517 } 3518 break; 3519 default: 3520 if (t < END) 3521 { 3522 /* "cannot happen" */ 3523 goto done; 3524 } 3525 else if (t == '\0') 3526 { 3527 /* not on *my* shift */ 3528 goto done; 3529 } 3530 else if (t >= CSET 3531#ifdef MBS_SUPPORT 3532 || t == ANYCHAR 3533 || t == MBCSET 3534#endif /* MBS_SUPPORT */ 3535 ) 3536 { 3537 /* easy enough */ 3538 resetmust(mp); 3539 } 3540 else 3541 { 3542 /* plain character */ 3543 resetmust(mp); 3544 mp->is[0] = mp->left[0] = mp->right[0] = t; 3545 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 3546 mp->in = enlist(mp->in, mp->is, (size_t)1); 3547 if (mp->in == NULL) 3548 goto done; 3549 } 3550 break; 3551 } 3552#ifdef DEBUG 3553 fprintf(stderr, " node: %d:", ri); 3554 prtok(dfa->tokens[ri]); 3555 fprintf(stderr, "\n in:"); 3556 for (i = 0; mp->in[i]; ++i) 3557 fprintf(stderr, " \"%s\"", mp->in[i]); 3558 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 3559 fprintf(stderr, " left: \"%s\"\n", mp->left); 3560 fprintf(stderr, " right: \"%s\"\n", mp->right); 3561#endif 3562 ++mp; 3563 } 3564 done: 3565 if (strlen(result)) 3566 { 3567 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 3568 dm->exact = exact; 3569 dm->must = malloc(strlen(result) + 1); 3570 strcpy(dm->must, result); 3571 dm->next = dfa->musts; 3572 dfa->musts = dm; 3573 } 3574 mp = musts; 3575 for (i = 0; i <= dfa->tindex; ++i) 3576 { 3577 freelist(mp[i].in); 3578 ifree((char *) mp[i].in); 3579 ifree(mp[i].left); 3580 ifree(mp[i].right); 3581 ifree(mp[i].is); 3582 } 3583 free((char *) mp); 3584} 3585/* vim:set shiftwidth=2: */ 3586