1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright 1988, 1998, 2000, 2005-2006 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written June, 1988 by Mike Haertel 18 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 19 20#ifdef HAVE_CONFIG_H 21#include <config.h> 22#endif 23 24#include <assert.h> 25#include <ctype.h> 26#include <stdio.h> 27 28#include <sys/types.h> 29#include <stdlib.h> 30#include <string.h> 31#include <locale.h> 32 33#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 34/* We can handle multibyte string. */ 35# define MBS_SUPPORT 36#endif 37 38#ifdef MBS_SUPPORT 39# include <wchar.h> 40# include <wctype.h> 41#endif 42 43#ifndef DEBUG /* use the same approach as regex.c */ 44#undef assert 45#define assert(e) 46#endif /* DEBUG */ 47 48#ifndef isgraph 49#define isgraph(C) (isprint(C) && !isspace(C)) 50#endif 51 52#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 53#define ISALPHA(C) isalpha(C) 54#define ISUPPER(C) isupper(C) 55#define ISLOWER(C) islower(C) 56#define ISDIGIT(C) isdigit(C) 57#define ISXDIGIT(C) isxdigit(C) 58#define ISSPACE(C) isspace(C) 59#define ISPUNCT(C) ispunct(C) 60#define ISALNUM(C) isalnum(C) 61#define ISPRINT(C) isprint(C) 62#define ISGRAPH(C) isgraph(C) 63#define ISCNTRL(C) iscntrl(C) 64#else 65#define ISALPHA(C) (isascii(C) && isalpha(C)) 66#define ISUPPER(C) (isascii(C) && isupper(C)) 67#define ISLOWER(C) (isascii(C) && islower(C)) 68#define ISDIGIT(C) (isascii(C) && isdigit(C)) 69#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 70#define ISSPACE(C) (isascii(C) && isspace(C)) 71#define ISPUNCT(C) (isascii(C) && ispunct(C)) 72#define ISALNUM(C) (isascii(C) && isalnum(C)) 73#define ISPRINT(C) (isascii(C) && isprint(C)) 74#define ISGRAPH(C) (isascii(C) && isgraph(C)) 75#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 76#endif 77 78/* ISASCIIDIGIT differs from ISDIGIT, as follows: 79 - Its arg may be any int or unsigned int; it need not be an unsigned char. 80 - It's guaranteed to evaluate its argument exactly once. 81 - It's typically faster. 82 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 83 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 84 it's important to use the locale's definition of `digit' even when the 85 host does not conform to Posix. */ 86#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 87 88#include "regex.h" 89#include "dfa.h" 90#include "hard-locale.h" 91#include "gettext.h" 92#define _(str) gettext (str) 93 94/* HPUX, define those as macros in sys/param.h */ 95#ifdef setbit 96# undef setbit 97#endif 98#ifdef clrbit 99# undef clrbit 100#endif 101 102static void dfamust (struct dfa *dfa); 103static void regexp (int toplevel); 104 105static ptr_t 106xcalloc (size_t n, size_t s) 107{ 108 ptr_t r = calloc(n, s); 109 110 if (!r) 111 dfaerror(_("Memory exhausted")); 112 return r; 113} 114 115static ptr_t 116xmalloc (size_t n) 117{ 118 ptr_t r = malloc(n); 119 120 assert(n != 0); 121 if (!r) 122 dfaerror(_("Memory exhausted")); 123 return r; 124} 125 126static ptr_t 127xrealloc (ptr_t p, size_t n) 128{ 129 ptr_t r = realloc(p, n); 130 131 assert(n != 0); 132 if (!r) 133 dfaerror(_("Memory exhausted")); 134 return r; 135} 136 137#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 138#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 139#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 140 141/* Reallocate an array of type t if nalloc is too small for index. */ 142#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 143 if ((index) >= (nalloc)) \ 144 { \ 145 do \ 146 (nalloc) *= 2; \ 147 while ((index) >= (nalloc)); \ 148 REALLOC(p, t, nalloc); \ 149 } 150 151#ifdef DEBUG 152 153static void 154prtok (token t) 155{ 156 char const *s; 157 158 if (t < 0) 159 fprintf(stderr, "END"); 160 else if (t < NOTCHAR) 161 fprintf(stderr, "%c", t); 162 else 163 { 164 switch (t) 165 { 166 case EMPTY: s = "EMPTY"; break; 167 case BACKREF: s = "BACKREF"; break; 168 case BEGLINE: s = "BEGLINE"; break; 169 case ENDLINE: s = "ENDLINE"; break; 170 case BEGWORD: s = "BEGWORD"; break; 171 case ENDWORD: s = "ENDWORD"; break; 172 case LIMWORD: s = "LIMWORD"; break; 173 case NOTLIMWORD: s = "NOTLIMWORD"; break; 174 case QMARK: s = "QMARK"; break; 175 case STAR: s = "STAR"; break; 176 case PLUS: s = "PLUS"; break; 177 case CAT: s = "CAT"; break; 178 case OR: s = "OR"; break; 179 case ORTOP: s = "ORTOP"; break; 180 case LPAREN: s = "LPAREN"; break; 181 case RPAREN: s = "RPAREN"; break; 182 case CRANGE: s = "CRANGE"; break; 183#ifdef MBS_SUPPORT 184 case ANYCHAR: s = "ANYCHAR"; break; 185 case MBCSET: s = "MBCSET"; break; 186#endif /* MBS_SUPPORT */ 187 default: s = "CSET"; break; 188 } 189 fprintf(stderr, "%s", s); 190 } 191} 192#endif /* DEBUG */ 193 194/* Stuff pertaining to charclasses. */ 195 196static int 197tstbit (unsigned b, charclass c) 198{ 199 return c[b / INTBITS] & 1 << b % INTBITS; 200} 201 202static void 203setbit (unsigned b, charclass c) 204{ 205 c[b / INTBITS] |= 1 << b % INTBITS; 206} 207 208static void 209clrbit (unsigned b, charclass c) 210{ 211 c[b / INTBITS] &= ~(1 << b % INTBITS); 212} 213 214static void 215copyset (charclass src, charclass dst) 216{ 217 memcpy (dst, src, sizeof (charclass)); 218} 219 220static void 221zeroset (charclass s) 222{ 223 memset (s, 0, sizeof (charclass)); 224} 225 226static void 227notset (charclass s) 228{ 229 int i; 230 231 for (i = 0; i < CHARCLASS_INTS; ++i) 232 s[i] = ~s[i]; 233} 234 235static int 236equal (charclass s1, charclass s2) 237{ 238 return memcmp (s1, s2, sizeof (charclass)) == 0; 239} 240 241/* A pointer to the current dfa is kept here during parsing. */ 242static struct dfa *dfa; 243 244/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 245static int 246charclass_index (charclass s) 247{ 248 int i; 249 250 for (i = 0; i < dfa->cindex; ++i) 251 if (equal(s, dfa->charclasses[i])) 252 return i; 253 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 254 ++dfa->cindex; 255 copyset(s, dfa->charclasses[i]); 256 return i; 257} 258 259/* Syntax bits controlling the behavior of the lexical analyzer. */ 260static reg_syntax_t syntax_bits, syntax_bits_set; 261 262/* Flag for case-folding letters into sets. */ 263static int case_fold; 264 265/* End-of-line byte in data. */ 266static unsigned char eolbyte; 267 268/* Entry point to set syntax options. */ 269void 270dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 271{ 272 syntax_bits_set = 1; 273 syntax_bits = bits; 274 case_fold = fold; 275 eolbyte = eol; 276} 277 278/* Like setbit, but if case is folded, set both cases of a letter. */ 279static void 280setbit_case_fold (unsigned b, charclass c) 281{ 282 setbit (b, c); 283 if (case_fold) 284 { 285 if (ISUPPER (b)) 286 setbit (tolower (b), c); 287 else if (ISLOWER (b)) 288 setbit (toupper (b), c); 289 } 290} 291 292/* Lexical analyzer. All the dross that deals with the obnoxious 293 GNU Regex syntax bits is located here. The poor, suffering 294 reader is referred to the GNU Regex documentation for the 295 meaning of the @#%!@#%^!@ syntax bits. */ 296 297static char const *lexstart; /* Pointer to beginning of input string. */ 298static char const *lexptr; /* Pointer to next input character. */ 299static int lexleft; /* Number of characters remaining. */ 300static token lasttok; /* Previous token returned; initially END. */ 301static int laststart; /* True if we're separated from beginning or (, | 302 only by zero-width characters. */ 303static int parens; /* Count of outstanding left parens. */ 304static int minrep, maxrep; /* Repeat counts for {m,n}. */ 305static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 306 307#ifdef MBS_SUPPORT 308/* These variables are used only if (MB_CUR_MAX > 1). */ 309static mbstate_t mbs; /* Mbstate for mbrlen(). */ 310static int cur_mb_len; /* Byte length of the current scanning 311 multibyte character. */ 312static int cur_mb_index; /* Byte index of the current scanning multibyte 313 character. 314 315 singlebyte character : cur_mb_index = 0 316 multibyte character 317 1st byte : cur_mb_index = 1 318 2nd byte : cur_mb_index = 2 319 ... 320 nth byte : cur_mb_index = n */ 321static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 322 Each element store the amount of remain 323 byte of corresponding multibyte character 324 in the input string. A element's value 325 is 0 if corresponding character is a 326 singlebyte chracter. 327 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 328 mblen_buf : 0, 3, 2, 1 329 */ 330static wchar_t *inputwcs; /* Wide character representation of input 331 string in dfaexec(). 332 The length of this array is same as 333 the length of input string(char array). 334 inputstring[i] is a single-byte char, 335 or 1st byte of a multibyte char. 336 And inputwcs[i] is the codepoint. */ 337static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 338static unsigned char const *buf_end; /* refference to end in dfaexec(). */ 339#endif /* MBS_SUPPORT */ 340 341#ifdef MBS_SUPPORT 342/* This function update cur_mb_len, and cur_mb_index. 343 p points current lexptr, len is the remaining buffer length. */ 344static void 345update_mb_len_index (char const *p, int len) 346{ 347 /* If last character is a part of a multibyte character, 348 we update cur_mb_index. */ 349 if (cur_mb_index) 350 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 351 : cur_mb_index + 1; 352 353 /* If last character is a single byte character, or the 354 last portion of a multibyte character, we check whether 355 next character is a multibyte character or not. */ 356 if (! cur_mb_index) 357 { 358 cur_mb_len = mbrlen(p, len, &mbs); 359 if (cur_mb_len > 1) 360 /* It is a multibyte character. 361 cur_mb_len was already set by mbrlen(). */ 362 cur_mb_index = 1; 363 else if (cur_mb_len < 1) 364 /* Invalid sequence. We treat it as a singlebyte character. 365 cur_mb_index is aleady 0. */ 366 cur_mb_len = 1; 367 /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 368 cur_mb_index is aleady 0. */ 369 } 370} 371#endif /* MBS_SUPPORT */ 372 373#ifdef MBS_SUPPORT 374/* Note that characters become unsigned here. */ 375# define FETCH(c, eoferr) \ 376 { \ 377 if (! lexleft) \ 378 { \ 379 if (eoferr != 0) \ 380 dfaerror (eoferr); \ 381 else \ 382 return lasttok = END; \ 383 } \ 384 if (MB_CUR_MAX > 1) \ 385 update_mb_len_index(lexptr, lexleft); \ 386 (c) = (unsigned char) *lexptr++; \ 387 --lexleft; \ 388 } 389 390/* This function fetch a wide character, and update cur_mb_len, 391 used only if the current locale is a multibyte environment. */ 392static wchar_t 393fetch_wc (char const *eoferr) 394{ 395 wchar_t wc; 396 if (! lexleft) 397 { 398 if (eoferr != 0) 399 dfaerror (eoferr); 400 else 401 return -1; 402 } 403 404 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 405 if (cur_mb_len <= 0) 406 { 407 cur_mb_len = 1; 408 wc = *lexptr; 409 } 410 lexptr += cur_mb_len; 411 lexleft -= cur_mb_len; 412 return wc; 413} 414#else 415/* Note that characters become unsigned here. */ 416# define FETCH(c, eoferr) \ 417 { \ 418 if (! lexleft) \ 419 { \ 420 if (eoferr != 0) \ 421 dfaerror (eoferr); \ 422 else \ 423 return lasttok = END; \ 424 } \ 425 (c) = (unsigned char) *lexptr++; \ 426 --lexleft; \ 427 } 428#endif /* MBS_SUPPORT */ 429 430#ifdef MBS_SUPPORT 431/* Multibyte character handling sub-routin for lex. 432 This function parse a bracket expression and build a struct 433 mb_char_classes. */ 434static void 435parse_bracket_exp_mb () 436{ 437 wchar_t wc, wc1, wc2; 438 439 /* Work area to build a mb_char_classes. */ 440 struct mb_char_classes *work_mbc; 441 int chars_al, range_sts_al, range_ends_al, ch_classes_al, 442 equivs_al, coll_elems_al; 443 444 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 445 dfa->mbcsets_alloc, dfa->nmbcsets + 1); 446 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 447 We will update dfa->multibyte_prop in addtok(), because we can't 448 decide the index in dfa->tokens[]. */ 449 450 /* Initialize work are */ 451 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 452 453 chars_al = 1; 454 range_sts_al = range_ends_al = 0; 455 ch_classes_al = equivs_al = coll_elems_al = 0; 456 MALLOC(work_mbc->chars, wchar_t, chars_al); 457 458 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 459 work_mbc->nequivs = work_mbc->ncoll_elems = 0; 460 work_mbc->chars = NULL; 461 work_mbc->ch_classes = NULL; 462 work_mbc->range_sts = work_mbc->range_ends = NULL; 463 work_mbc->equivs = work_mbc->coll_elems = NULL; 464 465 wc = fetch_wc(_("Unbalanced [")); 466 if (wc == L'^') 467 { 468 wc = fetch_wc(_("Unbalanced [")); 469 work_mbc->invert = 1; 470 } 471 else 472 work_mbc->invert = 0; 473 do 474 { 475 wc1 = -1; /* mark wc1 is not initialized". */ 476 477 /* Note that if we're looking at some other [:...:] construct, 478 we just treat it as a bunch of ordinary characters. We can do 479 this because we assume regex has checked for syntax errors before 480 dfa is ever called. */ 481 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 482 { 483#define BRACKET_BUFFER_SIZE 128 484 char str[BRACKET_BUFFER_SIZE]; 485 wc1 = wc; 486 wc = fetch_wc(_("Unbalanced [")); 487 488 /* If pattern contains `[[:', `[[.', or `[[='. */ 489 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 490 { 491 unsigned char c; 492 unsigned char delim = (unsigned char)wc; 493 int len = 0; 494 for (;;) 495 { 496 if (! lexleft) 497 dfaerror (_("Unbalanced [")); 498 c = (unsigned char) *lexptr++; 499 --lexleft; 500 501 if ((c == delim && *lexptr == ']') || lexleft == 0) 502 break; 503 if (len < BRACKET_BUFFER_SIZE) 504 str[len++] = c; 505 else 506 /* This is in any case an invalid class name. */ 507 str[0] = '\0'; 508 } 509 str[len] = '\0'; 510 511 if (lexleft == 0) 512 { 513 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 514 work_mbc->nchars + 2); 515 work_mbc->chars[work_mbc->nchars++] = L'['; 516 work_mbc->chars[work_mbc->nchars++] = delim; 517 break; 518 } 519 520 if (--lexleft, *lexptr++ != ']') 521 dfaerror (_("Unbalanced [")); 522 if (delim == ':') 523 /* build character class. */ 524 { 525 wctype_t wt; 526 /* Query the character class as wctype_t. */ 527 wt = wctype (str); 528 529 if (ch_classes_al == 0) 530 MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al); 531 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 532 ch_classes_al, 533 work_mbc->nch_classes + 1); 534 work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 535 536 } 537 else if (delim == '=' || delim == '.') 538 { 539 char *elem; 540 MALLOC(elem, char, len + 1); 541 strncpy(elem, str, len + 1); 542 543 if (delim == '=') 544 /* build equivalent class. */ 545 { 546 if (equivs_al == 0) 547 MALLOC(work_mbc->equivs, char*, ++equivs_al); 548 REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 549 equivs_al, 550 work_mbc->nequivs + 1); 551 work_mbc->equivs[work_mbc->nequivs++] = elem; 552 } 553 554 if (delim == '.') 555 /* build collating element. */ 556 { 557 if (coll_elems_al == 0) 558 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 559 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 560 coll_elems_al, 561 work_mbc->ncoll_elems + 1); 562 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 563 } 564 } 565 wc = -1; 566 } 567 else 568 /* We treat '[' as a normal character here. */ 569 { 570 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 571 } 572 } 573 else 574 { 575 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 576 wc = fetch_wc(("Unbalanced [")); 577 } 578 579 if (wc1 == -1) 580 wc1 = fetch_wc(_("Unbalanced [")); 581 582 if (wc1 == L'-') 583 /* build range characters. */ 584 { 585 wc2 = fetch_wc(_("Unbalanced [")); 586 if (wc2 == L']') 587 { 588 /* In the case [x-], the - is an ordinary hyphen, 589 which is left in c1, the lookahead character. */ 590 lexptr -= cur_mb_len; 591 lexleft += cur_mb_len; 592 wc2 = wc; 593 } 594 else 595 { 596 if (wc2 == L'\\' 597 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 598 wc2 = fetch_wc(_("Unbalanced [")); 599 wc1 = fetch_wc(_("Unbalanced [")); 600 } 601 602 if (range_sts_al == 0) 603 { 604 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 605 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 606 } 607 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 608 range_sts_al, work_mbc->nranges + 1); 609 work_mbc->range_sts[work_mbc->nranges] = wc; 610 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 611 range_ends_al, work_mbc->nranges + 1); 612 work_mbc->range_ends[work_mbc->nranges++] = wc2; 613 } 614 else if (wc != -1) 615 /* build normal characters. */ 616 { 617 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 618 work_mbc->nchars + 1); 619 work_mbc->chars[work_mbc->nchars++] = wc; 620 } 621 } 622 while ((wc = wc1) != L']'); 623} 624#endif /* MBS_SUPPORT */ 625 626#ifdef __STDC__ 627#define FUNC(F, P) static int F(int c) { return P(c); } 628#else 629#define FUNC(F, P) static int F(c) int c; { return P(c); } 630#endif 631 632FUNC(is_alpha, ISALPHA) 633FUNC(is_upper, ISUPPER) 634FUNC(is_lower, ISLOWER) 635FUNC(is_digit, ISDIGIT) 636FUNC(is_xdigit, ISXDIGIT) 637FUNC(is_space, ISSPACE) 638FUNC(is_punct, ISPUNCT) 639FUNC(is_alnum, ISALNUM) 640FUNC(is_print, ISPRINT) 641FUNC(is_graph, ISGRAPH) 642FUNC(is_cntrl, ISCNTRL) 643 644static int 645is_blank (int c) 646{ 647 return (c == ' ' || c == '\t'); 648} 649 650/* The following list maps the names of the Posix named character classes 651 to predicate functions that determine whether a given character is in 652 the class. The leading [ has already been eaten by the lexical analyzer. */ 653static struct { 654 const char *name; 655 int (*pred) (int); 656} const prednames[] = { 657 { ":alpha:]", is_alpha }, 658 { ":upper:]", is_upper }, 659 { ":lower:]", is_lower }, 660 { ":digit:]", is_digit }, 661 { ":xdigit:]", is_xdigit }, 662 { ":space:]", is_space }, 663 { ":punct:]", is_punct }, 664 { ":alnum:]", is_alnum }, 665 { ":print:]", is_print }, 666 { ":graph:]", is_graph }, 667 { ":cntrl:]", is_cntrl }, 668 { ":blank:]", is_blank }, 669 { 0 } 670}; 671 672/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 673#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 674 675static int 676looking_at (char const *s) 677{ 678 size_t len; 679 680 len = strlen(s); 681 if (lexleft < len) 682 return 0; 683 return strncmp(s, lexptr, len) == 0; 684} 685 686static token 687lex (void) 688{ 689 unsigned c, c1, c2; 690 int backslash = 0, invert; 691 charclass ccl; 692 int i; 693 694 /* Basic plan: We fetch a character. If it's a backslash, 695 we set the backslash flag and go through the loop again. 696 On the plus side, this avoids having a duplicate of the 697 main switch inside the backslash case. On the minus side, 698 it means that just about every case begins with 699 "if (backslash) ...". */ 700 for (i = 0; i < 2; ++i) 701 { 702 FETCH(c, 0); 703#ifdef MBS_SUPPORT 704 if (MB_CUR_MAX > 1 && cur_mb_index) 705 /* If this is a part of a multi-byte character, we must treat 706 this byte data as a normal character. 707 e.g. In case of SJIS encoding, some character contains '\', 708 but they must not be backslash. */ 709 goto normal_char; 710#endif /* MBS_SUPPORT */ 711 switch (c) 712 { 713 case '\\': 714 if (backslash) 715 goto normal_char; 716 if (lexleft == 0) 717 dfaerror(_("Unfinished \\ escape")); 718 backslash = 1; 719 break; 720 721 case '^': 722 if (backslash) 723 goto normal_char; 724 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 725 || lasttok == END 726 || lasttok == LPAREN 727 || lasttok == OR) 728 return lasttok = BEGLINE; 729 goto normal_char; 730 731 case '$': 732 if (backslash) 733 goto normal_char; 734 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 735 || lexleft == 0 736 || (syntax_bits & RE_NO_BK_PARENS 737 ? lexleft > 0 && *lexptr == ')' 738 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 739 || (syntax_bits & RE_NO_BK_VBAR 740 ? lexleft > 0 && *lexptr == '|' 741 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 742 || ((syntax_bits & RE_NEWLINE_ALT) 743 && lexleft > 0 && *lexptr == '\n')) 744 return lasttok = ENDLINE; 745 goto normal_char; 746 747 case '1': 748 case '2': 749 case '3': 750 case '4': 751 case '5': 752 case '6': 753 case '7': 754 case '8': 755 case '9': 756 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 757 { 758 laststart = 0; 759 return lasttok = BACKREF; 760 } 761 goto normal_char; 762 763 case '`': 764 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 765 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 766 goto normal_char; 767 768 case '\'': 769 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 770 return lasttok = ENDLINE; /* FIXME: should be end of string */ 771 goto normal_char; 772 773 case '<': 774 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 775 return lasttok = BEGWORD; 776 goto normal_char; 777 778 case '>': 779 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 780 return lasttok = ENDWORD; 781 goto normal_char; 782 783 case 'b': 784 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 785 return lasttok = LIMWORD; 786 goto normal_char; 787 788 case 'B': 789 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 790 return lasttok = NOTLIMWORD; 791 goto normal_char; 792 793 case '?': 794 if (syntax_bits & RE_LIMITED_OPS) 795 goto normal_char; 796 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 797 goto normal_char; 798 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 799 goto normal_char; 800 return lasttok = QMARK; 801 802 case '*': 803 if (backslash) 804 goto normal_char; 805 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 806 goto normal_char; 807 return lasttok = STAR; 808 809 case '+': 810 if (syntax_bits & RE_LIMITED_OPS) 811 goto normal_char; 812 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 813 goto normal_char; 814 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 815 goto normal_char; 816 return lasttok = PLUS; 817 818 case '{': 819 if (!(syntax_bits & RE_INTERVALS)) 820 goto normal_char; 821 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 822 goto normal_char; 823 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 824 goto normal_char; 825 826 if (syntax_bits & RE_NO_BK_BRACES) 827 { 828 /* Scan ahead for a valid interval; if it's not valid, 829 treat it as a literal '{'. */ 830 int lo = -1, hi = -1; 831 char const *p = lexptr; 832 char const *lim = p + lexleft; 833 for (; p != lim && ISASCIIDIGIT (*p); p++) 834 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 835 if (p != lim && *p == ',') 836 while (++p != lim && ISASCIIDIGIT (*p)) 837 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 838 else 839 hi = lo; 840 if (p == lim || *p != '}' 841 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 842 goto normal_char; 843 } 844 845 minrep = 0; 846 /* Cases: 847 {M} - exact count 848 {M,} - minimum count, maximum is infinity 849 {M,N} - M through N */ 850 FETCH(c, _("unfinished repeat count")); 851 if (ISASCIIDIGIT (c)) 852 { 853 minrep = c - '0'; 854 for (;;) 855 { 856 FETCH(c, _("unfinished repeat count")); 857 if (! ISASCIIDIGIT (c)) 858 break; 859 minrep = 10 * minrep + c - '0'; 860 } 861 } 862 else 863 dfaerror(_("malformed repeat count")); 864 if (c == ',') 865 { 866 FETCH (c, _("unfinished repeat count")); 867 if (! ISASCIIDIGIT (c)) 868 maxrep = -1; 869 else 870 { 871 maxrep = c - '0'; 872 for (;;) 873 { 874 FETCH (c, _("unfinished repeat count")); 875 if (! ISASCIIDIGIT (c)) 876 break; 877 maxrep = 10 * maxrep + c - '0'; 878 } 879 if (0 <= maxrep && maxrep < minrep) 880 dfaerror (_("malformed repeat count")); 881 } 882 } 883 else 884 maxrep = minrep; 885 if (!(syntax_bits & RE_NO_BK_BRACES)) 886 { 887 if (c != '\\') 888 dfaerror(_("malformed repeat count")); 889 FETCH(c, _("unfinished repeat count")); 890 } 891 if (c != '}') 892 dfaerror(_("malformed repeat count")); 893 laststart = 0; 894 return lasttok = REPMN; 895 896 case '|': 897 if (syntax_bits & RE_LIMITED_OPS) 898 goto normal_char; 899 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 900 goto normal_char; 901 laststart = 1; 902 return lasttok = OR; 903 904 case '\n': 905 if (syntax_bits & RE_LIMITED_OPS 906 || backslash 907 || !(syntax_bits & RE_NEWLINE_ALT)) 908 goto normal_char; 909 laststart = 1; 910 return lasttok = OR; 911 912 case '(': 913 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 914 goto normal_char; 915 ++parens; 916 laststart = 1; 917 return lasttok = LPAREN; 918 919 case ')': 920 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 921 goto normal_char; 922 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 923 goto normal_char; 924 --parens; 925 laststart = 0; 926 return lasttok = RPAREN; 927 928 case '.': 929 if (backslash) 930 goto normal_char; 931#ifdef MBS_SUPPORT 932 if (MB_CUR_MAX > 1) 933 { 934 /* In multibyte environment period must match with a single 935 character not a byte. So we use ANYCHAR. */ 936 laststart = 0; 937 return lasttok = ANYCHAR; 938 } 939#endif /* MBS_SUPPORT */ 940 zeroset(ccl); 941 notset(ccl); 942 if (!(syntax_bits & RE_DOT_NEWLINE)) 943 clrbit(eolbyte, ccl); 944 if (syntax_bits & RE_DOT_NOT_NULL) 945 clrbit('\0', ccl); 946 laststart = 0; 947 return lasttok = CSET + charclass_index(ccl); 948 949 case 'w': 950 case 'W': 951 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 952 goto normal_char; 953 zeroset(ccl); 954 for (c2 = 0; c2 < NOTCHAR; ++c2) 955 if (IS_WORD_CONSTITUENT(c2)) 956 setbit(c2, ccl); 957 if (c == 'W') 958 notset(ccl); 959 laststart = 0; 960 return lasttok = CSET + charclass_index(ccl); 961 962 case '[': 963 if (backslash) 964 goto normal_char; 965 laststart = 0; 966#ifdef MBS_SUPPORT 967 if (MB_CUR_MAX > 1) 968 { 969 /* In multibyte environment a bracket expression may contain 970 multibyte characters, which must be treated as characters 971 (not bytes). So we parse it by parse_bracket_exp_mb(). */ 972 parse_bracket_exp_mb(); 973 return lasttok = MBCSET; 974 } 975#endif 976 zeroset(ccl); 977 FETCH(c, _("Unbalanced [")); 978 if (c == '^') 979 { 980 FETCH(c, _("Unbalanced [")); 981 invert = 1; 982 } 983 else 984 invert = 0; 985 do 986 { 987 /* Nobody ever said this had to be fast. :-) 988 Note that if we're looking at some other [:...:] 989 construct, we just treat it as a bunch of ordinary 990 characters. We can do this because we assume 991 regex has checked for syntax errors before 992 dfa is ever called. */ 993 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 994 for (c1 = 0; prednames[c1].name; ++c1) 995 if (looking_at(prednames[c1].name)) 996 { 997 int (*pred) (int) = prednames[c1].pred; 998 999 for (c2 = 0; c2 < NOTCHAR; ++c2) 1000 if ((*pred)(c2)) 1001 setbit_case_fold (c2, ccl); 1002 lexptr += strlen(prednames[c1].name); 1003 lexleft -= strlen(prednames[c1].name); 1004 FETCH(c1, _("Unbalanced [")); 1005 goto skip; 1006 } 1007 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1008 FETCH(c, _("Unbalanced [")); 1009 FETCH(c1, _("Unbalanced [")); 1010 if (c1 == '-') 1011 { 1012 FETCH(c2, _("Unbalanced [")); 1013 if (c2 == ']') 1014 { 1015 /* In the case [x-], the - is an ordinary hyphen, 1016 which is left in c1, the lookahead character. */ 1017 --lexptr; 1018 ++lexleft; 1019 } 1020 else 1021 { 1022 if (c2 == '\\' 1023 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1024 FETCH(c2, _("Unbalanced [")); 1025 FETCH(c1, _("Unbalanced [")); 1026 if (!hard_LC_COLLATE) { 1027 for (; c <= c2; c++) 1028 setbit_case_fold (c, ccl); 1029 } else { 1030 /* POSIX locales are painful - leave the decision to libc */ 1031 regex_t re; 1032 char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */ 1033 1034 expr[0] = '['; expr[1] = c; expr[2] = '-'; 1035 expr[3] = c2; expr[4] = ']'; expr[5] = '\0'; 1036 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1037 for (c = 0; c < NOTCHAR; ++c) { 1038 regmatch_t mat; 1039 char buf[2]; /* = { c, '\0' }; */ 1040 1041 buf[0] = c; buf[1] = '\0'; 1042 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1043 && mat.rm_so == 0 && mat.rm_eo == 1) 1044 setbit_case_fold (c, ccl); 1045 } 1046 regfree (&re); 1047 } 1048 } 1049 continue; 1050 } 1051 } 1052 1053 setbit_case_fold (c, ccl); 1054 1055 skip: 1056 ; 1057 } 1058 while ((c = c1) != ']'); 1059 if (invert) 1060 { 1061 notset(ccl); 1062 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1063 clrbit(eolbyte, ccl); 1064 } 1065 return lasttok = CSET + charclass_index(ccl); 1066 1067 default: 1068 normal_char: 1069 laststart = 0; 1070 if (case_fold && ISALPHA(c)) 1071 { 1072 zeroset(ccl); 1073 setbit_case_fold (c, ccl); 1074 return lasttok = CSET + charclass_index(ccl); 1075 } 1076 return c; 1077 } 1078 } 1079 1080 /* The above loop should consume at most a backslash 1081 and some other character. */ 1082 abort(); 1083 return END; /* keeps pedantic compilers happy. */ 1084} 1085 1086/* Recursive descent parser for regular expressions. */ 1087 1088static token tok; /* Lookahead token. */ 1089static int depth; /* Current depth of a hypothetical stack 1090 holding deferred productions. This is 1091 used to determine the depth that will be 1092 required of the real stack later on in 1093 dfaanalyze(). */ 1094 1095/* Add the given token to the parse tree, maintaining the depth count and 1096 updating the maximum depth if necessary. */ 1097static void 1098addtok (token t) 1099{ 1100#ifdef MBS_SUPPORT 1101 if (MB_CUR_MAX > 1) 1102 { 1103 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1104 dfa->tindex); 1105 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1106 if (t == MBCSET) 1107 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1108 else if (t < NOTCHAR) 1109 dfa->multibyte_prop[dfa->tindex] 1110 = (cur_mb_len == 1)? 3 /* single-byte char */ 1111 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1112 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1113 else 1114 /* It may be unnecesssary, but it is safer to treat other 1115 symbols as singlebyte characters. */ 1116 dfa->multibyte_prop[dfa->tindex] = 3; 1117 } 1118#endif 1119 1120 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 1121 dfa->tokens[dfa->tindex++] = t; 1122 1123 switch (t) 1124 { 1125 case QMARK: 1126 case STAR: 1127 case PLUS: 1128 break; 1129 1130 case CAT: 1131 case OR: 1132 case ORTOP: 1133 --depth; 1134 break; 1135 1136 default: 1137 ++dfa->nleaves; 1138 case EMPTY: 1139 ++depth; 1140 break; 1141 } 1142 if (depth > dfa->depth) 1143 dfa->depth = depth; 1144} 1145 1146/* The grammar understood by the parser is as follows. 1147 1148 regexp: 1149 regexp OR branch 1150 branch 1151 1152 branch: 1153 branch closure 1154 closure 1155 1156 closure: 1157 closure QMARK 1158 closure STAR 1159 closure PLUS 1160 closure REPMN 1161 atom 1162 1163 atom: 1164 <normal character> 1165 <multibyte character> 1166 ANYCHAR 1167 MBCSET 1168 CSET 1169 BACKREF 1170 BEGLINE 1171 ENDLINE 1172 BEGWORD 1173 ENDWORD 1174 LIMWORD 1175 NOTLIMWORD 1176 CRANGE 1177 LPAREN regexp RPAREN 1178 <empty> 1179 1180 The parser builds a parse tree in postfix form in an array of tokens. */ 1181 1182static void 1183atom (void) 1184{ 1185 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 1186 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1187#ifdef MBS_SUPPORT 1188 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1189#endif /* MBS_SUPPORT */ 1190 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 1191 { 1192 addtok(tok); 1193 tok = lex(); 1194#ifdef MBS_SUPPORT 1195 /* We treat a multibyte character as a single atom, so that DFA 1196 can treat a multibyte character as a single expression. 1197 1198 e.g. We construct following tree from "<mb1><mb2>". 1199 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1200 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1201 */ 1202 if (MB_CUR_MAX > 1) 1203 { 1204 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1205 { 1206 addtok(tok); 1207 addtok(CAT); 1208 tok = lex(); 1209 } 1210 } 1211#endif /* MBS_SUPPORT */ 1212 } 1213 else if (tok == CRANGE) 1214 { 1215 /* A character range like "[a-z]" in a locale other than "C" or 1216 "POSIX". This range might any sequence of one or more 1217 characters. Unfortunately the POSIX locale primitives give 1218 us no practical way to find what character sequences might be 1219 matched. Treat this approximately like "(.\1)" -- i.e. match 1220 one character, and then punt to the full matcher. */ 1221 charclass ccl; 1222 zeroset (ccl); 1223 notset (ccl); 1224 addtok (CSET + charclass_index (ccl)); 1225 addtok (BACKREF); 1226 addtok (CAT); 1227 tok = lex (); 1228 } 1229 else if (tok == LPAREN) 1230 { 1231 tok = lex(); 1232 regexp(0); 1233 if (tok != RPAREN) 1234 dfaerror(_("Unbalanced (")); 1235 tok = lex(); 1236 } 1237 else 1238 addtok(EMPTY); 1239} 1240 1241/* Return the number of tokens in the given subexpression. */ 1242static int 1243nsubtoks (int tindex) 1244{ 1245 int ntoks1; 1246 1247 switch (dfa->tokens[tindex - 1]) 1248 { 1249 default: 1250 return 1; 1251 case QMARK: 1252 case STAR: 1253 case PLUS: 1254 return 1 + nsubtoks(tindex - 1); 1255 case CAT: 1256 case OR: 1257 case ORTOP: 1258 ntoks1 = nsubtoks(tindex - 1); 1259 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 1260 } 1261} 1262 1263/* Copy the given subexpression to the top of the tree. */ 1264static void 1265copytoks (int tindex, int ntokens) 1266{ 1267 int i; 1268 1269 for (i = 0; i < ntokens; ++i) 1270 addtok(dfa->tokens[tindex + i]); 1271} 1272 1273static void 1274closure (void) 1275{ 1276 int tindex, ntokens, i; 1277 1278 atom(); 1279 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 1280 if (tok == REPMN) 1281 { 1282 ntokens = nsubtoks(dfa->tindex); 1283 tindex = dfa->tindex - ntokens; 1284 if (maxrep < 0) 1285 addtok(PLUS); 1286 if (minrep == 0) 1287 addtok(QMARK); 1288 for (i = 1; i < minrep; ++i) 1289 { 1290 copytoks(tindex, ntokens); 1291 addtok(CAT); 1292 } 1293 for (; i < maxrep; ++i) 1294 { 1295 copytoks(tindex, ntokens); 1296 addtok(QMARK); 1297 addtok(CAT); 1298 } 1299 tok = lex(); 1300 } 1301 else 1302 { 1303 addtok(tok); 1304 tok = lex(); 1305 } 1306} 1307 1308static void 1309branch (void) 1310{ 1311 closure(); 1312 while (tok != RPAREN && tok != OR && tok >= 0) 1313 { 1314 closure(); 1315 addtok(CAT); 1316 } 1317} 1318 1319static void 1320regexp (int toplevel) 1321{ 1322 branch(); 1323 while (tok == OR) 1324 { 1325 tok = lex(); 1326 branch(); 1327 if (toplevel) 1328 addtok(ORTOP); 1329 else 1330 addtok(OR); 1331 } 1332} 1333 1334/* Main entry point for the parser. S is a string to be parsed, len is the 1335 length of the string, so s can include NUL characters. D is a pointer to 1336 the struct dfa to parse into. */ 1337void 1338dfaparse (char const *s, size_t len, struct dfa *d) 1339{ 1340 dfa = d; 1341 lexstart = lexptr = s; 1342 lexleft = len; 1343 lasttok = END; 1344 laststart = 1; 1345 parens = 0; 1346#if ENABLE_NLS 1347 hard_LC_COLLATE = hard_locale (LC_COLLATE); 1348#endif 1349#ifdef MBS_SUPPORT 1350 if (MB_CUR_MAX > 1) 1351 { 1352 cur_mb_index = 0; 1353 cur_mb_len = 0; 1354 memset(&mbs, 0, sizeof(mbstate_t)); 1355 } 1356#endif /* MBS_SUPPORT */ 1357 1358 if (! syntax_bits_set) 1359 dfaerror(_("No syntax specified")); 1360 1361 tok = lex(); 1362 depth = d->depth; 1363 1364 regexp(1); 1365 1366 if (tok != END) 1367 dfaerror(_("Unbalanced )")); 1368 1369 addtok(END - d->nregexps); 1370 addtok(CAT); 1371 1372 if (d->nregexps) 1373 addtok(ORTOP); 1374 1375 ++d->nregexps; 1376} 1377 1378/* Some primitives for operating on sets of positions. */ 1379 1380/* Copy one set to another; the destination must be large enough. */ 1381static void 1382copy (position_set const *src, position_set *dst) 1383{ 1384 int i; 1385 1386 for (i = 0; i < src->nelem; ++i) 1387 dst->elems[i] = src->elems[i]; 1388 dst->nelem = src->nelem; 1389} 1390 1391/* Insert a position in a set. Position sets are maintained in sorted 1392 order according to index. If position already exists in the set with 1393 the same index then their constraints are logically or'd together. 1394 S->elems must point to an array large enough to hold the resulting set. */ 1395static void 1396insert (position p, position_set *s) 1397{ 1398 int i; 1399 position t1, t2; 1400 1401 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1402 continue; 1403 if (i < s->nelem && p.index == s->elems[i].index) 1404 s->elems[i].constraint |= p.constraint; 1405 else 1406 { 1407 t1 = p; 1408 ++s->nelem; 1409 while (i < s->nelem) 1410 { 1411 t2 = s->elems[i]; 1412 s->elems[i++] = t1; 1413 t1 = t2; 1414 } 1415 } 1416} 1417 1418/* Merge two sets of positions into a third. The result is exactly as if 1419 the positions of both sets were inserted into an initially empty set. */ 1420static void 1421merge (position_set const *s1, position_set const *s2, position_set *m) 1422{ 1423 int i = 0, j = 0; 1424 1425 m->nelem = 0; 1426 while (i < s1->nelem && j < s2->nelem) 1427 if (s1->elems[i].index > s2->elems[j].index) 1428 m->elems[m->nelem++] = s1->elems[i++]; 1429 else if (s1->elems[i].index < s2->elems[j].index) 1430 m->elems[m->nelem++] = s2->elems[j++]; 1431 else 1432 { 1433 m->elems[m->nelem] = s1->elems[i++]; 1434 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1435 } 1436 while (i < s1->nelem) 1437 m->elems[m->nelem++] = s1->elems[i++]; 1438 while (j < s2->nelem) 1439 m->elems[m->nelem++] = s2->elems[j++]; 1440} 1441 1442/* Delete a position from a set. */ 1443static void 1444delete_pos (position p, position_set *s) 1445{ 1446 int i; 1447 1448 for (i = 0; i < s->nelem; ++i) 1449 if (p.index == s->elems[i].index) 1450 break; 1451 if (i < s->nelem) 1452 for (--s->nelem; i < s->nelem; ++i) 1453 s->elems[i] = s->elems[i + 1]; 1454} 1455 1456/* Find the index of the state corresponding to the given position set with 1457 the given preceding context, or create a new state if there is no such 1458 state. Newline and letter tell whether we got here on a newline or 1459 letter, respectively. */ 1460static int 1461state_index (struct dfa *d, position_set const *s, int newline, int letter) 1462{ 1463 int hash = 0; 1464 int constraint; 1465 int i, j; 1466 1467 newline = newline ? 1 : 0; 1468 letter = letter ? 1 : 0; 1469 1470 for (i = 0; i < s->nelem; ++i) 1471 hash ^= s->elems[i].index + s->elems[i].constraint; 1472 1473 /* Try to find a state that exactly matches the proposed one. */ 1474 for (i = 0; i < d->sindex; ++i) 1475 { 1476 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1477 || newline != d->states[i].newline || letter != d->states[i].letter) 1478 continue; 1479 for (j = 0; j < s->nelem; ++j) 1480 if (s->elems[j].constraint 1481 != d->states[i].elems.elems[j].constraint 1482 || s->elems[j].index != d->states[i].elems.elems[j].index) 1483 break; 1484 if (j == s->nelem) 1485 return i; 1486 } 1487 1488 /* We'll have to create a new state. */ 1489 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1490 d->states[i].hash = hash; 1491 MALLOC(d->states[i].elems.elems, position, s->nelem); 1492 copy(s, &d->states[i].elems); 1493 d->states[i].newline = newline; 1494 d->states[i].letter = letter; 1495 d->states[i].backref = 0; 1496 d->states[i].constraint = 0; 1497 d->states[i].first_end = 0; 1498#ifdef MBS_SUPPORT 1499 if (MB_CUR_MAX > 1) 1500 d->states[i].mbps.nelem = 0; 1501#endif 1502 for (j = 0; j < s->nelem; ++j) 1503 if (d->tokens[s->elems[j].index] < 0) 1504 { 1505 constraint = s->elems[j].constraint; 1506 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1507 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1508 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1509 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1510 d->states[i].constraint |= constraint; 1511 if (! d->states[i].first_end) 1512 d->states[i].first_end = d->tokens[s->elems[j].index]; 1513 } 1514 else if (d->tokens[s->elems[j].index] == BACKREF) 1515 { 1516 d->states[i].constraint = NO_CONSTRAINT; 1517 d->states[i].backref = 1; 1518 } 1519 1520 ++d->sindex; 1521 1522 return i; 1523} 1524 1525/* Find the epsilon closure of a set of positions. If any position of the set 1526 contains a symbol that matches the empty string in some context, replace 1527 that position with the elements of its follow labeled with an appropriate 1528 constraint. Repeat exhaustively until no funny positions are left. 1529 S->elems must be large enough to hold the result. */ 1530static void 1531epsclosure (position_set *s, struct dfa const *d) 1532{ 1533 int i, j; 1534 int *visited; 1535 position p, old; 1536 1537 MALLOC(visited, int, d->tindex); 1538 for (i = 0; i < d->tindex; ++i) 1539 visited[i] = 0; 1540 1541 for (i = 0; i < s->nelem; ++i) 1542 if (d->tokens[s->elems[i].index] >= NOTCHAR 1543 && d->tokens[s->elems[i].index] != BACKREF 1544#ifdef MBS_SUPPORT 1545 && d->tokens[s->elems[i].index] != ANYCHAR 1546 && d->tokens[s->elems[i].index] != MBCSET 1547#endif 1548 && d->tokens[s->elems[i].index] < CSET) 1549 { 1550 old = s->elems[i]; 1551 p.constraint = old.constraint; 1552 delete_pos(s->elems[i], s); 1553 if (visited[old.index]) 1554 { 1555 --i; 1556 continue; 1557 } 1558 visited[old.index] = 1; 1559 switch (d->tokens[old.index]) 1560 { 1561 case BEGLINE: 1562 p.constraint &= BEGLINE_CONSTRAINT; 1563 break; 1564 case ENDLINE: 1565 p.constraint &= ENDLINE_CONSTRAINT; 1566 break; 1567 case BEGWORD: 1568 p.constraint &= BEGWORD_CONSTRAINT; 1569 break; 1570 case ENDWORD: 1571 p.constraint &= ENDWORD_CONSTRAINT; 1572 break; 1573 case LIMWORD: 1574 p.constraint &= LIMWORD_CONSTRAINT; 1575 break; 1576 case NOTLIMWORD: 1577 p.constraint &= NOTLIMWORD_CONSTRAINT; 1578 break; 1579 default: 1580 break; 1581 } 1582 for (j = 0; j < d->follows[old.index].nelem; ++j) 1583 { 1584 p.index = d->follows[old.index].elems[j].index; 1585 insert(p, s); 1586 } 1587 /* Force rescan to start at the beginning. */ 1588 i = -1; 1589 } 1590 1591 free(visited); 1592} 1593 1594/* Perform bottom-up analysis on the parse tree, computing various functions. 1595 Note that at this point, we're pretending constructs like \< are real 1596 characters rather than constraints on what can follow them. 1597 1598 Nullable: A node is nullable if it is at the root of a regexp that can 1599 match the empty string. 1600 * EMPTY leaves are nullable. 1601 * No other leaf is nullable. 1602 * A QMARK or STAR node is nullable. 1603 * A PLUS node is nullable if its argument is nullable. 1604 * A CAT node is nullable if both its arguments are nullable. 1605 * An OR node is nullable if either argument is nullable. 1606 1607 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1608 that could correspond to the first character of a string matching the 1609 regexp rooted at the given node. 1610 * EMPTY leaves have empty firstpos. 1611 * The firstpos of a nonempty leaf is that leaf itself. 1612 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1613 argument. 1614 * The firstpos of a CAT node is the firstpos of the left argument, union 1615 the firstpos of the right if the left argument is nullable. 1616 * The firstpos of an OR node is the union of firstpos of each argument. 1617 1618 Lastpos: The lastpos of a node is the set of positions that could 1619 correspond to the last character of a string matching the regexp at 1620 the given node. 1621 * EMPTY leaves have empty lastpos. 1622 * The lastpos of a nonempty leaf is that leaf itself. 1623 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1624 argument. 1625 * The lastpos of a CAT node is the lastpos of its right argument, union 1626 the lastpos of the left if the right argument is nullable. 1627 * The lastpos of an OR node is the union of the lastpos of each argument. 1628 1629 Follow: The follow of a position is the set of positions that could 1630 correspond to the character following a character matching the node in 1631 a string matching the regexp. At this point we consider special symbols 1632 that match the empty string in some context to be just normal characters. 1633 Later, if we find that a special symbol is in a follow set, we will 1634 replace it with the elements of its follow, labeled with an appropriate 1635 constraint. 1636 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1637 the follow of every node in the lastpos. 1638 * Every node in the firstpos of the second argument of a CAT node is in 1639 the follow of every node in the lastpos of the first argument. 1640 1641 Because of the postfix representation of the parse tree, the depth-first 1642 analysis is conveniently done by a linear scan with the aid of a stack. 1643 Sets are stored as arrays of the elements, obeying a stack-like allocation 1644 scheme; the number of elements in each set deeper in the stack can be 1645 used to determine the address of a particular set's array. */ 1646void 1647dfaanalyze (struct dfa *d, int searchflag) 1648{ 1649 int *nullable; /* Nullable stack. */ 1650 int *nfirstpos; /* Element count stack for firstpos sets. */ 1651 position *firstpos; /* Array where firstpos elements are stored. */ 1652 int *nlastpos; /* Element count stack for lastpos sets. */ 1653 position *lastpos; /* Array where lastpos elements are stored. */ 1654 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1655 position_set tmp; /* Temporary set for merging sets. */ 1656 position_set merged; /* Result of merging sets. */ 1657 int wants_newline; /* True if some position wants newline info. */ 1658 int *o_nullable; 1659 int *o_nfirst, *o_nlast; 1660 position *o_firstpos, *o_lastpos; 1661 int i, j; 1662 position *pos; 1663 1664#ifdef DEBUG 1665 fprintf(stderr, "dfaanalyze:\n"); 1666 for (i = 0; i < d->tindex; ++i) 1667 { 1668 fprintf(stderr, " %d:", i); 1669 prtok(d->tokens[i]); 1670 } 1671 putc('\n', stderr); 1672#endif 1673 1674 d->searchflag = searchflag; 1675 1676 MALLOC(nullable, int, d->depth); 1677 o_nullable = nullable; 1678 MALLOC(nfirstpos, int, d->depth); 1679 o_nfirst = nfirstpos; 1680 MALLOC(firstpos, position, d->nleaves); 1681 o_firstpos = firstpos, firstpos += d->nleaves; 1682 MALLOC(nlastpos, int, d->depth); 1683 o_nlast = nlastpos; 1684 MALLOC(lastpos, position, d->nleaves); 1685 o_lastpos = lastpos, lastpos += d->nleaves; 1686 MALLOC(nalloc, int, d->tindex); 1687 for (i = 0; i < d->tindex; ++i) 1688 nalloc[i] = 0; 1689 MALLOC(merged.elems, position, d->nleaves); 1690 1691 CALLOC(d->follows, position_set, d->tindex); 1692 1693 for (i = 0; i < d->tindex; ++i) 1694#ifdef DEBUG 1695 { /* Nonsyntactic #ifdef goo... */ 1696#endif 1697 switch (d->tokens[i]) 1698 { 1699 case EMPTY: 1700 /* The empty set is nullable. */ 1701 *nullable++ = 1; 1702 1703 /* The firstpos and lastpos of the empty leaf are both empty. */ 1704 *nfirstpos++ = *nlastpos++ = 0; 1705 break; 1706 1707 case STAR: 1708 case PLUS: 1709 /* Every element in the firstpos of the argument is in the follow 1710 of every element in the lastpos. */ 1711 tmp.nelem = nfirstpos[-1]; 1712 tmp.elems = firstpos; 1713 pos = lastpos; 1714 for (j = 0; j < nlastpos[-1]; ++j) 1715 { 1716 merge(&tmp, &d->follows[pos[j].index], &merged); 1717 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1718 nalloc[pos[j].index], merged.nelem - 1); 1719 copy(&merged, &d->follows[pos[j].index]); 1720 } 1721 1722 case QMARK: 1723 /* A QMARK or STAR node is automatically nullable. */ 1724 if (d->tokens[i] != PLUS) 1725 nullable[-1] = 1; 1726 break; 1727 1728 case CAT: 1729 /* Every element in the firstpos of the second argument is in the 1730 follow of every element in the lastpos of the first argument. */ 1731 tmp.nelem = nfirstpos[-1]; 1732 tmp.elems = firstpos; 1733 pos = lastpos + nlastpos[-1]; 1734 for (j = 0; j < nlastpos[-2]; ++j) 1735 { 1736 merge(&tmp, &d->follows[pos[j].index], &merged); 1737 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1738 nalloc[pos[j].index], merged.nelem - 1); 1739 copy(&merged, &d->follows[pos[j].index]); 1740 } 1741 1742 /* The firstpos of a CAT node is the firstpos of the first argument, 1743 union that of the second argument if the first is nullable. */ 1744 if (nullable[-2]) 1745 nfirstpos[-2] += nfirstpos[-1]; 1746 else 1747 firstpos += nfirstpos[-1]; 1748 --nfirstpos; 1749 1750 /* The lastpos of a CAT node is the lastpos of the second argument, 1751 union that of the first argument if the second is nullable. */ 1752 if (nullable[-1]) 1753 nlastpos[-2] += nlastpos[-1]; 1754 else 1755 { 1756 pos = lastpos + nlastpos[-2]; 1757 for (j = nlastpos[-1] - 1; j >= 0; --j) 1758 pos[j] = lastpos[j]; 1759 lastpos += nlastpos[-2]; 1760 nlastpos[-2] = nlastpos[-1]; 1761 } 1762 --nlastpos; 1763 1764 /* A CAT node is nullable if both arguments are nullable. */ 1765 nullable[-2] = nullable[-1] && nullable[-2]; 1766 --nullable; 1767 break; 1768 1769 case OR: 1770 case ORTOP: 1771 /* The firstpos is the union of the firstpos of each argument. */ 1772 nfirstpos[-2] += nfirstpos[-1]; 1773 --nfirstpos; 1774 1775 /* The lastpos is the union of the lastpos of each argument. */ 1776 nlastpos[-2] += nlastpos[-1]; 1777 --nlastpos; 1778 1779 /* An OR node is nullable if either argument is nullable. */ 1780 nullable[-2] = nullable[-1] || nullable[-2]; 1781 --nullable; 1782 break; 1783 1784 default: 1785 /* Anything else is a nonempty position. (Note that special 1786 constructs like \< are treated as nonempty strings here; 1787 an "epsilon closure" effectively makes them nullable later. 1788 Backreferences have to get a real position so we can detect 1789 transitions on them later. But they are nullable. */ 1790 *nullable++ = d->tokens[i] == BACKREF; 1791 1792 /* This position is in its own firstpos and lastpos. */ 1793 *nfirstpos++ = *nlastpos++ = 1; 1794 --firstpos, --lastpos; 1795 firstpos->index = lastpos->index = i; 1796 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1797 1798 /* Allocate the follow set for this position. */ 1799 nalloc[i] = 1; 1800 MALLOC(d->follows[i].elems, position, nalloc[i]); 1801 break; 1802 } 1803#ifdef DEBUG 1804 /* ... balance the above nonsyntactic #ifdef goo... */ 1805 fprintf(stderr, "node %d:", i); 1806 prtok(d->tokens[i]); 1807 putc('\n', stderr); 1808 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1809 fprintf(stderr, " firstpos:"); 1810 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1811 { 1812 fprintf(stderr, " %d:", firstpos[j].index); 1813 prtok(d->tokens[firstpos[j].index]); 1814 } 1815 fprintf(stderr, "\n lastpos:"); 1816 for (j = nlastpos[-1] - 1; j >= 0; --j) 1817 { 1818 fprintf(stderr, " %d:", lastpos[j].index); 1819 prtok(d->tokens[lastpos[j].index]); 1820 } 1821 putc('\n', stderr); 1822 } 1823#endif 1824 1825 /* For each follow set that is the follow set of a real position, replace 1826 it with its epsilon closure. */ 1827 for (i = 0; i < d->tindex; ++i) 1828 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1829#ifdef MBS_SUPPORT 1830 || d->tokens[i] == ANYCHAR 1831 || d->tokens[i] == MBCSET 1832#endif 1833 || d->tokens[i] >= CSET) 1834 { 1835#ifdef DEBUG 1836 fprintf(stderr, "follows(%d:", i); 1837 prtok(d->tokens[i]); 1838 fprintf(stderr, "):"); 1839 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1840 { 1841 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1842 prtok(d->tokens[d->follows[i].elems[j].index]); 1843 } 1844 putc('\n', stderr); 1845#endif 1846 copy(&d->follows[i], &merged); 1847 epsclosure(&merged, d); 1848 if (d->follows[i].nelem < merged.nelem) 1849 REALLOC(d->follows[i].elems, position, merged.nelem); 1850 copy(&merged, &d->follows[i]); 1851 } 1852 1853 /* Get the epsilon closure of the firstpos of the regexp. The result will 1854 be the set of positions of state 0. */ 1855 merged.nelem = 0; 1856 for (i = 0; i < nfirstpos[-1]; ++i) 1857 insert(firstpos[i], &merged); 1858 epsclosure(&merged, d); 1859 1860 /* Check if any of the positions of state 0 will want newline context. */ 1861 wants_newline = 0; 1862 for (i = 0; i < merged.nelem; ++i) 1863 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1864 wants_newline = 1; 1865 1866 /* Build the initial state. */ 1867 d->salloc = 1; 1868 d->sindex = 0; 1869 MALLOC(d->states, dfa_state, d->salloc); 1870 state_index(d, &merged, wants_newline, 0); 1871 1872 free(o_nullable); 1873 free(o_nfirst); 1874 free(o_firstpos); 1875 free(o_nlast); 1876 free(o_lastpos); 1877 free(nalloc); 1878 free(merged.elems); 1879} 1880 1881/* Find, for each character, the transition out of state s of d, and store 1882 it in the appropriate slot of trans. 1883 1884 We divide the positions of s into groups (positions can appear in more 1885 than one group). Each group is labeled with a set of characters that 1886 every position in the group matches (taking into account, if necessary, 1887 preceding context information of s). For each group, find the union 1888 of the its elements' follows. This set is the set of positions of the 1889 new state. For each character in the group's label, set the transition 1890 on this character to be to a state corresponding to the set's positions, 1891 and its associated backward context information, if necessary. 1892 1893 If we are building a searching matcher, we include the positions of state 1894 0 in every state. 1895 1896 The collection of groups is constructed by building an equivalence-class 1897 partition of the positions of s. 1898 1899 For each position, find the set of characters C that it matches. Eliminate 1900 any characters from C that fail on grounds of backward context. 1901 1902 Search through the groups, looking for a group whose label L has nonempty 1903 intersection with C. If L - C is nonempty, create a new group labeled 1904 L - C and having the same positions as the current group, and set L to 1905 the intersection of L and C. Insert the position in this group, set 1906 C = C - L, and resume scanning. 1907 1908 If after comparing with every group there are characters remaining in C, 1909 create a new group labeled with the characters of C and insert this 1910 position in that group. */ 1911void 1912dfastate (int s, struct dfa *d, int trans[]) 1913{ 1914 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1915 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1916 int ngrps = 0; /* Number of groups actually used. */ 1917 position pos; /* Current position being considered. */ 1918 charclass matches; /* Set of matching characters. */ 1919 int matchesf; /* True if matches is nonempty. */ 1920 charclass intersect; /* Intersection with some label set. */ 1921 int intersectf; /* True if intersect is nonempty. */ 1922 charclass leftovers; /* Stuff in the label that didn't match. */ 1923 int leftoversf; /* True if leftovers is nonempty. */ 1924 static charclass letters; /* Set of characters considered letters. */ 1925 static charclass newline; /* Set of characters that aren't newline. */ 1926 position_set follows; /* Union of the follows of some group. */ 1927 position_set tmp; /* Temporary space for merging sets. */ 1928 int state; /* New state. */ 1929 int wants_newline; /* New state wants to know newline context. */ 1930 int state_newline; /* New state on a newline transition. */ 1931 int wants_letter; /* New state wants to know letter context. */ 1932 int state_letter; /* New state on a letter transition. */ 1933 static int initialized; /* Flag for static initialization. */ 1934#ifdef MBS_SUPPORT 1935 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1936#endif 1937 int i, j, k; 1938 1939 /* Initialize the set of letters, if necessary. */ 1940 if (! initialized) 1941 { 1942 initialized = 1; 1943 for (i = 0; i < NOTCHAR; ++i) 1944 if (IS_WORD_CONSTITUENT(i)) 1945 setbit(i, letters); 1946 setbit(eolbyte, newline); 1947 } 1948 1949 zeroset(matches); 1950 1951 for (i = 0; i < d->states[s].elems.nelem; ++i) 1952 { 1953 pos = d->states[s].elems.elems[i]; 1954 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1955 setbit(d->tokens[pos.index], matches); 1956 else if (d->tokens[pos.index] >= CSET) 1957 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1958#ifdef MBS_SUPPORT 1959 else if (d->tokens[pos.index] == ANYCHAR 1960 || d->tokens[pos.index] == MBCSET) 1961 /* MB_CUR_MAX > 1 */ 1962 { 1963 /* ANYCHAR and MBCSET must match with a single character, so we 1964 must put it to d->states[s].mbps, which contains the positions 1965 which can match with a single character not a byte. */ 1966 if (d->states[s].mbps.nelem == 0) 1967 { 1968 MALLOC(d->states[s].mbps.elems, position, 1969 d->states[s].elems.nelem); 1970 } 1971 insert(pos, &(d->states[s].mbps)); 1972 continue; 1973 } 1974#endif /* MBS_SUPPORT */ 1975 else 1976 continue; 1977 1978 /* Some characters may need to be eliminated from matches because 1979 they fail in the current context. */ 1980 if (pos.constraint != 0xFF) 1981 { 1982 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1983 d->states[s].newline, 1)) 1984 clrbit(eolbyte, matches); 1985 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 1986 d->states[s].newline, 0)) 1987 for (j = 0; j < CHARCLASS_INTS; ++j) 1988 matches[j] &= newline[j]; 1989 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1990 d->states[s].letter, 1)) 1991 for (j = 0; j < CHARCLASS_INTS; ++j) 1992 matches[j] &= ~letters[j]; 1993 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 1994 d->states[s].letter, 0)) 1995 for (j = 0; j < CHARCLASS_INTS; ++j) 1996 matches[j] &= letters[j]; 1997 1998 /* If there are no characters left, there's no point in going on. */ 1999 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 2000 continue; 2001 if (j == CHARCLASS_INTS) 2002 continue; 2003 } 2004 2005 for (j = 0; j < ngrps; ++j) 2006 { 2007 /* If matches contains a single character only, and the current 2008 group's label doesn't contain that character, go on to the 2009 next group. */ 2010 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 2011 && !tstbit(d->tokens[pos.index], labels[j])) 2012 continue; 2013 2014 /* Check if this group's label has a nonempty intersection with 2015 matches. */ 2016 intersectf = 0; 2017 for (k = 0; k < CHARCLASS_INTS; ++k) 2018 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 2019 if (! intersectf) 2020 continue; 2021 2022 /* It does; now find the set differences both ways. */ 2023 leftoversf = matchesf = 0; 2024 for (k = 0; k < CHARCLASS_INTS; ++k) 2025 { 2026 /* Even an optimizing compiler can't know this for sure. */ 2027 int match = matches[k], label = labels[j][k]; 2028 2029 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 2030 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 2031 } 2032 2033 /* If there were leftovers, create a new group labeled with them. */ 2034 if (leftoversf) 2035 { 2036 copyset(leftovers, labels[ngrps]); 2037 copyset(intersect, labels[j]); 2038 MALLOC(grps[ngrps].elems, position, d->nleaves); 2039 copy(&grps[j], &grps[ngrps]); 2040 ++ngrps; 2041 } 2042 2043 /* Put the position in the current group. Note that there is no 2044 reason to call insert() here. */ 2045 grps[j].elems[grps[j].nelem++] = pos; 2046 2047 /* If every character matching the current position has been 2048 accounted for, we're done. */ 2049 if (! matchesf) 2050 break; 2051 } 2052 2053 /* If we've passed the last group, and there are still characters 2054 unaccounted for, then we'll have to create a new group. */ 2055 if (j == ngrps) 2056 { 2057 copyset(matches, labels[ngrps]); 2058 zeroset(matches); 2059 MALLOC(grps[ngrps].elems, position, d->nleaves); 2060 grps[ngrps].nelem = 1; 2061 grps[ngrps].elems[0] = pos; 2062 ++ngrps; 2063 } 2064 } 2065 2066 MALLOC(follows.elems, position, d->nleaves); 2067 MALLOC(tmp.elems, position, d->nleaves); 2068 2069 /* If we are a searching matcher, the default transition is to a state 2070 containing the positions of state 0, otherwise the default transition 2071 is to fail miserably. */ 2072 if (d->searchflag) 2073 { 2074 wants_newline = 0; 2075 wants_letter = 0; 2076 for (i = 0; i < d->states[0].elems.nelem; ++i) 2077 { 2078 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2079 wants_newline = 1; 2080 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2081 wants_letter = 1; 2082 } 2083 copy(&d->states[0].elems, &follows); 2084 state = state_index(d, &follows, 0, 0); 2085 if (wants_newline) 2086 state_newline = state_index(d, &follows, 1, 0); 2087 else 2088 state_newline = state; 2089 if (wants_letter) 2090 state_letter = state_index(d, &follows, 0, 1); 2091 else 2092 state_letter = state; 2093 for (i = 0; i < NOTCHAR; ++i) 2094 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 2095 trans[eolbyte] = state_newline; 2096 } 2097 else 2098 for (i = 0; i < NOTCHAR; ++i) 2099 trans[i] = -1; 2100 2101 for (i = 0; i < ngrps; ++i) 2102 { 2103 follows.nelem = 0; 2104 2105 /* Find the union of the follows of the positions of the group. 2106 This is a hideously inefficient loop. Fix it someday. */ 2107 for (j = 0; j < grps[i].nelem; ++j) 2108 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 2109 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 2110 2111#ifdef MBS_SUPPORT 2112 if (MB_CUR_MAX > 1) 2113 { 2114 /* If a token in follows.elems is not 1st byte of a multibyte 2115 character, or the states of follows must accept the bytes 2116 which are not 1st byte of the multibyte character. 2117 Then, if a state of follows encounter a byte, it must not be 2118 a 1st byte of a multibyte character nor singlebyte character. 2119 We cansel to add state[0].follows to next state, because 2120 state[0] must accept 1st-byte 2121 2122 For example, we assume <sb a> is a certain singlebyte 2123 character, <mb A> is a certain multibyte character, and the 2124 codepoint of <sb a> equals the 2nd byte of the codepoint of 2125 <mb A>. 2126 When state[0] accepts <sb a>, state[i] transit to state[i+1] 2127 by accepting accepts 1st byte of <mb A>, and state[i+1] 2128 accepts 2nd byte of <mb A>, if state[i+1] encounter the 2129 codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2130 <mb A>, so we can not add state[0]. */ 2131 2132 next_isnt_1st_byte = 0; 2133 for (j = 0; j < follows.nelem; ++j) 2134 { 2135 if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2136 { 2137 next_isnt_1st_byte = 1; 2138 break; 2139 } 2140 } 2141 } 2142#endif 2143 2144 /* If we are building a searching matcher, throw in the positions 2145 of state 0 as well. */ 2146#ifdef MBS_SUPPORT 2147 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2148#else 2149 if (d->searchflag) 2150#endif 2151 for (j = 0; j < d->states[0].elems.nelem; ++j) 2152 insert(d->states[0].elems.elems[j], &follows); 2153 2154 /* Find out if the new state will want any context information. */ 2155 wants_newline = 0; 2156 if (tstbit(eolbyte, labels[i])) 2157 for (j = 0; j < follows.nelem; ++j) 2158 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 2159 wants_newline = 1; 2160 2161 wants_letter = 0; 2162 for (j = 0; j < CHARCLASS_INTS; ++j) 2163 if (labels[i][j] & letters[j]) 2164 break; 2165 if (j < CHARCLASS_INTS) 2166 for (j = 0; j < follows.nelem; ++j) 2167 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 2168 wants_letter = 1; 2169 2170 /* Find the state(s) corresponding to the union of the follows. */ 2171 state = state_index(d, &follows, 0, 0); 2172 if (wants_newline) 2173 state_newline = state_index(d, &follows, 1, 0); 2174 else 2175 state_newline = state; 2176 if (wants_letter) 2177 state_letter = state_index(d, &follows, 0, 1); 2178 else 2179 state_letter = state; 2180 2181 /* Set the transitions for each character in the current label. */ 2182 for (j = 0; j < CHARCLASS_INTS; ++j) 2183 for (k = 0; k < INTBITS; ++k) 2184 if (labels[i][j] & 1 << k) 2185 { 2186 int c = j * INTBITS + k; 2187 2188 if (c == eolbyte) 2189 trans[c] = state_newline; 2190 else if (IS_WORD_CONSTITUENT(c)) 2191 trans[c] = state_letter; 2192 else if (c < NOTCHAR) 2193 trans[c] = state; 2194 } 2195 } 2196 2197 for (i = 0; i < ngrps; ++i) 2198 free(grps[i].elems); 2199 free(follows.elems); 2200 free(tmp.elems); 2201} 2202 2203/* Some routines for manipulating a compiled dfa's transition tables. 2204 Each state may or may not have a transition table; if it does, and it 2205 is a non-accepting state, then d->trans[state] points to its table. 2206 If it is an accepting state then d->fails[state] points to its table. 2207 If it has no table at all, then d->trans[state] is NULL. 2208 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 2209 2210static void 2211build_state (int s, struct dfa *d) 2212{ 2213 int *trans; /* The new transition table. */ 2214 int i; 2215 2216 /* Set an upper limit on the number of transition tables that will ever 2217 exist at once. 1024 is arbitrary. The idea is that the frequently 2218 used transition tables will be quickly rebuilt, whereas the ones that 2219 were only needed once or twice will be cleared away. */ 2220 if (d->trcount >= 1024) 2221 { 2222 for (i = 0; i < d->tralloc; ++i) 2223 if (d->trans[i]) 2224 { 2225 free((ptr_t) d->trans[i]); 2226 d->trans[i] = NULL; 2227 } 2228 else if (d->fails[i]) 2229 { 2230 free((ptr_t) d->fails[i]); 2231 d->fails[i] = NULL; 2232 } 2233 d->trcount = 0; 2234 } 2235 2236 ++d->trcount; 2237 2238 /* Set up the success bits for this state. */ 2239 d->success[s] = 0; 2240 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 2241 s, *d)) 2242 d->success[s] |= 4; 2243 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 2244 s, *d)) 2245 d->success[s] |= 2; 2246 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 2247 s, *d)) 2248 d->success[s] |= 1; 2249 2250 MALLOC(trans, int, NOTCHAR); 2251 dfastate(s, d, trans); 2252 2253 /* Now go through the new transition table, and make sure that the trans 2254 and fail arrays are allocated large enough to hold a pointer for the 2255 largest state mentioned in the table. */ 2256 for (i = 0; i < NOTCHAR; ++i) 2257 if (trans[i] >= d->tralloc) 2258 { 2259 int oldalloc = d->tralloc; 2260 2261 while (trans[i] >= d->tralloc) 2262 d->tralloc *= 2; 2263 REALLOC(d->realtrans, int *, d->tralloc + 1); 2264 d->trans = d->realtrans + 1; 2265 REALLOC(d->fails, int *, d->tralloc); 2266 REALLOC(d->success, int, d->tralloc); 2267 while (oldalloc < d->tralloc) 2268 { 2269 d->trans[oldalloc] = NULL; 2270 d->fails[oldalloc++] = NULL; 2271 } 2272 } 2273 2274 /* Newline is a sentinel. */ 2275 trans[eolbyte] = -1; 2276 2277 if (ACCEPTING(s, *d)) 2278 d->fails[s] = trans; 2279 else 2280 d->trans[s] = trans; 2281} 2282 2283static void 2284build_state_zero (struct dfa *d) 2285{ 2286 d->tralloc = 1; 2287 d->trcount = 0; 2288 CALLOC(d->realtrans, int *, d->tralloc + 1); 2289 d->trans = d->realtrans + 1; 2290 CALLOC(d->fails, int *, d->tralloc); 2291 MALLOC(d->success, int, d->tralloc); 2292 build_state(0, d); 2293} 2294 2295#ifdef MBS_SUPPORT 2296/* Multibyte character handling sub-routins for dfaexec. */ 2297 2298/* Initial state may encounter the byte which is not a singlebyte character 2299 nor 1st byte of a multibyte character. But it is incorrect for initial 2300 state to accept such a byte. 2301 For example, in sjis encoding the regular expression like "\\" accepts 2302 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2303 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2304 character nor 1st byte of a multibyte character. */ 2305#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2306 if (s == 0) \ 2307 { \ 2308 while (inputwcs[p - buf_begin] == 0 \ 2309 && mblen_buf[p - buf_begin] > 0 \ 2310 && p < buf_end) \ 2311 ++p; \ 2312 if (p >= end) \ 2313 { \ 2314 free(mblen_buf); \ 2315 free(inputwcs); \ 2316 return (size_t) -1; \ 2317 } \ 2318 } 2319 2320static void 2321realloc_trans_if_necessary(struct dfa *d, int new_state) 2322{ 2323 /* Make sure that the trans and fail arrays are allocated large enough 2324 to hold a pointer for the new state. */ 2325 if (new_state >= d->tralloc) 2326 { 2327 int oldalloc = d->tralloc; 2328 2329 while (new_state >= d->tralloc) 2330 d->tralloc *= 2; 2331 REALLOC(d->realtrans, int *, d->tralloc + 1); 2332 d->trans = d->realtrans + 1; 2333 REALLOC(d->fails, int *, d->tralloc); 2334 REALLOC(d->success, int, d->tralloc); 2335 while (oldalloc < d->tralloc) 2336 { 2337 d->trans[oldalloc] = NULL; 2338 d->fails[oldalloc++] = NULL; 2339 } 2340 } 2341} 2342 2343/* Return values of transit_state_singlebyte(), and 2344 transit_state_consume_1char. */ 2345typedef enum 2346{ 2347 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2348 TRANSIT_STATE_DONE, /* State transition has finished. */ 2349 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2350} status_transit_state; 2351 2352/* Consume a single byte and transit state from 's' to '*next_state'. 2353 This function is almost same as the state transition routin in dfaexec(). 2354 But state transition is done just once, otherwise matching succeed or 2355 reach the end of the buffer. */ 2356static status_transit_state 2357transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2358 int *next_state) 2359{ 2360 int *t; 2361 int works = s; 2362 2363 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2364 2365 while (rval == TRANSIT_STATE_IN_PROGRESS) 2366 { 2367 if ((t = d->trans[works]) != NULL) 2368 { 2369 works = t[*p]; 2370 rval = TRANSIT_STATE_DONE; 2371 if (works < 0) 2372 works = 0; 2373 } 2374 else if (works < 0) 2375 { 2376 if (p == buf_end) 2377 /* At the moment, it must not happen. */ 2378 return TRANSIT_STATE_END_BUFFER; 2379 works = 0; 2380 } 2381 else if (d->fails[works]) 2382 { 2383 works = d->fails[works][*p]; 2384 rval = TRANSIT_STATE_DONE; 2385 } 2386 else 2387 { 2388 build_state(works, d); 2389 } 2390 } 2391 *next_state = works; 2392 return rval; 2393} 2394 2395/* Check whether period can match or not in the current context. If it can, 2396 return the amount of the bytes with which period can match, otherwise 2397 return 0. 2398 `pos' is the position of the period. `index' is the index from the 2399 buf_begin, and it is the current position in the buffer. */ 2400static int 2401match_anychar (struct dfa *d, int s, position pos, int index) 2402{ 2403 int newline = 0; 2404 int letter = 0; 2405 wchar_t wc; 2406 int mbclen; 2407 2408 wc = inputwcs[index]; 2409 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2410 2411 /* Check context. */ 2412 if (wc == (wchar_t)eolbyte) 2413 { 2414 if (!(syntax_bits & RE_DOT_NEWLINE)) 2415 return 0; 2416 newline = 1; 2417 } 2418 else if (wc == (wchar_t)'\0') 2419 { 2420 if (syntax_bits & RE_DOT_NOT_NULL) 2421 return 0; 2422 newline = 1; 2423 } 2424 2425 if (iswalnum(wc) || wc == L'_') 2426 letter = 1; 2427 2428 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2429 newline, d->states[s].letter, letter)) 2430 return 0; 2431 2432 return mbclen; 2433} 2434 2435/* Check whether bracket expression can match or not in the current context. 2436 If it can, return the amount of the bytes with which expression can match, 2437 otherwise return 0. 2438 `pos' is the position of the bracket expression. `index' is the index 2439 from the buf_begin, and it is the current position in the buffer. */ 2440int 2441match_mb_charset (struct dfa *d, int s, position pos, int index) 2442{ 2443 int i; 2444 int match; /* Flag which represent that matching succeed. */ 2445 int match_len; /* Length of the character (or collating element) 2446 with which this operator match. */ 2447 int op_len; /* Length of the operator. */ 2448 char buffer[128]; 2449 wchar_t wcbuf[6]; 2450 2451 /* Pointer to the structure to which we are currently reffering. */ 2452 struct mb_char_classes *work_mbc; 2453 2454 int newline = 0; 2455 int letter = 0; 2456 wchar_t wc; /* Current reffering character. */ 2457 2458 wc = inputwcs[index]; 2459 2460 /* Check context. */ 2461 if (wc == (wchar_t)eolbyte) 2462 { 2463 if (!(syntax_bits & RE_DOT_NEWLINE)) 2464 return 0; 2465 newline = 1; 2466 } 2467 else if (wc == (wchar_t)'\0') 2468 { 2469 if (syntax_bits & RE_DOT_NOT_NULL) 2470 return 0; 2471 newline = 1; 2472 } 2473 if (iswalnum(wc) || wc == L'_') 2474 letter = 1; 2475 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2476 newline, d->states[s].letter, letter)) 2477 return 0; 2478 2479 /* Assign the current reffering operator to work_mbc. */ 2480 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2481 match = !work_mbc->invert; 2482 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2483 2484 /* match with a character class? */ 2485 for (i = 0; i<work_mbc->nch_classes; i++) 2486 { 2487 if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2488 goto charset_matched; 2489 } 2490 2491 strncpy(buffer, (const char *) (buf_begin + index), match_len); 2492 buffer[match_len] = '\0'; 2493 2494 /* match with an equivalent class? */ 2495 for (i = 0; i<work_mbc->nequivs; i++) 2496 { 2497 op_len = strlen(work_mbc->equivs[i]); 2498 strncpy(buffer, (const char *) (buf_begin + index), op_len); 2499 buffer[op_len] = '\0'; 2500 if (strcoll(work_mbc->equivs[i], buffer) == 0) 2501 { 2502 match_len = op_len; 2503 goto charset_matched; 2504 } 2505 } 2506 2507 /* match with a collating element? */ 2508 for (i = 0; i<work_mbc->ncoll_elems; i++) 2509 { 2510 op_len = strlen(work_mbc->coll_elems[i]); 2511 strncpy(buffer, (const char *) (buf_begin + index), op_len); 2512 buffer[op_len] = '\0'; 2513 2514 if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2515 { 2516 match_len = op_len; 2517 goto charset_matched; 2518 } 2519 } 2520 2521 wcbuf[0] = wc; 2522 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2523 2524 /* match with a range? */ 2525 for (i = 0; i<work_mbc->nranges; i++) 2526 { 2527 wcbuf[2] = work_mbc->range_sts[i]; 2528 wcbuf[4] = work_mbc->range_ends[i]; 2529 2530 if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2531 wcscoll(wcbuf+4, wcbuf) >= 0) 2532 goto charset_matched; 2533 } 2534 2535 /* match with a character? */ 2536 for (i = 0; i<work_mbc->nchars; i++) 2537 { 2538 if (wc == work_mbc->chars[i]) 2539 goto charset_matched; 2540 } 2541 2542 match = !match; 2543 2544 charset_matched: 2545 return match ? match_len : 0; 2546} 2547 2548/* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2549 array which corresponds to `d->states[s].mbps.elem' and each element of 2550 the array contains the amount of the bytes with which the element can 2551 match. 2552 `index' is the index from the buf_begin, and it is the current position 2553 in the buffer. 2554 Caller MUST free the array which this function return. */ 2555static int* 2556check_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2557{ 2558 int i; 2559 int* rarray; 2560 2561 MALLOC(rarray, int, d->states[s].mbps.nelem); 2562 for (i = 0; i < d->states[s].mbps.nelem; ++i) 2563 { 2564 position pos = d->states[s].mbps.elems[i]; 2565 switch(d->tokens[pos.index]) 2566 { 2567 case ANYCHAR: 2568 rarray[i] = match_anychar(d, s, pos, index); 2569 break; 2570 case MBCSET: 2571 rarray[i] = match_mb_charset(d, s, pos, index); 2572 break; 2573 default: 2574 break; /* can not happen. */ 2575 } 2576 } 2577 return rarray; 2578} 2579 2580/* Consume a single character and enumerate all of the positions which can 2581 be next position from the state `s'. 2582 `match_lens' is the input. It can be NULL, but it can also be the output 2583 of check_matching_with_multibyte_ops() for optimization. 2584 `mbclen' and `pps' are the output. `mbclen' is the length of the 2585 character consumed, and `pps' is the set this function enumerate. */ 2586static status_transit_state 2587transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2588 int *match_lens, int *mbclen, position_set *pps) 2589{ 2590 int i, j; 2591 int s1, s2; 2592 int* work_mbls; 2593 status_transit_state rs = TRANSIT_STATE_DONE; 2594 2595 /* Calculate the length of the (single/multi byte) character 2596 to which p points. */ 2597 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2598 : mblen_buf[*pp - buf_begin]; 2599 2600 /* Calculate the state which can be reached from the state `s' by 2601 consuming `*mbclen' single bytes from the buffer. */ 2602 s1 = s; 2603 for (i = 0; i < *mbclen; i++) 2604 { 2605 s2 = s1; 2606 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2607 } 2608 /* Copy the positions contained by `s1' to the set `pps'. */ 2609 copy(&(d->states[s1].elems), pps); 2610 2611 /* Check (inputed)match_lens, and initialize if it is NULL. */ 2612 if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2613 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2614 else 2615 work_mbls = match_lens; 2616 2617 /* Add all of the positions which can be reached from `s' by consuming 2618 a single character. */ 2619 for (i = 0; i < d->states[s].mbps.nelem ; i++) 2620 { 2621 if (work_mbls[i] == *mbclen) 2622 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2623 j++) 2624 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2625 pps); 2626 } 2627 2628 if (match_lens == NULL && work_mbls != NULL) 2629 free(work_mbls); 2630 return rs; 2631} 2632 2633/* Transit state from s, then return new state and update the pointer of the 2634 buffer. This function is for some operator which can match with a multi- 2635 byte character or a collating element(which may be multi characters). */ 2636static int 2637transit_state (struct dfa *d, int s, unsigned char const **pp) 2638{ 2639 int s1; 2640 int mbclen; /* The length of current input multibyte character. */ 2641 int maxlen = 0; 2642 int i, j; 2643 int *match_lens = NULL; 2644 int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2645 position_set follows; 2646 unsigned char const *p1 = *pp; 2647 status_transit_state rs; 2648 wchar_t wc; 2649 2650 if (nelem > 0) 2651 /* This state has (a) multibyte operator(s). 2652 We check whether each of them can match or not. */ 2653 { 2654 /* Note: caller must free the return value of this function. */ 2655 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2656 2657 for (i = 0; i < nelem; i++) 2658 /* Search the operator which match the longest string, 2659 in this state. */ 2660 { 2661 if (match_lens[i] > maxlen) 2662 maxlen = match_lens[i]; 2663 } 2664 } 2665 2666 if (nelem == 0 || maxlen == 0) 2667 /* This state has no multibyte operator which can match. 2668 We need to check only one singlebyte character. */ 2669 { 2670 status_transit_state rs; 2671 rs = transit_state_singlebyte(d, s, *pp, &s1); 2672 2673 /* We must update the pointer if state transition succeeded. */ 2674 if (rs == TRANSIT_STATE_DONE) 2675 ++*pp; 2676 2677 if (match_lens != NULL) 2678 free(match_lens); 2679 return s1; 2680 } 2681 2682 /* This state has some operators which can match a multibyte character. */ 2683 follows.nelem = 0; 2684 MALLOC(follows.elems, position, d->nleaves); 2685 2686 /* `maxlen' may be longer than the length of a character, because it may 2687 not be a character but a (multi character) collating element. 2688 We enumerate all of the positions which `s' can reach by consuming 2689 `maxlen' bytes. */ 2690 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2691 2692 wc = inputwcs[*pp - mbclen - buf_begin]; 2693 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2694 realloc_trans_if_necessary(d, s1); 2695 2696 while (*pp - p1 < maxlen) 2697 { 2698 follows.nelem = 0; 2699 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2700 2701 for (i = 0; i < nelem ; i++) 2702 { 2703 if (match_lens[i] == *pp - p1) 2704 for (j = 0; 2705 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2706 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2707 &follows); 2708 } 2709 2710 wc = inputwcs[*pp - mbclen - buf_begin]; 2711 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2712 realloc_trans_if_necessary(d, s1); 2713 } 2714 free(match_lens); 2715 free(follows.elems); 2716 return s1; 2717} 2718 2719#endif 2720 2721/* Search through a buffer looking for a match to the given struct dfa. 2722 Find the first occurrence of a string matching the regexp in the buffer, 2723 and the shortest possible version thereof. Return the offset of the first 2724 character after the match, or (size_t) -1 if none is found. BEGIN points to 2725 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2726 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 2727 where we're supposed to store a 1 if backreferencing happened and the 2728 match needs to be verified by a backtracking matcher. Otherwise 2729 we store a 0 in *backref. */ 2730size_t 2731dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 2732{ 2733 register int s; /* Current state. */ 2734 register unsigned char const *p; /* Current input character. */ 2735 register unsigned char const *end; /* One past the last input character. */ 2736 register int **trans, *t; /* Copy of d->trans so it can be optimized 2737 into a register. */ 2738 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 2739 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 2740 static int sbit_init; 2741 2742 if (! sbit_init) 2743 { 2744 int i; 2745 2746 sbit_init = 1; 2747 for (i = 0; i < NOTCHAR; ++i) 2748 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 2749 sbit[eol] = 4; 2750 } 2751 2752 if (! d->tralloc) 2753 build_state_zero(d); 2754 2755 s = 0; 2756 p = (unsigned char const *) begin; 2757 end = p + size; 2758 trans = d->trans; 2759 2760#ifdef MBS_SUPPORT 2761 if (MB_CUR_MAX > 1) 2762 { 2763 int remain_bytes, i; 2764 buf_begin = (unsigned char const *)begin; 2765 buf_end = end; 2766 2767 /* initialize mblen_buf, and inputwcs. */ 2768 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2769 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2770 memset(&mbs, 0, sizeof(mbstate_t)); 2771 remain_bytes = 0; 2772 for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2773 { 2774 if (remain_bytes == 0) 2775 { 2776 remain_bytes 2777 = mbrtowc(inputwcs + i, begin + i, 2778 end - (unsigned char const *)begin - i + 1, &mbs); 2779 if (remain_bytes <= 1) 2780 { 2781 remain_bytes = 0; 2782 inputwcs[i] = (wchar_t)begin[i]; 2783 mblen_buf[i] = 0; 2784 } 2785 else 2786 { 2787 mblen_buf[i] = remain_bytes; 2788 remain_bytes--; 2789 } 2790 } 2791 else 2792 { 2793 mblen_buf[i] = remain_bytes; 2794 inputwcs[i] = 0; 2795 remain_bytes--; 2796 } 2797 } 2798 mblen_buf[i] = 0; 2799 inputwcs[i] = 0; /* sentinel */ 2800 } 2801#endif /* MBS_SUPPORT */ 2802 2803 for (;;) 2804 { 2805#ifdef MBS_SUPPORT 2806 if (MB_CUR_MAX > 1) 2807 while ((t = trans[s])) 2808 { 2809 if (p == end) 2810 { 2811 free(mblen_buf); 2812 free(inputwcs); 2813 return (size_t) -1; 2814 } 2815 if (d->states[s].mbps.nelem != 0) 2816 { 2817 /* Can match with a multibyte character( and multi character 2818 collating element). */ 2819 unsigned char const *nextp; 2820 2821 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2822 2823 nextp = p; 2824 s = transit_state(d, s, &nextp); 2825 p = nextp; 2826 2827 /* Trans table might be updated. */ 2828 trans = d->trans; 2829 } 2830 else 2831 { 2832 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2833 s = t[*p++]; 2834 } 2835 } 2836 else 2837#endif /* MBS_SUPPORT */ 2838 while ((t = trans[s])) 2839 { 2840 if (p == end) 2841 return (size_t) -1; 2842 s = t[*p++]; 2843 } 2844 2845 if (s < 0) 2846 { 2847 if (p == end) 2848 { 2849#ifdef MBS_SUPPORT 2850 if (MB_CUR_MAX > 1) 2851 { 2852 free(mblen_buf); 2853 free(inputwcs); 2854 } 2855#endif /* MBS_SUPPORT */ 2856 return (size_t) -1; 2857 } 2858 s = 0; 2859 } 2860 else if ((t = d->fails[s])) 2861 { 2862 if (d->success[s] & sbit[*p]) 2863 { 2864 if (backref) 2865 *backref = (d->states[s].backref != 0); 2866#ifdef MBS_SUPPORT 2867 if (MB_CUR_MAX > 1) 2868 { 2869 free(mblen_buf); 2870 free(inputwcs); 2871 } 2872#endif /* MBS_SUPPORT */ 2873 return (char const *) p - begin; 2874 } 2875 2876#ifdef MBS_SUPPORT 2877 if (MB_CUR_MAX > 1) 2878 { 2879 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2880 if (d->states[s].mbps.nelem != 0) 2881 { 2882 /* Can match with a multibyte character( and multi 2883 character collating element). */ 2884 unsigned char const *nextp; 2885 nextp = p; 2886 s = transit_state(d, s, &nextp); 2887 p = nextp; 2888 2889 /* Trans table might be updated. */ 2890 trans = d->trans; 2891 } 2892 else 2893 s = t[*p++]; 2894 } 2895 else 2896#endif /* MBS_SUPPORT */ 2897 s = t[*p++]; 2898 } 2899 else 2900 { 2901 build_state(s, d); 2902 trans = d->trans; 2903 } 2904 } 2905} 2906 2907/* Initialize the components of a dfa that the other routines don't 2908 initialize for themselves. */ 2909void 2910dfainit (struct dfa *d) 2911{ 2912 d->calloc = 1; 2913 MALLOC(d->charclasses, charclass, d->calloc); 2914 d->cindex = 0; 2915 2916 d->talloc = 1; 2917 MALLOC(d->tokens, token, d->talloc); 2918 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2919#ifdef MBS_SUPPORT 2920 if (MB_CUR_MAX > 1) 2921 { 2922 d->nmultibyte_prop = 1; 2923 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2924 d->nmbcsets = 0; 2925 d->mbcsets_alloc = 1; 2926 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2927 } 2928#endif 2929 2930 d->searchflag = 0; 2931 d->tralloc = 0; 2932 2933 d->musts = 0; 2934} 2935 2936/* Parse and analyze a single string of the given length. */ 2937void 2938dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 2939{ 2940 if (case_fold) /* dummy folding in service of dfamust() */ 2941 { 2942 char *lcopy; 2943 int i; 2944 2945 lcopy = (char *) malloc(len); 2946 if (!lcopy) 2947 dfaerror(_("out of memory")); 2948 2949 /* This is a kludge. */ 2950 case_fold = 0; 2951 for (i = 0; i < len; ++i) 2952 if (ISUPPER ((unsigned char) s[i])) 2953 lcopy[i] = tolower ((unsigned char) s[i]); 2954 else 2955 lcopy[i] = s[i]; 2956 2957 dfainit(d); 2958 dfaparse(lcopy, len, d); 2959 free(lcopy); 2960 dfamust(d); 2961 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2962 case_fold = 1; 2963 dfaparse(s, len, d); 2964 dfaanalyze(d, searchflag); 2965 } 2966 else 2967 { 2968 dfainit(d); 2969 dfaparse(s, len, d); 2970 dfamust(d); 2971 dfaanalyze(d, searchflag); 2972 } 2973} 2974 2975/* Free the storage held by the components of a dfa. */ 2976void 2977dfafree (struct dfa *d) 2978{ 2979 int i; 2980 struct dfamust *dm, *ndm; 2981 2982 free((ptr_t) d->charclasses); 2983 free((ptr_t) d->tokens); 2984 2985#ifdef MBS_SUPPORT 2986 if (MB_CUR_MAX > 1) 2987 { 2988 free((ptr_t) d->multibyte_prop); 2989 for (i = 0; i < d->nmbcsets; ++i) 2990 { 2991 int j; 2992 struct mb_char_classes *p = &(d->mbcsets[i]); 2993 if (p->chars != NULL) 2994 free(p->chars); 2995 if (p->ch_classes != NULL) 2996 free(p->ch_classes); 2997 if (p->range_sts != NULL) 2998 free(p->range_sts); 2999 if (p->range_ends != NULL) 3000 free(p->range_ends); 3001 3002 for (j = 0; j < p->nequivs; ++j) 3003 free(p->equivs[j]); 3004 if (p->equivs != NULL) 3005 free(p->equivs); 3006 3007 for (j = 0; j < p->ncoll_elems; ++j) 3008 free(p->coll_elems[j]); 3009 if (p->coll_elems != NULL) 3010 free(p->coll_elems); 3011 } 3012 free((ptr_t) d->mbcsets); 3013 } 3014#endif /* MBS_SUPPORT */ 3015 3016 for (i = 0; i < d->sindex; ++i) 3017 free((ptr_t) d->states[i].elems.elems); 3018 free((ptr_t) d->states); 3019 for (i = 0; i < d->tindex; ++i) 3020 if (d->follows[i].elems) 3021 free((ptr_t) d->follows[i].elems); 3022 free((ptr_t) d->follows); 3023 for (i = 0; i < d->tralloc; ++i) 3024 if (d->trans[i]) 3025 free((ptr_t) d->trans[i]); 3026 else if (d->fails[i]) 3027 free((ptr_t) d->fails[i]); 3028 if (d->realtrans) free((ptr_t) d->realtrans); 3029 if (d->fails) free((ptr_t) d->fails); 3030 if (d->success) free((ptr_t) d->success); 3031 for (dm = d->musts; dm; dm = ndm) 3032 { 3033 ndm = dm->next; 3034 free(dm->must); 3035 free((ptr_t) dm); 3036 } 3037} 3038 3039/* Having found the postfix representation of the regular expression, 3040 try to find a long sequence of characters that must appear in any line 3041 containing the r.e. 3042 Finding a "longest" sequence is beyond the scope here; 3043 we take an easy way out and hope for the best. 3044 (Take "(ab|a)b"--please.) 3045 3046 We do a bottom-up calculation of sequences of characters that must appear 3047 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3048 representation: 3049 sequences that must appear at the left of the match ("left") 3050 sequences that must appear at the right of the match ("right") 3051 lists of sequences that must appear somewhere in the match ("in") 3052 sequences that must constitute the match ("is") 3053 3054 When we get to the root of the tree, we use one of the longest of its 3055 calculated "in" sequences as our answer. The sequence we find is returned in 3056 d->must (where "d" is the single argument passed to "dfamust"); 3057 the length of the sequence is returned in d->mustn. 3058 3059 The sequences calculated for the various types of node (in pseudo ANSI c) 3060 are shown below. "p" is the operand of unary operators (and the left-hand 3061 operand of binary operators); "q" is the right-hand operand of binary 3062 operators. 3063 3064 "ZERO" means "a zero-length sequence" below. 3065 3066 Type left right is in 3067 ---- ---- ----- -- -- 3068 char c # c # c # c # c 3069 3070 ANYCHAR ZERO ZERO ZERO ZERO 3071 3072 MBCSET ZERO ZERO ZERO ZERO 3073 3074 CSET ZERO ZERO ZERO ZERO 3075 3076 STAR ZERO ZERO ZERO ZERO 3077 3078 QMARK ZERO ZERO ZERO ZERO 3079 3080 PLUS p->left p->right ZERO p->in 3081 3082 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3083 p->left : q->right : q->is!=ZERO) ? q->in plus 3084 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3085 ZERO 3086 3087 OR longest common longest common (do p->is and substrings common to 3088 leading trailing q->is have same p->in and q->in 3089 (sub)sequence (sub)sequence length and 3090 of p->left of p->right content) ? 3091 and q->left and q->right p->is : NULL 3092 3093 If there's anything else we recognize in the tree, all four sequences get set 3094 to zero-length sequences. If there's something we don't recognize in the tree, 3095 we just return a zero-length sequence. 3096 3097 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3098 'aaa')? 3099 3100 And. . .is it here or someplace that we might ponder "optimizations" such as 3101 egrep 'psi|epsilon' -> egrep 'psi' 3102 egrep 'pepsi|epsilon' -> egrep 'epsi' 3103 (Yes, we now find "epsi" as a "string 3104 that must occur", but we might also 3105 simplify the *entire* r.e. being sought) 3106 grep '[c]' -> grep 'c' 3107 grep '(ab|a)b' -> grep 'ab' 3108 grep 'ab*' -> grep 'a' 3109 grep 'a*b' -> grep 'b' 3110 3111 There are several issues: 3112 3113 Is optimization easy (enough)? 3114 3115 Does optimization actually accomplish anything, 3116 or is the automaton you get from "psi|epsilon" (for example) 3117 the same as the one you get from "psi" (for example)? 3118 3119 Are optimizable r.e.'s likely to be used in real-life situations 3120 (something like 'ab*' is probably unlikely; something like is 3121 'psi|epsilon' is likelier)? */ 3122 3123static char * 3124icatalloc (char *stem, char *suffix) 3125{ 3126 char *result; 3127 size_t stemsize, suffixsize; 3128 3129 suffixsize = (suffix == NULL) ? 0 : strlen(suffix); 3130 if (stem == NULL) 3131 stemsize = 0; 3132 else if (suffixsize == 0) 3133 return stem; 3134 else stemsize = strlen(stem); 3135 if (stem == NULL) 3136 result = (char *) malloc(suffixsize + 1); 3137 else 3138 result = (char *) realloc((void *) stem, stemsize + suffixsize + 1); 3139 if (result != NULL && suffix != NULL) 3140 (void) strcpy(result + stemsize, suffix); 3141 return result; 3142} 3143 3144static char * 3145icpyalloc (char *string) 3146{ 3147 return icatalloc((char *) NULL, string); 3148} 3149 3150static char * 3151istrstr (char *lookin, char *lookfor) 3152{ 3153 char *cp; 3154 size_t len; 3155 3156 len = strlen(lookfor); 3157 for (cp = lookin; *cp != '\0'; ++cp) 3158 if (strncmp(cp, lookfor, len) == 0) 3159 return cp; 3160 return NULL; 3161} 3162 3163static void 3164ifree (char *cp) 3165{ 3166 if (cp != NULL) 3167 free(cp); 3168} 3169 3170static void 3171freelist (char **cpp) 3172{ 3173 int i; 3174 3175 if (cpp == NULL) 3176 return; 3177 for (i = 0; cpp[i] != NULL; ++i) 3178 { 3179 free(cpp[i]); 3180 cpp[i] = NULL; 3181 } 3182} 3183 3184static char ** 3185enlist (char **cpp, char *suffix, size_t len) 3186{ 3187 int i, j; 3188 3189 if (cpp == NULL) 3190 return NULL; 3191 if ((suffix = icpyalloc(suffix)) == NULL) 3192 { 3193 freelist(cpp); 3194 return NULL; 3195 } 3196 suffix[len] = '\0'; 3197 /* Is there already something in the list that's suffix (or longer)? */ 3198 for (i = 0; cpp[i] != NULL; ++i) 3199 if (istrstr(cpp[i], suffix) != NULL) 3200 { 3201 free(suffix); 3202 return cpp; 3203 } 3204 /* Eliminate any obsoleted strings. */ 3205 j = 0; 3206 while (cpp[j] != NULL) 3207 if (istrstr(suffix, cpp[j]) == NULL) 3208 ++j; 3209 else 3210 { 3211 free(cpp[j]); 3212 if (--i == j) 3213 break; 3214 cpp[j] = cpp[i]; 3215 cpp[i] = NULL; 3216 } 3217 /* Add the new string. */ 3218 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 3219 if (cpp == NULL) 3220 return NULL; 3221 cpp[i] = suffix; 3222 cpp[i + 1] = NULL; 3223 return cpp; 3224} 3225 3226/* Given pointers to two strings, return a pointer to an allocated 3227 list of their distinct common substrings. Return NULL if something 3228 seems wild. */ 3229static char ** 3230comsubs (char *left, char *right) 3231{ 3232 char **cpp; 3233 char *lcp; 3234 char *rcp; 3235 size_t i, len; 3236 3237 if (left == NULL || right == NULL) 3238 return NULL; 3239 cpp = (char **) malloc(sizeof *cpp); 3240 if (cpp == NULL) 3241 return NULL; 3242 cpp[0] = NULL; 3243 for (lcp = left; *lcp != '\0'; ++lcp) 3244 { 3245 len = 0; 3246 rcp = strchr (right, *lcp); 3247 while (rcp != NULL) 3248 { 3249 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3250 continue; 3251 if (i > len) 3252 len = i; 3253 rcp = strchr (rcp + 1, *lcp); 3254 } 3255 if (len == 0) 3256 continue; 3257 if ((cpp = enlist(cpp, lcp, len)) == NULL) 3258 break; 3259 } 3260 return cpp; 3261} 3262 3263static char ** 3264addlists (char **old, char **suffixes) 3265{ 3266 int i; 3267 3268 if (old == NULL || suffixes == NULL) 3269 return NULL; 3270 for (i = 0; suffixes[i] != NULL; ++i) 3271 { 3272 old = enlist(old, suffixes[i], strlen(suffixes[i])); 3273 if (old == NULL) 3274 break; 3275 } 3276 return old; 3277} 3278 3279/* Given two lists of substrings, return a new list giving substrings 3280 common to both. */ 3281static char ** 3282inboth (char **left, char **right) 3283{ 3284 char **both; 3285 char **temp; 3286 int lnum, rnum; 3287 3288 if (left == NULL || right == NULL) 3289 return NULL; 3290 both = (char **) malloc(sizeof *both); 3291 if (both == NULL) 3292 return NULL; 3293 both[0] = NULL; 3294 for (lnum = 0; left[lnum] != NULL; ++lnum) 3295 { 3296 for (rnum = 0; right[rnum] != NULL; ++rnum) 3297 { 3298 temp = comsubs(left[lnum], right[rnum]); 3299 if (temp == NULL) 3300 { 3301 freelist(both); 3302 return NULL; 3303 } 3304 both = addlists(both, temp); 3305 freelist(temp); 3306 free(temp); 3307 if (both == NULL) 3308 return NULL; 3309 } 3310 } 3311 return both; 3312} 3313 3314typedef struct 3315{ 3316 char **in; 3317 char *left; 3318 char *right; 3319 char *is; 3320} must; 3321 3322static void 3323resetmust (must *mp) 3324{ 3325 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 3326 freelist(mp->in); 3327} 3328 3329static void 3330dfamust (struct dfa *dfa) 3331{ 3332 must *musts; 3333 must *mp; 3334 char *result; 3335 int ri; 3336 int i; 3337 int exact; 3338 token t; 3339 static must must0; 3340 struct dfamust *dm; 3341 static char empty_string[] = ""; 3342 3343 result = empty_string; 3344 exact = 0; 3345 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 3346 if (musts == NULL) 3347 return; 3348 mp = musts; 3349 for (i = 0; i <= dfa->tindex; ++i) 3350 mp[i] = must0; 3351 for (i = 0; i <= dfa->tindex; ++i) 3352 { 3353 mp[i].in = (char **) malloc(sizeof *mp[i].in); 3354 mp[i].left = (char *) malloc(2); 3355 mp[i].right = (char *) malloc(2); 3356 mp[i].is = (char *) malloc(2); 3357 if (mp[i].in == NULL || mp[i].left == NULL || 3358 mp[i].right == NULL || mp[i].is == NULL) 3359 goto done; 3360 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 3361 mp[i].in[0] = NULL; 3362 } 3363#ifdef DEBUG 3364 fprintf(stderr, "dfamust:\n"); 3365 for (i = 0; i < dfa->tindex; ++i) 3366 { 3367 fprintf(stderr, " %d:", i); 3368 prtok(dfa->tokens[i]); 3369 } 3370 putc('\n', stderr); 3371#endif 3372 for (ri = 0; ri < dfa->tindex; ++ri) 3373 { 3374 switch (t = dfa->tokens[ri]) 3375 { 3376 case LPAREN: 3377 case RPAREN: 3378 goto done; /* "cannot happen" */ 3379 case EMPTY: 3380 case BEGLINE: 3381 case ENDLINE: 3382 case BEGWORD: 3383 case ENDWORD: 3384 case LIMWORD: 3385 case NOTLIMWORD: 3386 case BACKREF: 3387 resetmust(mp); 3388 break; 3389 case STAR: 3390 case QMARK: 3391 if (mp <= musts) 3392 goto done; /* "cannot happen" */ 3393 --mp; 3394 resetmust(mp); 3395 break; 3396 case OR: 3397 case ORTOP: 3398 if (mp < &musts[2]) 3399 goto done; /* "cannot happen" */ 3400 { 3401 char **common; 3402 must *lmp; 3403 must *rmp; 3404 int j, ln, rn, n; 3405 3406 rmp = --mp; 3407 lmp = --mp; 3408 /* Guaranteed to be. Unlikely, but. . . */ 3409 if (strcmp(lmp->is, rmp->is) != 0) 3410 lmp->is[0] = '\0'; 3411 /* Left side--easy */ 3412 i = 0; 3413 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 3414 ++i; 3415 lmp->left[i] = '\0'; 3416 /* Right side */ 3417 ln = strlen(lmp->right); 3418 rn = strlen(rmp->right); 3419 n = ln; 3420 if (n > rn) 3421 n = rn; 3422 for (i = 0; i < n; ++i) 3423 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 3424 break; 3425 for (j = 0; j < i; ++j) 3426 lmp->right[j] = lmp->right[(ln - i) + j]; 3427 lmp->right[j] = '\0'; 3428 common = inboth(lmp->in, rmp->in); 3429 if (common == NULL) 3430 goto done; 3431 freelist(lmp->in); 3432 free((char *) lmp->in); 3433 lmp->in = common; 3434 } 3435 break; 3436 case PLUS: 3437 if (mp <= musts) 3438 goto done; /* "cannot happen" */ 3439 --mp; 3440 mp->is[0] = '\0'; 3441 break; 3442 case END: 3443 if (mp != &musts[1]) 3444 goto done; /* "cannot happen" */ 3445 for (i = 0; musts[0].in[i] != NULL; ++i) 3446 if (strlen(musts[0].in[i]) > strlen(result)) 3447 result = musts[0].in[i]; 3448 if (strcmp(result, musts[0].is) == 0) 3449 exact = 1; 3450 goto done; 3451 case CAT: 3452 if (mp < &musts[2]) 3453 goto done; /* "cannot happen" */ 3454 { 3455 must *lmp; 3456 must *rmp; 3457 3458 rmp = --mp; 3459 lmp = --mp; 3460 /* In. Everything in left, plus everything in 3461 right, plus catenation of 3462 left's right and right's left. */ 3463 lmp->in = addlists(lmp->in, rmp->in); 3464 if (lmp->in == NULL) 3465 goto done; 3466 if (lmp->right[0] != '\0' && 3467 rmp->left[0] != '\0') 3468 { 3469 char *tp; 3470 3471 tp = icpyalloc(lmp->right); 3472 if (tp == NULL) 3473 goto done; 3474 tp = icatalloc(tp, rmp->left); 3475 if (tp == NULL) 3476 goto done; 3477 lmp->in = enlist(lmp->in, tp, 3478 strlen(tp)); 3479 free(tp); 3480 if (lmp->in == NULL) 3481 goto done; 3482 } 3483 /* Left-hand */ 3484 if (lmp->is[0] != '\0') 3485 { 3486 lmp->left = icatalloc(lmp->left, 3487 rmp->left); 3488 if (lmp->left == NULL) 3489 goto done; 3490 } 3491 /* Right-hand */ 3492 if (rmp->is[0] == '\0') 3493 lmp->right[0] = '\0'; 3494 lmp->right = icatalloc(lmp->right, rmp->right); 3495 if (lmp->right == NULL) 3496 goto done; 3497 /* Guaranteed to be */ 3498 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 3499 { 3500 lmp->is = icatalloc(lmp->is, rmp->is); 3501 if (lmp->is == NULL) 3502 goto done; 3503 } 3504 else 3505 lmp->is[0] = '\0'; 3506 } 3507 break; 3508 default: 3509 if (t < END) 3510 { 3511 /* "cannot happen" */ 3512 goto done; 3513 } 3514 else if (t == '\0') 3515 { 3516 /* not on *my* shift */ 3517 goto done; 3518 } 3519 else if (t >= CSET 3520#ifdef MBS_SUPPORT 3521 || t == ANYCHAR 3522 || t == MBCSET 3523#endif /* MBS_SUPPORT */ 3524 ) 3525 { 3526 /* easy enough */ 3527 resetmust(mp); 3528 } 3529 else 3530 { 3531 /* plain character */ 3532 resetmust(mp); 3533 mp->is[0] = mp->left[0] = mp->right[0] = t; 3534 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 3535 mp->in = enlist(mp->in, mp->is, (size_t)1); 3536 if (mp->in == NULL) 3537 goto done; 3538 } 3539 break; 3540 } 3541#ifdef DEBUG 3542 fprintf(stderr, " node: %d:", ri); 3543 prtok(dfa->tokens[ri]); 3544 fprintf(stderr, "\n in:"); 3545 for (i = 0; mp->in[i]; ++i) 3546 fprintf(stderr, " \"%s\"", mp->in[i]); 3547 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 3548 fprintf(stderr, " left: \"%s\"\n", mp->left); 3549 fprintf(stderr, " right: \"%s\"\n", mp->right); 3550#endif 3551 ++mp; 3552 } 3553 done: 3554 if (strlen(result)) 3555 { 3556 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 3557 dm->exact = exact; 3558 dm->must = (char *) malloc(strlen(result) + 1); 3559 strcpy(dm->must, result); 3560 dm->next = dfa->musts; 3561 dfa->musts = dm; 3562 } 3563 mp = musts; 3564 for (i = 0; i <= dfa->tindex; ++i) 3565 { 3566 freelist(mp[i].in); 3567 ifree((char *) mp[i].in); 3568 ifree(mp[i].left); 3569 ifree(mp[i].right); 3570 ifree(mp[i].is); 3571 } 3572 free((char *) mp); 3573} 3574/* vim:set shiftwidth=2: */ 3575