1/* dfa.c - deterministic extended regexp routines for GNU 2 Copyright 1988, 1998, 2000 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ 17 18/* Written June, 1988 by Mike Haertel 19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 20 21/* $FreeBSD: releng/10.2/gnu/usr.bin/grep/dfa.c 250823 2013-05-20 03:15:25Z pfg $ */ 22 23#ifdef HAVE_CONFIG_H 24#include <config.h> 25#endif 26 27#include <assert.h> 28#include <ctype.h> 29#include <stdio.h> 30 31#include <sys/types.h> 32#ifdef STDC_HEADERS 33#include <stdlib.h> 34#else 35extern char *calloc(), *malloc(), *realloc(); 36extern void free(); 37#endif 38 39#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 40#include <string.h> 41#else 42#include <strings.h> 43#endif 44 45#if HAVE_SETLOCALE 46# include <locale.h> 47#endif 48 49#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC 50/* We can handle multibyte string. */ 51# define MBS_SUPPORT 52#endif 53 54#ifdef MBS_SUPPORT 55# include <wchar.h> 56# include <wctype.h> 57#endif 58 59#ifndef DEBUG /* use the same approach as regex.c */ 60#undef assert 61#define assert(e) 62#endif /* DEBUG */ 63 64#ifndef isgraph 65#define isgraph(C) (isprint(C) && !isspace(C)) 66#endif 67 68#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 69#define ISALPHA(C) isalpha(C) 70#define ISUPPER(C) isupper(C) 71#define ISLOWER(C) islower(C) 72#define ISDIGIT(C) isdigit(C) 73#define ISXDIGIT(C) isxdigit(C) 74#define ISSPACE(C) isspace(C) 75#define ISPUNCT(C) ispunct(C) 76#define ISALNUM(C) isalnum(C) 77#define ISPRINT(C) isprint(C) 78#define ISGRAPH(C) isgraph(C) 79#define ISCNTRL(C) iscntrl(C) 80#else 81#define ISALPHA(C) (isascii(C) && isalpha(C)) 82#define ISUPPER(C) (isascii(C) && isupper(C)) 83#define ISLOWER(C) (isascii(C) && islower(C)) 84#define ISDIGIT(C) (isascii(C) && isdigit(C)) 85#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) 86#define ISSPACE(C) (isascii(C) && isspace(C)) 87#define ISPUNCT(C) (isascii(C) && ispunct(C)) 88#define ISALNUM(C) (isascii(C) && isalnum(C)) 89#define ISPRINT(C) (isascii(C) && isprint(C)) 90#define ISGRAPH(C) (isascii(C) && isgraph(C)) 91#define ISCNTRL(C) (isascii(C) && iscntrl(C)) 92#endif 93 94/* ISASCIIDIGIT differs from ISDIGIT, as follows: 95 - Its arg may be any int or unsigned int; it need not be an unsigned char. 96 - It's guaranteed to evaluate its argument exactly once. 97 - It's typically faster. 98 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that 99 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless 100 it's important to use the locale's definition of `digit' even when the 101 host does not conform to Posix. */ 102#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) 103 104/* If we (don't) have I18N. */ 105/* glibc defines _ */ 106#ifndef _ 107# ifdef HAVE_LIBINTL_H 108# include <libintl.h> 109# ifndef _ 110# define _(Str) gettext (Str) 111# endif 112# else 113# define _(Str) (Str) 114# endif 115#endif 116 117#include "regex.h" 118#include "dfa.h" 119#include "hard-locale.h" 120 121/* HPUX, define those as macros in sys/param.h */ 122#ifdef setbit 123# undef setbit 124#endif 125#ifdef clrbit 126# undef clrbit 127#endif 128 129static void dfamust PARAMS ((struct dfa *dfa)); 130static void regexp PARAMS ((int toplevel)); 131 132static ptr_t 133xcalloc (size_t n, size_t s) 134{ 135 ptr_t r = calloc(n, s); 136 137 if (!r) 138 dfaerror(_("Memory exhausted")); 139 return r; 140} 141 142static ptr_t 143xmalloc (size_t n) 144{ 145 ptr_t r = malloc(n); 146 147 assert(n != 0); 148 if (!r) 149 dfaerror(_("Memory exhausted")); 150 return r; 151} 152 153static ptr_t 154xrealloc (ptr_t p, size_t n) 155{ 156 ptr_t r = realloc(p, n); 157 158 assert(n != 0); 159 if (!r) 160 dfaerror(_("Memory exhausted")); 161 return r; 162} 163 164#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) 165#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) 166#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) 167 168/* Reallocate an array of type t if nalloc is too small for index. */ 169#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ 170 if ((index) >= (nalloc)) \ 171 { \ 172 do \ 173 (nalloc) *= 2; \ 174 while ((index) >= (nalloc)); \ 175 REALLOC(p, t, nalloc); \ 176 } 177 178#ifdef DEBUG 179 180static void 181prtok (token t) 182{ 183 char const *s; 184 185 if (t < 0) 186 fprintf(stderr, "END"); 187 else if (t < NOTCHAR) 188 fprintf(stderr, "%c", t); 189 else 190 { 191 switch (t) 192 { 193 case EMPTY: s = "EMPTY"; break; 194 case BACKREF: s = "BACKREF"; break; 195 case BEGLINE: s = "BEGLINE"; break; 196 case ENDLINE: s = "ENDLINE"; break; 197 case BEGWORD: s = "BEGWORD"; break; 198 case ENDWORD: s = "ENDWORD"; break; 199 case LIMWORD: s = "LIMWORD"; break; 200 case NOTLIMWORD: s = "NOTLIMWORD"; break; 201 case QMARK: s = "QMARK"; break; 202 case STAR: s = "STAR"; break; 203 case PLUS: s = "PLUS"; break; 204 case CAT: s = "CAT"; break; 205 case OR: s = "OR"; break; 206 case ORTOP: s = "ORTOP"; break; 207 case LPAREN: s = "LPAREN"; break; 208 case RPAREN: s = "RPAREN"; break; 209 case CRANGE: s = "CRANGE"; break; 210#ifdef MBS_SUPPORT 211 case ANYCHAR: s = "ANYCHAR"; break; 212 case MBCSET: s = "MBCSET"; break; 213#endif /* MBS_SUPPORT */ 214 default: s = "CSET"; break; 215 } 216 fprintf(stderr, "%s", s); 217 } 218} 219#endif /* DEBUG */ 220 221/* Stuff pertaining to charclasses. */ 222 223static int 224tstbit (unsigned b, charclass c) 225{ 226 return c[b / INTBITS] & 1 << b % INTBITS; 227} 228 229static void 230setbit (unsigned b, charclass c) 231{ 232 c[b / INTBITS] |= 1 << b % INTBITS; 233} 234 235static void 236clrbit (unsigned b, charclass c) 237{ 238 c[b / INTBITS] &= ~(1 << b % INTBITS); 239} 240 241static void 242copyset (charclass src, charclass dst) 243{ 244 memcpy (dst, src, sizeof (charclass)); 245} 246 247static void 248zeroset (charclass s) 249{ 250 memset (s, 0, sizeof (charclass)); 251} 252 253static void 254notset (charclass s) 255{ 256 int i; 257 258 for (i = 0; i < CHARCLASS_INTS; ++i) 259 s[i] = ~s[i]; 260} 261 262static int 263equal (charclass s1, charclass s2) 264{ 265 return memcmp (s1, s2, sizeof (charclass)) == 0; 266} 267 268/* A pointer to the current dfa is kept here during parsing. */ 269static struct dfa *dfa; 270 271/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ 272static int 273charclass_index (charclass s) 274{ 275 int i; 276 277 for (i = 0; i < dfa->cindex; ++i) 278 if (equal(s, dfa->charclasses[i])) 279 return i; 280 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); 281 ++dfa->cindex; 282 copyset(s, dfa->charclasses[i]); 283 return i; 284} 285 286/* Syntax bits controlling the behavior of the lexical analyzer. */ 287static reg_syntax_t syntax_bits, syntax_bits_set; 288 289/* Flag for case-folding letters into sets. */ 290static int case_fold; 291 292/* End-of-line byte in data. */ 293static unsigned char eolbyte; 294 295/* Entry point to set syntax options. */ 296void 297dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) 298{ 299 syntax_bits_set = 1; 300 syntax_bits = bits; 301 case_fold = fold; 302 eolbyte = eol; 303} 304 305/* Like setbit, but if case is folded, set both cases of a letter. */ 306static void 307setbit_case_fold (unsigned b, charclass c) 308{ 309 setbit (b, c); 310 if (case_fold) 311 { 312 if (ISUPPER (b)) 313 setbit (tolower (b), c); 314 else if (ISLOWER (b)) 315 setbit (toupper (b), c); 316 } 317} 318 319/* Lexical analyzer. All the dross that deals with the obnoxious 320 GNU Regex syntax bits is located here. The poor, suffering 321 reader is referred to the GNU Regex documentation for the 322 meaning of the @#%!@#%^!@ syntax bits. */ 323 324static char const *lexstart; /* Pointer to beginning of input string. */ 325static char const *lexptr; /* Pointer to next input character. */ 326static int lexleft; /* Number of characters remaining. */ 327static token lasttok; /* Previous token returned; initially END. */ 328static int laststart; /* True if we're separated from beginning or (, | 329 only by zero-width characters. */ 330static int parens; /* Count of outstanding left parens. */ 331static int minrep, maxrep; /* Repeat counts for {m,n}. */ 332static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ 333 334#ifdef MBS_SUPPORT 335/* These variables are used only if (MB_CUR_MAX > 1). */ 336static mbstate_t mbs; /* Mbstate for mbrlen(). */ 337static ssize_t cur_mb_len; /* Byte length of the current scanning 338 multibyte character. Must also handle 339 negative result from mbrlen(). */ 340static ssize_t cur_mb_index; /* Byte index of the current scanning multibyte 341 character. 342 343 singlebyte character : cur_mb_index = 0 344 multibyte character 345 1st byte : cur_mb_index = 1 346 2nd byte : cur_mb_index = 2 347 ... 348 nth byte : cur_mb_index = n */ 349static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). 350 Each element store the amount of remain 351 byte of corresponding multibyte character 352 in the input string. A element's value 353 is 0 if corresponding character is a 354 singlebyte chracter. 355 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> 356 mblen_buf : 0, 3, 2, 1 357 */ 358static wchar_t *inputwcs; /* Wide character representation of input 359 string in dfaexec(). 360 The length of this array is same as 361 the length of input string(char array). 362 inputstring[i] is a single-byte char, 363 or 1st byte of a multibyte char. 364 And inputwcs[i] is the codepoint. */ 365static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ 366static unsigned char const *buf_end; /* refference to end in dfaexec(). */ 367#endif /* MBS_SUPPORT */ 368 369#ifdef MBS_SUPPORT 370/* This function update cur_mb_len, and cur_mb_index. 371 p points current lexptr, len is the remaining buffer length. */ 372static void 373update_mb_len_index (unsigned char const *p, size_t len) 374{ 375 /* If last character is a part of a multibyte character, 376 we update cur_mb_index. */ 377 if (cur_mb_index) 378 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 379 : cur_mb_index + 1; 380 381 /* If last character is a single byte character, or the 382 last portion of a multibyte character, we check whether 383 next character is a multibyte character or not. */ 384 if (! cur_mb_index) 385 { 386 cur_mb_len = mbrlen(p, len, &mbs); 387 if (cur_mb_len > 1) 388 /* It is a multibyte character. 389 cur_mb_len was already set by mbrlen(). */ 390 cur_mb_index = 1; 391 else if (cur_mb_len < 1) 392 /* Invalid sequence. We treat it as a singlebyte character. 393 cur_mb_index is aleady 0. */ 394 cur_mb_len = 1; 395 /* Otherwise, cur_mb_len == 1, it is a singlebyte character. 396 cur_mb_index is aleady 0. */ 397 } 398} 399#endif /* MBS_SUPPORT */ 400 401#ifdef MBS_SUPPORT 402/* Note that characters become unsigned here. */ 403# define FETCH(c, eoferr) \ 404 { \ 405 if (! lexleft) \ 406 { \ 407 if (eoferr != 0) \ 408 dfaerror (eoferr); \ 409 else \ 410 return lasttok = END; \ 411 } \ 412 if (MB_CUR_MAX > 1) \ 413 update_mb_len_index(lexptr, lexleft); \ 414 (c) = (unsigned char) *lexptr++; \ 415 --lexleft; \ 416 } 417 418/* This function fetch a wide character, and update cur_mb_len, 419 used only if the current locale is a multibyte environment. */ 420static wint_t 421fetch_wc (char const *eoferr) 422{ 423 wchar_t wc; 424 if (! lexleft) 425 { 426 if (eoferr != 0) 427 dfaerror (eoferr); 428 else 429 return WEOF; 430 } 431 432 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); 433 if (cur_mb_len <= 0) 434 { 435 cur_mb_len = 1; 436 wc = *lexptr; 437 } 438 lexptr += cur_mb_len; 439 lexleft -= cur_mb_len; 440 return wc; 441} 442#else 443/* Note that characters become unsigned here. */ 444# define FETCH(c, eoferr) \ 445 { \ 446 if (! lexleft) \ 447 { \ 448 if (eoferr != 0) \ 449 dfaerror (eoferr); \ 450 else \ 451 return lasttok = END; \ 452 } \ 453 (c) = (unsigned char) *lexptr++; \ 454 --lexleft; \ 455 } 456#endif /* MBS_SUPPORT */ 457 458#ifdef MBS_SUPPORT 459/* Multibyte character handling sub-routin for lex. 460 This function parse a bracket expression and build a struct 461 mb_char_classes. */ 462static void 463parse_bracket_exp_mb () 464{ 465 wint_t wc, wc1, wc2; 466 467 /* Work area to build a mb_char_classes. */ 468 struct mb_char_classes *work_mbc; 469 int chars_al, range_sts_al, range_ends_al, ch_classes_al, 470 equivs_al, coll_elems_al; 471 472 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, 473 dfa->mbcsets_alloc, dfa->nmbcsets + 1); 474 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. 475 We will update dfa->multibyte_prop in addtok(), because we can't 476 decide the index in dfa->tokens[]. */ 477 478 /* Initialize work are */ 479 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); 480 481 chars_al = 1; 482 range_sts_al = range_ends_al = 0; 483 ch_classes_al = equivs_al = coll_elems_al = 0; 484 MALLOC(work_mbc->chars, wchar_t, chars_al); 485 486 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; 487 work_mbc->nequivs = work_mbc->ncoll_elems = 0; 488 work_mbc->chars = work_mbc->ch_classes = NULL; 489 work_mbc->range_sts = work_mbc->range_ends = NULL; 490 work_mbc->equivs = work_mbc->coll_elems = NULL; 491 492 wc = fetch_wc(_("Unbalanced [")); 493 if (wc == L'^') 494 { 495 wc = fetch_wc(_("Unbalanced [")); 496 work_mbc->invert = 1; 497 } 498 else 499 work_mbc->invert = 0; 500 do 501 { 502 wc1 = WEOF; /* mark wc1 is not initialized". */ 503 504 /* Note that if we're looking at some other [:...:] construct, 505 we just treat it as a bunch of ordinary characters. We can do 506 this because we assume regex has checked for syntax errors before 507 dfa is ever called. */ 508 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) 509 { 510#define BRACKET_BUFFER_SIZE 128 511 char str[BRACKET_BUFFER_SIZE]; 512 wc1 = wc; 513 wc = fetch_wc(_("Unbalanced [")); 514 515 /* If pattern contains `[[:', `[[.', or `[[='. */ 516 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) 517 { 518 unsigned char c; 519 unsigned char delim = (unsigned char)wc; 520 int len = 0; 521 for (;;) 522 { 523 if (! lexleft) 524 dfaerror (_("Unbalanced [")); 525 c = (unsigned char) *lexptr++; 526 --lexleft; 527 528 if ((c == delim && *lexptr == ']') || lexleft == 0) 529 break; 530 if (len < BRACKET_BUFFER_SIZE) 531 str[len++] = c; 532 else 533 /* This is in any case an invalid class name. */ 534 str[0] = '\0'; 535 } 536 str[len] = '\0'; 537 538 if (lexleft == 0) 539 { 540 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 541 work_mbc->nchars + 2); 542 work_mbc->chars[work_mbc->nchars++] = L'['; 543 work_mbc->chars[work_mbc->nchars++] = delim; 544 break; 545 } 546 547 if (--lexleft, *lexptr++ != ']') 548 dfaerror (_("Unbalanced [")); 549 if (delim == ':') 550 /* build character class. */ 551 { 552 wctype_t wt; 553 /* Query the character class as wctype_t. */ 554 wt = wctype (str); 555 556 if (ch_classes_al == 0) 557 MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); 558 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, 559 ch_classes_al, 560 work_mbc->nch_classes + 1); 561 work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 562 563 } 564 else if (delim == '=' || delim == '.') 565 { 566 char *elem; 567 MALLOC(elem, char, len + 1); 568 strncpy(elem, str, len + 1); 569 570 if (delim == '=') 571 /* build equivalent class. */ 572 { 573 if (equivs_al == 0) 574 MALLOC(work_mbc->equivs, char*, ++equivs_al); 575 REALLOC_IF_NECESSARY(work_mbc->equivs, char*, 576 equivs_al, 577 work_mbc->nequivs + 1); 578 work_mbc->equivs[work_mbc->nequivs++] = elem; 579 } 580 581 if (delim == '.') 582 /* build collating element. */ 583 { 584 if (coll_elems_al == 0) 585 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); 586 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, 587 coll_elems_al, 588 work_mbc->ncoll_elems + 1); 589 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; 590 } 591 } 592 wc1 = wc = WEOF; 593 } 594 else 595 /* We treat '[' as a normal character here. */ 596 { 597 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ 598 } 599 } 600 else 601 { 602 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 603 wc = fetch_wc(("Unbalanced [")); 604 } 605 606 if (wc1 == WEOF) 607 wc1 = fetch_wc(_("Unbalanced [")); 608 609 if (wc1 == L'-') 610 /* build range characters. */ 611 { 612 wc2 = fetch_wc(_("Unbalanced [")); 613 if (wc2 == L']') 614 { 615 /* In the case [x-], the - is an ordinary hyphen, 616 which is left in c1, the lookahead character. */ 617 lexptr -= cur_mb_len; 618 lexleft += cur_mb_len; 619 wc2 = wc; 620 } 621 else 622 { 623 if (wc2 == L'\\' 624 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 625 wc2 = fetch_wc(_("Unbalanced [")); 626 wc1 = fetch_wc(_("Unbalanced [")); 627 } 628 629 if (range_sts_al == 0) 630 { 631 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); 632 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); 633 } 634 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, 635 range_sts_al, work_mbc->nranges + 1); 636 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; 637 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, 638 range_ends_al, work_mbc->nranges + 1); 639 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; 640 } 641 else if (wc != WEOF) 642 /* build normal characters. */ 643 { 644 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, 645 work_mbc->nchars + 1); 646 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; 647 } 648 } 649 while ((wc = wc1) != L']'); 650} 651#endif /* MBS_SUPPORT */ 652 653#ifdef __STDC__ 654#define FUNC(F, P) static int F(int c) { return P(c); } 655#else 656#define FUNC(F, P) static int F(c) int c; { return P(c); } 657#endif 658 659FUNC(is_alpha, ISALPHA) 660FUNC(is_upper, ISUPPER) 661FUNC(is_lower, ISLOWER) 662FUNC(is_digit, ISDIGIT) 663FUNC(is_xdigit, ISXDIGIT) 664FUNC(is_space, ISSPACE) 665FUNC(is_punct, ISPUNCT) 666FUNC(is_alnum, ISALNUM) 667FUNC(is_print, ISPRINT) 668FUNC(is_graph, ISGRAPH) 669FUNC(is_cntrl, ISCNTRL) 670 671static int 672is_blank (int c) 673{ 674 return (c == ' ' || c == '\t'); 675} 676 677/* The following list maps the names of the Posix named character classes 678 to predicate functions that determine whether a given character is in 679 the class. The leading [ has already been eaten by the lexical analyzer. */ 680static struct { 681 const char *name; 682 int (*pred) PARAMS ((int)); 683} const prednames[] = { 684 { ":alpha:]", is_alpha }, 685 { ":upper:]", is_upper }, 686 { ":lower:]", is_lower }, 687 { ":digit:]", is_digit }, 688 { ":xdigit:]", is_xdigit }, 689 { ":space:]", is_space }, 690 { ":punct:]", is_punct }, 691 { ":alnum:]", is_alnum }, 692 { ":print:]", is_print }, 693 { ":graph:]", is_graph }, 694 { ":cntrl:]", is_cntrl }, 695 { ":blank:]", is_blank }, 696 { 0 } 697}; 698 699/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ 700#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') 701 702static int 703looking_at (char const *s) 704{ 705 size_t len; 706 707 len = strlen(s); 708 if (lexleft < len) 709 return 0; 710 return strncmp(s, lexptr, len) == 0; 711} 712 713static token 714lex (void) 715{ 716 unsigned c, c1, c2; 717 int backslash = 0, invert; 718 charclass ccl; 719 int i; 720 721 /* Basic plan: We fetch a character. If it's a backslash, 722 we set the backslash flag and go through the loop again. 723 On the plus side, this avoids having a duplicate of the 724 main switch inside the backslash case. On the minus side, 725 it means that just about every case begins with 726 "if (backslash) ...". */ 727 for (i = 0; i < 2; ++i) 728 { 729 FETCH(c, 0); 730#ifdef MBS_SUPPORT 731 if (MB_CUR_MAX > 1 && cur_mb_index) 732 /* If this is a part of a multi-byte character, we must treat 733 this byte data as a normal character. 734 e.g. In case of SJIS encoding, some character contains '\', 735 but they must not be backslash. */ 736 goto normal_char; 737#endif /* MBS_SUPPORT */ 738 switch (c) 739 { 740 case '\\': 741 if (backslash) 742 goto normal_char; 743 if (lexleft == 0) 744 dfaerror(_("Unfinished \\ escape")); 745 backslash = 1; 746 break; 747 748 case '^': 749 if (backslash) 750 goto normal_char; 751 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 752 || lasttok == END 753 || lasttok == LPAREN 754 || lasttok == OR) 755 return lasttok = BEGLINE; 756 goto normal_char; 757 758 case '$': 759 if (backslash) 760 goto normal_char; 761 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS 762 || lexleft == 0 763 || (syntax_bits & RE_NO_BK_PARENS 764 ? lexleft > 0 && *lexptr == ')' 765 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') 766 || (syntax_bits & RE_NO_BK_VBAR 767 ? lexleft > 0 && *lexptr == '|' 768 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') 769 || ((syntax_bits & RE_NEWLINE_ALT) 770 && lexleft > 0 && *lexptr == '\n')) 771 return lasttok = ENDLINE; 772 goto normal_char; 773 774 case '1': 775 case '2': 776 case '3': 777 case '4': 778 case '5': 779 case '6': 780 case '7': 781 case '8': 782 case '9': 783 if (backslash && !(syntax_bits & RE_NO_BK_REFS)) 784 { 785 laststart = 0; 786 return lasttok = BACKREF; 787 } 788 goto normal_char; 789 790 case '`': 791 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 792 return lasttok = BEGLINE; /* FIXME: should be beginning of string */ 793 goto normal_char; 794 795 case '\'': 796 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 797 return lasttok = ENDLINE; /* FIXME: should be end of string */ 798 goto normal_char; 799 800 case '<': 801 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 802 return lasttok = BEGWORD; 803 goto normal_char; 804 805 case '>': 806 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 807 return lasttok = ENDWORD; 808 goto normal_char; 809 810 case 'b': 811 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 812 return lasttok = LIMWORD; 813 goto normal_char; 814 815 case 'B': 816 if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) 817 return lasttok = NOTLIMWORD; 818 goto normal_char; 819 820 case '?': 821 if (syntax_bits & RE_LIMITED_OPS) 822 goto normal_char; 823 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 824 goto normal_char; 825 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 826 goto normal_char; 827 return lasttok = QMARK; 828 829 case '*': 830 if (backslash) 831 goto normal_char; 832 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 833 goto normal_char; 834 return lasttok = STAR; 835 836 case '+': 837 if (syntax_bits & RE_LIMITED_OPS) 838 goto normal_char; 839 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) 840 goto normal_char; 841 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 842 goto normal_char; 843 return lasttok = PLUS; 844 845 case '{': 846 if (!(syntax_bits & RE_INTERVALS)) 847 goto normal_char; 848 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) 849 goto normal_char; 850 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) 851 goto normal_char; 852 853 if (syntax_bits & RE_NO_BK_BRACES) 854 { 855 /* Scan ahead for a valid interval; if it's not valid, 856 treat it as a literal '{'. */ 857 int lo = -1, hi = -1; 858 char const *p = lexptr; 859 char const *lim = p + lexleft; 860 for (; p != lim && ISASCIIDIGIT (*p); p++) 861 lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; 862 if (p != lim && *p == ',') 863 while (++p != lim && ISASCIIDIGIT (*p)) 864 hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; 865 else 866 hi = lo; 867 if (p == lim || *p != '}' 868 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) 869 goto normal_char; 870 } 871 872 minrep = 0; 873 /* Cases: 874 {M} - exact count 875 {M,} - minimum count, maximum is infinity 876 {M,N} - M through N */ 877 FETCH(c, _("unfinished repeat count")); 878 if (ISASCIIDIGIT (c)) 879 { 880 minrep = c - '0'; 881 for (;;) 882 { 883 FETCH(c, _("unfinished repeat count")); 884 if (! ISASCIIDIGIT (c)) 885 break; 886 minrep = 10 * minrep + c - '0'; 887 } 888 } 889 else 890 dfaerror(_("malformed repeat count")); 891 if (c == ',') 892 { 893 FETCH (c, _("unfinished repeat count")); 894 if (! ISASCIIDIGIT (c)) 895 maxrep = -1; 896 else 897 { 898 maxrep = c - '0'; 899 for (;;) 900 { 901 FETCH (c, _("unfinished repeat count")); 902 if (! ISASCIIDIGIT (c)) 903 break; 904 maxrep = 10 * maxrep + c - '0'; 905 } 906 if (0 <= maxrep && maxrep < minrep) 907 dfaerror (_("malformed repeat count")); 908 } 909 } 910 else 911 maxrep = minrep; 912 if (!(syntax_bits & RE_NO_BK_BRACES)) 913 { 914 if (c != '\\') 915 dfaerror(_("malformed repeat count")); 916 FETCH(c, _("unfinished repeat count")); 917 } 918 if (c != '}') 919 dfaerror(_("malformed repeat count")); 920 laststart = 0; 921 return lasttok = REPMN; 922 923 case '|': 924 if (syntax_bits & RE_LIMITED_OPS) 925 goto normal_char; 926 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) 927 goto normal_char; 928 laststart = 1; 929 return lasttok = OR; 930 931 case '\n': 932 if (syntax_bits & RE_LIMITED_OPS 933 || backslash 934 || !(syntax_bits & RE_NEWLINE_ALT)) 935 goto normal_char; 936 laststart = 1; 937 return lasttok = OR; 938 939 case '(': 940 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 941 goto normal_char; 942 ++parens; 943 laststart = 1; 944 return lasttok = LPAREN; 945 946 case ')': 947 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) 948 goto normal_char; 949 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 950 goto normal_char; 951 --parens; 952 laststart = 0; 953 return lasttok = RPAREN; 954 955 case '.': 956 if (backslash) 957 goto normal_char; 958#ifdef MBS_SUPPORT 959 if (MB_CUR_MAX > 1) 960 { 961 /* In multibyte environment period must match with a single 962 character not a byte. So we use ANYCHAR. */ 963 laststart = 0; 964 return lasttok = ANYCHAR; 965 } 966#endif /* MBS_SUPPORT */ 967 zeroset(ccl); 968 notset(ccl); 969 if (!(syntax_bits & RE_DOT_NEWLINE)) 970 clrbit(eolbyte, ccl); 971 if (syntax_bits & RE_DOT_NOT_NULL) 972 clrbit('\0', ccl); 973 laststart = 0; 974 return lasttok = CSET + charclass_index(ccl); 975 976 case 'w': 977 case 'W': 978 if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) 979 goto normal_char; 980 zeroset(ccl); 981 for (c2 = 0; c2 < NOTCHAR; ++c2) 982 if (IS_WORD_CONSTITUENT(c2)) 983 setbit(c2, ccl); 984 if (c == 'W') 985 notset(ccl); 986 laststart = 0; 987 return lasttok = CSET + charclass_index(ccl); 988 989 case '[': 990 if (backslash) 991 goto normal_char; 992 laststart = 0; 993#ifdef MBS_SUPPORT 994 if (MB_CUR_MAX > 1) 995 { 996 /* In multibyte environment a bracket expression may contain 997 multibyte characters, which must be treated as characters 998 (not bytes). So we parse it by parse_bracket_exp_mb(). */ 999 parse_bracket_exp_mb(); 1000 return lasttok = MBCSET; 1001 } 1002#endif 1003 zeroset(ccl); 1004 FETCH(c, _("Unbalanced [")); 1005 if (c == '^') 1006 { 1007 FETCH(c, _("Unbalanced [")); 1008 invert = 1; 1009 } 1010 else 1011 invert = 0; 1012 do 1013 { 1014 /* Nobody ever said this had to be fast. :-) 1015 Note that if we're looking at some other [:...:] 1016 construct, we just treat it as a bunch of ordinary 1017 characters. We can do this because we assume 1018 regex has checked for syntax errors before 1019 dfa is ever called. */ 1020 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) 1021 for (c1 = 0; prednames[c1].name; ++c1) 1022 if (looking_at(prednames[c1].name)) 1023 { 1024 int (*pred) PARAMS ((int)) = prednames[c1].pred; 1025 1026 for (c2 = 0; c2 < NOTCHAR; ++c2) 1027 if ((*pred)(c2)) 1028 setbit_case_fold (c2, ccl); 1029 lexptr += strlen(prednames[c1].name); 1030 lexleft -= strlen(prednames[c1].name); 1031 FETCH(c1, _("Unbalanced [")); 1032 goto skip; 1033 } 1034 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1035 FETCH(c, _("Unbalanced [")); 1036 FETCH(c1, _("Unbalanced [")); 1037 if (c1 == '-') 1038 { 1039 FETCH(c2, _("Unbalanced [")); 1040 if (c2 == ']') 1041 { 1042 /* In the case [x-], the - is an ordinary hyphen, 1043 which is left in c1, the lookahead character. */ 1044 --lexptr; 1045 ++lexleft; 1046 } 1047 else 1048 { 1049 if (c2 == '\\' 1050 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1051 FETCH(c2, _("Unbalanced [")); 1052 FETCH(c1, _("Unbalanced [")); 1053 if (!hard_LC_COLLATE) { 1054 for (; c <= c2; c++) 1055 setbit_case_fold (c, ccl); 1056 } else { 1057 /* POSIX locales are painful - leave the decision to libc */ 1058 char expr[6] = { '[', c, '-', c2, ']', '\0' }; 1059 regex_t re; 1060 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { 1061 for (c = 0; c < NOTCHAR; ++c) { 1062 char buf[2] = { c, '\0' }; 1063 regmatch_t mat; 1064 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR 1065 && mat.rm_so == 0 && mat.rm_eo == 1) 1066 setbit_case_fold (c, ccl); 1067 } 1068 regfree (&re); 1069 } 1070 } 1071 continue; 1072 } 1073 } 1074 1075 setbit_case_fold (c, ccl); 1076 1077 skip: 1078 ; 1079 } 1080 while ((c = c1) != ']'); 1081 if (invert) 1082 { 1083 notset(ccl); 1084 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1085 clrbit(eolbyte, ccl); 1086 } 1087 return lasttok = CSET + charclass_index(ccl); 1088 1089 default: 1090 normal_char: 1091 laststart = 0; 1092 if (case_fold && ISALPHA(c)) 1093 { 1094 zeroset(ccl); 1095 setbit_case_fold (c, ccl); 1096 return lasttok = CSET + charclass_index(ccl); 1097 } 1098 return c; 1099 } 1100 } 1101 1102 /* The above loop should consume at most a backslash 1103 and some other character. */ 1104 abort(); 1105 return END; /* keeps pedantic compilers happy. */ 1106} 1107 1108/* Recursive descent parser for regular expressions. */ 1109 1110static token tok; /* Lookahead token. */ 1111static int depth; /* Current depth of a hypothetical stack 1112 holding deferred productions. This is 1113 used to determine the depth that will be 1114 required of the real stack later on in 1115 dfaanalyze(). */ 1116 1117/* Add the given token to the parse tree, maintaining the depth count and 1118 updating the maximum depth if necessary. */ 1119static void 1120addtok (token t) 1121{ 1122#ifdef MBS_SUPPORT 1123 if (MB_CUR_MAX > 1) 1124 { 1125 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, 1126 dfa->tindex); 1127 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ 1128 if (t == MBCSET) 1129 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; 1130 else if (t < NOTCHAR) 1131 dfa->multibyte_prop[dfa->tindex] 1132 = (cur_mb_len == 1)? 3 /* single-byte char */ 1133 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ 1134 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ 1135 else 1136 /* It may be unnecesssary, but it is safer to treat other 1137 symbols as singlebyte characters. */ 1138 dfa->multibyte_prop[dfa->tindex] = 3; 1139 } 1140#endif 1141 1142 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); 1143 dfa->tokens[dfa->tindex++] = t; 1144 1145 switch (t) 1146 { 1147 case QMARK: 1148 case STAR: 1149 case PLUS: 1150 break; 1151 1152 case CAT: 1153 case OR: 1154 case ORTOP: 1155 --depth; 1156 break; 1157 1158 default: 1159 ++dfa->nleaves; 1160 case EMPTY: 1161 ++depth; 1162 break; 1163 } 1164 if (depth > dfa->depth) 1165 dfa->depth = depth; 1166} 1167 1168/* The grammar understood by the parser is as follows. 1169 1170 regexp: 1171 regexp OR branch 1172 branch 1173 1174 branch: 1175 branch closure 1176 closure 1177 1178 closure: 1179 closure QMARK 1180 closure STAR 1181 closure PLUS 1182 closure REPMN 1183 atom 1184 1185 atom: 1186 <normal character> 1187 <multibyte character> 1188 ANYCHAR 1189 MBCSET 1190 CSET 1191 BACKREF 1192 BEGLINE 1193 ENDLINE 1194 BEGWORD 1195 ENDWORD 1196 LIMWORD 1197 NOTLIMWORD 1198 CRANGE 1199 LPAREN regexp RPAREN 1200 <empty> 1201 1202 The parser builds a parse tree in postfix form in an array of tokens. */ 1203 1204static void 1205atom (void) 1206{ 1207 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF 1208 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD 1209#ifdef MBS_SUPPORT 1210 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ 1211#endif /* MBS_SUPPORT */ 1212 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) 1213 { 1214 addtok(tok); 1215 tok = lex(); 1216#ifdef MBS_SUPPORT 1217 /* We treat a multibyte character as a single atom, so that DFA 1218 can treat a multibyte character as a single expression. 1219 1220 e.g. We construct following tree from "<mb1><mb2>". 1221 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1222 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> 1223 */ 1224 if (MB_CUR_MAX > 1) 1225 { 1226 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) 1227 { 1228 addtok(tok); 1229 addtok(CAT); 1230 tok = lex(); 1231 } 1232 } 1233#endif /* MBS_SUPPORT */ 1234 } 1235 else if (tok == CRANGE) 1236 { 1237 /* A character range like "[a-z]" in a locale other than "C" or 1238 "POSIX". This range might any sequence of one or more 1239 characters. Unfortunately the POSIX locale primitives give 1240 us no practical way to find what character sequences might be 1241 matched. Treat this approximately like "(.\1)" -- i.e. match 1242 one character, and then punt to the full matcher. */ 1243 charclass ccl; 1244 zeroset (ccl); 1245 notset (ccl); 1246 addtok (CSET + charclass_index (ccl)); 1247 addtok (BACKREF); 1248 addtok (CAT); 1249 tok = lex (); 1250 } 1251 else if (tok == LPAREN) 1252 { 1253 tok = lex(); 1254 regexp(0); 1255 if (tok != RPAREN) 1256 dfaerror(_("Unbalanced (")); 1257 tok = lex(); 1258 } 1259 else 1260 addtok(EMPTY); 1261} 1262 1263/* Return the number of tokens in the given subexpression. */ 1264static int 1265nsubtoks (int tindex) 1266{ 1267 int ntoks1; 1268 1269 switch (dfa->tokens[tindex - 1]) 1270 { 1271 default: 1272 return 1; 1273 case QMARK: 1274 case STAR: 1275 case PLUS: 1276 return 1 + nsubtoks(tindex - 1); 1277 case CAT: 1278 case OR: 1279 case ORTOP: 1280 ntoks1 = nsubtoks(tindex - 1); 1281 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); 1282 } 1283} 1284 1285/* Copy the given subexpression to the top of the tree. */ 1286static void 1287copytoks (int tindex, int ntokens) 1288{ 1289 int i; 1290 1291 for (i = 0; i < ntokens; ++i) 1292 addtok(dfa->tokens[tindex + i]); 1293} 1294 1295static void 1296closure (void) 1297{ 1298 int tindex, ntokens, i; 1299 1300 atom(); 1301 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) 1302 if (tok == REPMN) 1303 { 1304 ntokens = nsubtoks(dfa->tindex); 1305 tindex = dfa->tindex - ntokens; 1306 if (maxrep < 0) 1307 addtok(PLUS); 1308 if (minrep == 0) 1309 addtok(QMARK); 1310 for (i = 1; i < minrep; ++i) 1311 { 1312 copytoks(tindex, ntokens); 1313 addtok(CAT); 1314 } 1315 for (; i < maxrep; ++i) 1316 { 1317 copytoks(tindex, ntokens); 1318 addtok(QMARK); 1319 addtok(CAT); 1320 } 1321 tok = lex(); 1322 } 1323 else 1324 { 1325 addtok(tok); 1326 tok = lex(); 1327 } 1328} 1329 1330static void 1331branch (void) 1332{ 1333 closure(); 1334 while (tok != RPAREN && tok != OR && tok >= 0) 1335 { 1336 closure(); 1337 addtok(CAT); 1338 } 1339} 1340 1341static void 1342regexp (int toplevel) 1343{ 1344 branch(); 1345 while (tok == OR) 1346 { 1347 tok = lex(); 1348 branch(); 1349 if (toplevel) 1350 addtok(ORTOP); 1351 else 1352 addtok(OR); 1353 } 1354} 1355 1356/* Main entry point for the parser. S is a string to be parsed, len is the 1357 length of the string, so s can include NUL characters. D is a pointer to 1358 the struct dfa to parse into. */ 1359void 1360dfaparse (char const *s, size_t len, struct dfa *d) 1361{ 1362 dfa = d; 1363 lexstart = lexptr = s; 1364 lexleft = len; 1365 lasttok = END; 1366 laststart = 1; 1367 parens = 0; 1368 hard_LC_COLLATE = hard_locale (LC_COLLATE); 1369#ifdef MBS_SUPPORT 1370 if (MB_CUR_MAX > 1) 1371 { 1372 cur_mb_index = 0; 1373 cur_mb_len = 0; 1374 memset(&mbs, 0, sizeof(mbstate_t)); 1375 } 1376#endif /* MBS_SUPPORT */ 1377 1378 if (! syntax_bits_set) 1379 dfaerror(_("No syntax specified")); 1380 1381 tok = lex(); 1382 depth = d->depth; 1383 1384 regexp(1); 1385 1386 if (tok != END) 1387 dfaerror(_("Unbalanced )")); 1388 1389 addtok(END - d->nregexps); 1390 addtok(CAT); 1391 1392 if (d->nregexps) 1393 addtok(ORTOP); 1394 1395 ++d->nregexps; 1396} 1397 1398/* Some primitives for operating on sets of positions. */ 1399 1400/* Copy one set to another; the destination must be large enough. */ 1401static void 1402copy (position_set const *src, position_set *dst) 1403{ 1404 int i; 1405 1406 for (i = 0; i < src->nelem; ++i) 1407 dst->elems[i] = src->elems[i]; 1408 dst->nelem = src->nelem; 1409} 1410 1411/* Insert a position in a set. Position sets are maintained in sorted 1412 order according to index. If position already exists in the set with 1413 the same index then their constraints are logically or'd together. 1414 S->elems must point to an array large enough to hold the resulting set. */ 1415static void 1416insert (position p, position_set *s) 1417{ 1418 int i; 1419 position t1, t2; 1420 1421 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) 1422 continue; 1423 if (i < s->nelem && p.index == s->elems[i].index) 1424 s->elems[i].constraint |= p.constraint; 1425 else 1426 { 1427 t1 = p; 1428 ++s->nelem; 1429 while (i < s->nelem) 1430 { 1431 t2 = s->elems[i]; 1432 s->elems[i++] = t1; 1433 t1 = t2; 1434 } 1435 } 1436} 1437 1438/* Merge two sets of positions into a third. The result is exactly as if 1439 the positions of both sets were inserted into an initially empty set. */ 1440static void 1441merge (position_set const *s1, position_set const *s2, position_set *m) 1442{ 1443 int i = 0, j = 0; 1444 1445 m->nelem = 0; 1446 while (i < s1->nelem && j < s2->nelem) 1447 if (s1->elems[i].index > s2->elems[j].index) 1448 m->elems[m->nelem++] = s1->elems[i++]; 1449 else if (s1->elems[i].index < s2->elems[j].index) 1450 m->elems[m->nelem++] = s2->elems[j++]; 1451 else 1452 { 1453 m->elems[m->nelem] = s1->elems[i++]; 1454 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; 1455 } 1456 while (i < s1->nelem) 1457 m->elems[m->nelem++] = s1->elems[i++]; 1458 while (j < s2->nelem) 1459 m->elems[m->nelem++] = s2->elems[j++]; 1460} 1461 1462/* Delete a position from a set. */ 1463static void 1464delete (position p, position_set *s) 1465{ 1466 int i; 1467 1468 for (i = 0; i < s->nelem; ++i) 1469 if (p.index == s->elems[i].index) 1470 break; 1471 if (i < s->nelem) 1472 for (--s->nelem; i < s->nelem; ++i) 1473 s->elems[i] = s->elems[i + 1]; 1474} 1475 1476/* Find the index of the state corresponding to the given position set with 1477 the given preceding context, or create a new state if there is no such 1478 state. Newline and letter tell whether we got here on a newline or 1479 letter, respectively. */ 1480static int 1481state_index (struct dfa *d, position_set const *s, int newline, int letter) 1482{ 1483 int hash = 0; 1484 int constraint; 1485 int i, j; 1486 1487 newline = newline ? 1 : 0; 1488 letter = letter ? 1 : 0; 1489 1490 for (i = 0; i < s->nelem; ++i) 1491 hash ^= s->elems[i].index + s->elems[i].constraint; 1492 1493 /* Try to find a state that exactly matches the proposed one. */ 1494 for (i = 0; i < d->sindex; ++i) 1495 { 1496 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 1497 || newline != d->states[i].newline || letter != d->states[i].letter) 1498 continue; 1499 for (j = 0; j < s->nelem; ++j) 1500 if (s->elems[j].constraint 1501 != d->states[i].elems.elems[j].constraint 1502 || s->elems[j].index != d->states[i].elems.elems[j].index) 1503 break; 1504 if (j == s->nelem) 1505 return i; 1506 } 1507 1508 /* We'll have to create a new state. */ 1509 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); 1510 d->states[i].hash = hash; 1511 MALLOC(d->states[i].elems.elems, position, s->nelem); 1512 copy(s, &d->states[i].elems); 1513 d->states[i].newline = newline; 1514 d->states[i].letter = letter; 1515 d->states[i].backref = 0; 1516 d->states[i].constraint = 0; 1517 d->states[i].first_end = 0; 1518#ifdef MBS_SUPPORT 1519 if (MB_CUR_MAX > 1) 1520 d->states[i].mbps.nelem = 0; 1521#endif 1522 for (j = 0; j < s->nelem; ++j) 1523 if (d->tokens[s->elems[j].index] < 0) 1524 { 1525 constraint = s->elems[j].constraint; 1526 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) 1527 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) 1528 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) 1529 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) 1530 d->states[i].constraint |= constraint; 1531 if (! d->states[i].first_end) 1532 d->states[i].first_end = d->tokens[s->elems[j].index]; 1533 } 1534 else if (d->tokens[s->elems[j].index] == BACKREF) 1535 { 1536 d->states[i].constraint = NO_CONSTRAINT; 1537 d->states[i].backref = 1; 1538 } 1539 1540 ++d->sindex; 1541 1542 return i; 1543} 1544 1545/* Find the epsilon closure of a set of positions. If any position of the set 1546 contains a symbol that matches the empty string in some context, replace 1547 that position with the elements of its follow labeled with an appropriate 1548 constraint. Repeat exhaustively until no funny positions are left. 1549 S->elems must be large enough to hold the result. */ 1550static void 1551epsclosure (position_set *s, struct dfa const *d) 1552{ 1553 int i, j; 1554 int *visited; 1555 position p, old; 1556 1557 MALLOC(visited, int, d->tindex); 1558 for (i = 0; i < d->tindex; ++i) 1559 visited[i] = 0; 1560 1561 for (i = 0; i < s->nelem; ++i) 1562 if (d->tokens[s->elems[i].index] >= NOTCHAR 1563 && d->tokens[s->elems[i].index] != BACKREF 1564#ifdef MBS_SUPPORT 1565 && d->tokens[s->elems[i].index] != ANYCHAR 1566 && d->tokens[s->elems[i].index] != MBCSET 1567#endif 1568 && d->tokens[s->elems[i].index] < CSET) 1569 { 1570 old = s->elems[i]; 1571 p.constraint = old.constraint; 1572 delete(s->elems[i], s); 1573 if (visited[old.index]) 1574 { 1575 --i; 1576 continue; 1577 } 1578 visited[old.index] = 1; 1579 switch (d->tokens[old.index]) 1580 { 1581 case BEGLINE: 1582 p.constraint &= BEGLINE_CONSTRAINT; 1583 break; 1584 case ENDLINE: 1585 p.constraint &= ENDLINE_CONSTRAINT; 1586 break; 1587 case BEGWORD: 1588 p.constraint &= BEGWORD_CONSTRAINT; 1589 break; 1590 case ENDWORD: 1591 p.constraint &= ENDWORD_CONSTRAINT; 1592 break; 1593 case LIMWORD: 1594 p.constraint &= LIMWORD_CONSTRAINT; 1595 break; 1596 case NOTLIMWORD: 1597 p.constraint &= NOTLIMWORD_CONSTRAINT; 1598 break; 1599 default: 1600 break; 1601 } 1602 for (j = 0; j < d->follows[old.index].nelem; ++j) 1603 { 1604 p.index = d->follows[old.index].elems[j].index; 1605 insert(p, s); 1606 } 1607 /* Force rescan to start at the beginning. */ 1608 i = -1; 1609 } 1610 1611 free(visited); 1612} 1613 1614/* Perform bottom-up analysis on the parse tree, computing various functions. 1615 Note that at this point, we're pretending constructs like \< are real 1616 characters rather than constraints on what can follow them. 1617 1618 Nullable: A node is nullable if it is at the root of a regexp that can 1619 match the empty string. 1620 * EMPTY leaves are nullable. 1621 * No other leaf is nullable. 1622 * A QMARK or STAR node is nullable. 1623 * A PLUS node is nullable if its argument is nullable. 1624 * A CAT node is nullable if both its arguments are nullable. 1625 * An OR node is nullable if either argument is nullable. 1626 1627 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 1628 that could correspond to the first character of a string matching the 1629 regexp rooted at the given node. 1630 * EMPTY leaves have empty firstpos. 1631 * The firstpos of a nonempty leaf is that leaf itself. 1632 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 1633 argument. 1634 * The firstpos of a CAT node is the firstpos of the left argument, union 1635 the firstpos of the right if the left argument is nullable. 1636 * The firstpos of an OR node is the union of firstpos of each argument. 1637 1638 Lastpos: The lastpos of a node is the set of positions that could 1639 correspond to the last character of a string matching the regexp at 1640 the given node. 1641 * EMPTY leaves have empty lastpos. 1642 * The lastpos of a nonempty leaf is that leaf itself. 1643 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 1644 argument. 1645 * The lastpos of a CAT node is the lastpos of its right argument, union 1646 the lastpos of the left if the right argument is nullable. 1647 * The lastpos of an OR node is the union of the lastpos of each argument. 1648 1649 Follow: The follow of a position is the set of positions that could 1650 correspond to the character following a character matching the node in 1651 a string matching the regexp. At this point we consider special symbols 1652 that match the empty string in some context to be just normal characters. 1653 Later, if we find that a special symbol is in a follow set, we will 1654 replace it with the elements of its follow, labeled with an appropriate 1655 constraint. 1656 * Every node in the firstpos of the argument of a STAR or PLUS node is in 1657 the follow of every node in the lastpos. 1658 * Every node in the firstpos of the second argument of a CAT node is in 1659 the follow of every node in the lastpos of the first argument. 1660 1661 Because of the postfix representation of the parse tree, the depth-first 1662 analysis is conveniently done by a linear scan with the aid of a stack. 1663 Sets are stored as arrays of the elements, obeying a stack-like allocation 1664 scheme; the number of elements in each set deeper in the stack can be 1665 used to determine the address of a particular set's array. */ 1666void 1667dfaanalyze (struct dfa *d, int searchflag) 1668{ 1669 int *nullable; /* Nullable stack. */ 1670 int *nfirstpos; /* Element count stack for firstpos sets. */ 1671 position *firstpos; /* Array where firstpos elements are stored. */ 1672 int *nlastpos; /* Element count stack for lastpos sets. */ 1673 position *lastpos; /* Array where lastpos elements are stored. */ 1674 int *nalloc; /* Sizes of arrays allocated to follow sets. */ 1675 position_set tmp; /* Temporary set for merging sets. */ 1676 position_set merged; /* Result of merging sets. */ 1677 int wants_newline; /* True if some position wants newline info. */ 1678 int *o_nullable; 1679 int *o_nfirst, *o_nlast; 1680 position *o_firstpos, *o_lastpos; 1681 int i, j; 1682 position *pos; 1683 1684#ifdef DEBUG 1685 fprintf(stderr, "dfaanalyze:\n"); 1686 for (i = 0; i < d->tindex; ++i) 1687 { 1688 fprintf(stderr, " %d:", i); 1689 prtok(d->tokens[i]); 1690 } 1691 putc('\n', stderr); 1692#endif 1693 1694 d->searchflag = searchflag; 1695 1696 MALLOC(nullable, int, d->depth); 1697 o_nullable = nullable; 1698 MALLOC(nfirstpos, int, d->depth); 1699 o_nfirst = nfirstpos; 1700 MALLOC(firstpos, position, d->nleaves); 1701 o_firstpos = firstpos, firstpos += d->nleaves; 1702 MALLOC(nlastpos, int, d->depth); 1703 o_nlast = nlastpos; 1704 MALLOC(lastpos, position, d->nleaves); 1705 o_lastpos = lastpos, lastpos += d->nleaves; 1706 MALLOC(nalloc, int, d->tindex); 1707 for (i = 0; i < d->tindex; ++i) 1708 nalloc[i] = 0; 1709 MALLOC(merged.elems, position, d->nleaves); 1710 1711 CALLOC(d->follows, position_set, d->tindex); 1712 1713 for (i = 0; i < d->tindex; ++i) 1714#ifdef DEBUG 1715 { /* Nonsyntactic #ifdef goo... */ 1716#endif 1717 switch (d->tokens[i]) 1718 { 1719 case EMPTY: 1720 /* The empty set is nullable. */ 1721 *nullable++ = 1; 1722 1723 /* The firstpos and lastpos of the empty leaf are both empty. */ 1724 *nfirstpos++ = *nlastpos++ = 0; 1725 break; 1726 1727 case STAR: 1728 case PLUS: 1729 /* Every element in the firstpos of the argument is in the follow 1730 of every element in the lastpos. */ 1731 tmp.nelem = nfirstpos[-1]; 1732 tmp.elems = firstpos; 1733 pos = lastpos; 1734 for (j = 0; j < nlastpos[-1]; ++j) 1735 { 1736 merge(&tmp, &d->follows[pos[j].index], &merged); 1737 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1738 nalloc[pos[j].index], merged.nelem - 1); 1739 copy(&merged, &d->follows[pos[j].index]); 1740 } 1741 1742 case QMARK: 1743 /* A QMARK or STAR node is automatically nullable. */ 1744 if (d->tokens[i] != PLUS) 1745 nullable[-1] = 1; 1746 break; 1747 1748 case CAT: 1749 /* Every element in the firstpos of the second argument is in the 1750 follow of every element in the lastpos of the first argument. */ 1751 tmp.nelem = nfirstpos[-1]; 1752 tmp.elems = firstpos; 1753 pos = lastpos + nlastpos[-1]; 1754 for (j = 0; j < nlastpos[-2]; ++j) 1755 { 1756 merge(&tmp, &d->follows[pos[j].index], &merged); 1757 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, 1758 nalloc[pos[j].index], merged.nelem - 1); 1759 copy(&merged, &d->follows[pos[j].index]); 1760 } 1761 1762 /* The firstpos of a CAT node is the firstpos of the first argument, 1763 union that of the second argument if the first is nullable. */ 1764 if (nullable[-2]) 1765 nfirstpos[-2] += nfirstpos[-1]; 1766 else 1767 firstpos += nfirstpos[-1]; 1768 --nfirstpos; 1769 1770 /* The lastpos of a CAT node is the lastpos of the second argument, 1771 union that of the first argument if the second is nullable. */ 1772 if (nullable[-1]) 1773 nlastpos[-2] += nlastpos[-1]; 1774 else 1775 { 1776 pos = lastpos + nlastpos[-2]; 1777 for (j = nlastpos[-1] - 1; j >= 0; --j) 1778 pos[j] = lastpos[j]; 1779 lastpos += nlastpos[-2]; 1780 nlastpos[-2] = nlastpos[-1]; 1781 } 1782 --nlastpos; 1783 1784 /* A CAT node is nullable if both arguments are nullable. */ 1785 nullable[-2] = nullable[-1] && nullable[-2]; 1786 --nullable; 1787 break; 1788 1789 case OR: 1790 case ORTOP: 1791 /* The firstpos is the union of the firstpos of each argument. */ 1792 nfirstpos[-2] += nfirstpos[-1]; 1793 --nfirstpos; 1794 1795 /* The lastpos is the union of the lastpos of each argument. */ 1796 nlastpos[-2] += nlastpos[-1]; 1797 --nlastpos; 1798 1799 /* An OR node is nullable if either argument is nullable. */ 1800 nullable[-2] = nullable[-1] || nullable[-2]; 1801 --nullable; 1802 break; 1803 1804 default: 1805 /* Anything else is a nonempty position. (Note that special 1806 constructs like \< are treated as nonempty strings here; 1807 an "epsilon closure" effectively makes them nullable later. 1808 Backreferences have to get a real position so we can detect 1809 transitions on them later. But they are nullable. */ 1810 *nullable++ = d->tokens[i] == BACKREF; 1811 1812 /* This position is in its own firstpos and lastpos. */ 1813 *nfirstpos++ = *nlastpos++ = 1; 1814 --firstpos, --lastpos; 1815 firstpos->index = lastpos->index = i; 1816 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 1817 1818 /* Allocate the follow set for this position. */ 1819 nalloc[i] = 1; 1820 MALLOC(d->follows[i].elems, position, nalloc[i]); 1821 break; 1822 } 1823#ifdef DEBUG 1824 /* ... balance the above nonsyntactic #ifdef goo... */ 1825 fprintf(stderr, "node %d:", i); 1826 prtok(d->tokens[i]); 1827 putc('\n', stderr); 1828 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); 1829 fprintf(stderr, " firstpos:"); 1830 for (j = nfirstpos[-1] - 1; j >= 0; --j) 1831 { 1832 fprintf(stderr, " %d:", firstpos[j].index); 1833 prtok(d->tokens[firstpos[j].index]); 1834 } 1835 fprintf(stderr, "\n lastpos:"); 1836 for (j = nlastpos[-1] - 1; j >= 0; --j) 1837 { 1838 fprintf(stderr, " %d:", lastpos[j].index); 1839 prtok(d->tokens[lastpos[j].index]); 1840 } 1841 putc('\n', stderr); 1842 } 1843#endif 1844 1845 /* For each follow set that is the follow set of a real position, replace 1846 it with its epsilon closure. */ 1847 for (i = 0; i < d->tindex; ++i) 1848 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF 1849#ifdef MBS_SUPPORT 1850 || d->tokens[i] == ANYCHAR 1851 || d->tokens[i] == MBCSET 1852#endif 1853 || d->tokens[i] >= CSET) 1854 { 1855#ifdef DEBUG 1856 fprintf(stderr, "follows(%d:", i); 1857 prtok(d->tokens[i]); 1858 fprintf(stderr, "):"); 1859 for (j = d->follows[i].nelem - 1; j >= 0; --j) 1860 { 1861 fprintf(stderr, " %d:", d->follows[i].elems[j].index); 1862 prtok(d->tokens[d->follows[i].elems[j].index]); 1863 } 1864 putc('\n', stderr); 1865#endif 1866 copy(&d->follows[i], &merged); 1867 epsclosure(&merged, d); 1868 if (d->follows[i].nelem < merged.nelem) 1869 REALLOC(d->follows[i].elems, position, merged.nelem); 1870 copy(&merged, &d->follows[i]); 1871 } 1872 1873 /* Get the epsilon closure of the firstpos of the regexp. The result will 1874 be the set of positions of state 0. */ 1875 merged.nelem = 0; 1876 for (i = 0; i < nfirstpos[-1]; ++i) 1877 insert(firstpos[i], &merged); 1878 epsclosure(&merged, d); 1879 1880 /* Check if any of the positions of state 0 will want newline context. */ 1881 wants_newline = 0; 1882 for (i = 0; i < merged.nelem; ++i) 1883 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) 1884 wants_newline = 1; 1885 1886 /* Build the initial state. */ 1887 d->salloc = 1; 1888 d->sindex = 0; 1889 MALLOC(d->states, dfa_state, d->salloc); 1890 state_index(d, &merged, wants_newline, 0); 1891 1892 free(o_nullable); 1893 free(o_nfirst); 1894 free(o_firstpos); 1895 free(o_nlast); 1896 free(o_lastpos); 1897 free(nalloc); 1898 free(merged.elems); 1899} 1900 1901/* Find, for each character, the transition out of state s of d, and store 1902 it in the appropriate slot of trans. 1903 1904 We divide the positions of s into groups (positions can appear in more 1905 than one group). Each group is labeled with a set of characters that 1906 every position in the group matches (taking into account, if necessary, 1907 preceding context information of s). For each group, find the union 1908 of the its elements' follows. This set is the set of positions of the 1909 new state. For each character in the group's label, set the transition 1910 on this character to be to a state corresponding to the set's positions, 1911 and its associated backward context information, if necessary. 1912 1913 If we are building a searching matcher, we include the positions of state 1914 0 in every state. 1915 1916 The collection of groups is constructed by building an equivalence-class 1917 partition of the positions of s. 1918 1919 For each position, find the set of characters C that it matches. Eliminate 1920 any characters from C that fail on grounds of backward context. 1921 1922 Search through the groups, looking for a group whose label L has nonempty 1923 intersection with C. If L - C is nonempty, create a new group labeled 1924 L - C and having the same positions as the current group, and set L to 1925 the intersection of L and C. Insert the position in this group, set 1926 C = C - L, and resume scanning. 1927 1928 If after comparing with every group there are characters remaining in C, 1929 create a new group labeled with the characters of C and insert this 1930 position in that group. */ 1931void 1932dfastate (int s, struct dfa *d, int trans[]) 1933{ 1934 position_set grps[NOTCHAR]; /* As many as will ever be needed. */ 1935 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ 1936 int ngrps = 0; /* Number of groups actually used. */ 1937 position pos; /* Current position being considered. */ 1938 charclass matches; /* Set of matching characters. */ 1939 int matchesf; /* True if matches is nonempty. */ 1940 charclass intersect; /* Intersection with some label set. */ 1941 int intersectf; /* True if intersect is nonempty. */ 1942 charclass leftovers; /* Stuff in the label that didn't match. */ 1943 int leftoversf; /* True if leftovers is nonempty. */ 1944 static charclass letters; /* Set of characters considered letters. */ 1945 static charclass newline; /* Set of characters that aren't newline. */ 1946 position_set follows; /* Union of the follows of some group. */ 1947 position_set tmp; /* Temporary space for merging sets. */ 1948 int state; /* New state. */ 1949 int wants_newline; /* New state wants to know newline context. */ 1950 int state_newline; /* New state on a newline transition. */ 1951 int wants_letter; /* New state wants to know letter context. */ 1952 int state_letter; /* New state on a letter transition. */ 1953 static int initialized; /* Flag for static initialization. */ 1954#ifdef MBS_SUPPORT 1955 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ 1956#endif 1957 int i, j, k; 1958 1959 /* Initialize the set of letters, if necessary. */ 1960 if (! initialized) 1961 { 1962 initialized = 1; 1963 for (i = 0; i < NOTCHAR; ++i) 1964 if (IS_WORD_CONSTITUENT(i)) 1965 setbit(i, letters); 1966 setbit(eolbyte, newline); 1967 } 1968 1969 zeroset(matches); 1970 1971 for (i = 0; i < d->states[s].elems.nelem; ++i) 1972 { 1973 pos = d->states[s].elems.elems[i]; 1974 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 1975 setbit(d->tokens[pos.index], matches); 1976 else if (d->tokens[pos.index] >= CSET) 1977 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); 1978#ifdef MBS_SUPPORT 1979 else if (d->tokens[pos.index] == ANYCHAR 1980 || d->tokens[pos.index] == MBCSET) 1981 /* MB_CUR_MAX > 1 */ 1982 { 1983 /* ANYCHAR and MBCSET must match with a single character, so we 1984 must put it to d->states[s].mbps, which contains the positions 1985 which can match with a single character not a byte. */ 1986 if (d->states[s].mbps.nelem == 0) 1987 { 1988 MALLOC(d->states[s].mbps.elems, position, 1989 d->states[s].elems.nelem); 1990 } 1991 insert(pos, &(d->states[s].mbps)); 1992 continue; 1993 } 1994#endif /* MBS_SUPPORT */ 1995 else 1996 continue; 1997 1998 /* Some characters may need to be eliminated from matches because 1999 they fail in the current context. */ 2000 if (pos.constraint != 0xFF) 2001 { 2002 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2003 d->states[s].newline, 1)) 2004 clrbit(eolbyte, matches); 2005 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, 2006 d->states[s].newline, 0)) 2007 for (j = 0; j < CHARCLASS_INTS; ++j) 2008 matches[j] &= newline[j]; 2009 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2010 d->states[s].letter, 1)) 2011 for (j = 0; j < CHARCLASS_INTS; ++j) 2012 matches[j] &= ~letters[j]; 2013 if (! MATCHES_LETTER_CONTEXT(pos.constraint, 2014 d->states[s].letter, 0)) 2015 for (j = 0; j < CHARCLASS_INTS; ++j) 2016 matches[j] &= letters[j]; 2017 2018 /* If there are no characters left, there's no point in going on. */ 2019 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) 2020 continue; 2021 if (j == CHARCLASS_INTS) 2022 continue; 2023 } 2024 2025 for (j = 0; j < ngrps; ++j) 2026 { 2027 /* If matches contains a single character only, and the current 2028 group's label doesn't contain that character, go on to the 2029 next group. */ 2030 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR 2031 && !tstbit(d->tokens[pos.index], labels[j])) 2032 continue; 2033 2034 /* Check if this group's label has a nonempty intersection with 2035 matches. */ 2036 intersectf = 0; 2037 for (k = 0; k < CHARCLASS_INTS; ++k) 2038 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; 2039 if (! intersectf) 2040 continue; 2041 2042 /* It does; now find the set differences both ways. */ 2043 leftoversf = matchesf = 0; 2044 for (k = 0; k < CHARCLASS_INTS; ++k) 2045 { 2046 /* Even an optimizing compiler can't know this for sure. */ 2047 int match = matches[k], label = labels[j][k]; 2048 2049 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; 2050 (matches[k] = match & ~label) ? (matchesf = 1) : 0; 2051 } 2052 2053 /* If there were leftovers, create a new group labeled with them. */ 2054 if (leftoversf) 2055 { 2056 copyset(leftovers, labels[ngrps]); 2057 copyset(intersect, labels[j]); 2058 MALLOC(grps[ngrps].elems, position, d->nleaves); 2059 copy(&grps[j], &grps[ngrps]); 2060 ++ngrps; 2061 } 2062 2063 /* Put the position in the current group. Note that there is no 2064 reason to call insert() here. */ 2065 grps[j].elems[grps[j].nelem++] = pos; 2066 2067 /* If every character matching the current position has been 2068 accounted for, we're done. */ 2069 if (! matchesf) 2070 break; 2071 } 2072 2073 /* If we've passed the last group, and there are still characters 2074 unaccounted for, then we'll have to create a new group. */ 2075 if (j == ngrps) 2076 { 2077 copyset(matches, labels[ngrps]); 2078 zeroset(matches); 2079 MALLOC(grps[ngrps].elems, position, d->nleaves); 2080 grps[ngrps].nelem = 1; 2081 grps[ngrps].elems[0] = pos; 2082 ++ngrps; 2083 } 2084 } 2085 2086 MALLOC(follows.elems, position, d->nleaves); 2087 MALLOC(tmp.elems, position, d->nleaves); 2088 2089 /* If we are a searching matcher, the default transition is to a state 2090 containing the positions of state 0, otherwise the default transition 2091 is to fail miserably. */ 2092 if (d->searchflag) 2093 { 2094 wants_newline = 0; 2095 wants_letter = 0; 2096 for (i = 0; i < d->states[0].elems.nelem; ++i) 2097 { 2098 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2099 wants_newline = 1; 2100 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) 2101 wants_letter = 1; 2102 } 2103 copy(&d->states[0].elems, &follows); 2104 state = state_index(d, &follows, 0, 0); 2105 if (wants_newline) 2106 state_newline = state_index(d, &follows, 1, 0); 2107 else 2108 state_newline = state; 2109 if (wants_letter) 2110 state_letter = state_index(d, &follows, 0, 1); 2111 else 2112 state_letter = state; 2113 for (i = 0; i < NOTCHAR; ++i) 2114 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; 2115 trans[eolbyte] = state_newline; 2116 } 2117 else 2118 for (i = 0; i < NOTCHAR; ++i) 2119 trans[i] = -1; 2120 2121 for (i = 0; i < ngrps; ++i) 2122 { 2123 follows.nelem = 0; 2124 2125 /* Find the union of the follows of the positions of the group. 2126 This is a hideously inefficient loop. Fix it someday. */ 2127 for (j = 0; j < grps[i].nelem; ++j) 2128 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) 2129 insert(d->follows[grps[i].elems[j].index].elems[k], &follows); 2130 2131#ifdef MBS_SUPPORT 2132 if (MB_CUR_MAX > 1) 2133 { 2134 /* If a token in follows.elems is not 1st byte of a multibyte 2135 character, or the states of follows must accept the bytes 2136 which are not 1st byte of the multibyte character. 2137 Then, if a state of follows encounter a byte, it must not be 2138 a 1st byte of a multibyte character nor singlebyte character. 2139 We cansel to add state[0].follows to next state, because 2140 state[0] must accept 1st-byte 2141 2142 For example, we assume <sb a> is a certain singlebyte 2143 character, <mb A> is a certain multibyte character, and the 2144 codepoint of <sb a> equals the 2nd byte of the codepoint of 2145 <mb A>. 2146 When state[0] accepts <sb a>, state[i] transit to state[i+1] 2147 by accepting accepts 1st byte of <mb A>, and state[i+1] 2148 accepts 2nd byte of <mb A>, if state[i+1] encounter the 2149 codepoint of <sb a>, it must not be <sb a> but 2nd byte of 2150 <mb A>, so we can not add state[0]. */ 2151 2152 next_isnt_1st_byte = 0; 2153 for (j = 0; j < follows.nelem; ++j) 2154 { 2155 if (!(d->multibyte_prop[follows.elems[j].index] & 1)) 2156 { 2157 next_isnt_1st_byte = 1; 2158 break; 2159 } 2160 } 2161 } 2162#endif 2163 2164 /* If we are building a searching matcher, throw in the positions 2165 of state 0 as well. */ 2166#ifdef MBS_SUPPORT 2167 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) 2168#else 2169 if (d->searchflag) 2170#endif 2171 for (j = 0; j < d->states[0].elems.nelem; ++j) 2172 insert(d->states[0].elems.elems[j], &follows); 2173 2174 /* Find out if the new state will want any context information. */ 2175 wants_newline = 0; 2176 if (tstbit(eolbyte, labels[i])) 2177 for (j = 0; j < follows.nelem; ++j) 2178 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) 2179 wants_newline = 1; 2180 2181 wants_letter = 0; 2182 for (j = 0; j < CHARCLASS_INTS; ++j) 2183 if (labels[i][j] & letters[j]) 2184 break; 2185 if (j < CHARCLASS_INTS) 2186 for (j = 0; j < follows.nelem; ++j) 2187 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) 2188 wants_letter = 1; 2189 2190 /* Find the state(s) corresponding to the union of the follows. */ 2191 state = state_index(d, &follows, 0, 0); 2192 if (wants_newline) 2193 state_newline = state_index(d, &follows, 1, 0); 2194 else 2195 state_newline = state; 2196 if (wants_letter) 2197 state_letter = state_index(d, &follows, 0, 1); 2198 else 2199 state_letter = state; 2200 2201 /* Set the transitions for each character in the current label. */ 2202 for (j = 0; j < CHARCLASS_INTS; ++j) 2203 for (k = 0; k < INTBITS; ++k) 2204 if (labels[i][j] & 1 << k) 2205 { 2206 int c = j * INTBITS + k; 2207 2208 if (c == eolbyte) 2209 trans[c] = state_newline; 2210 else if (IS_WORD_CONSTITUENT(c)) 2211 trans[c] = state_letter; 2212 else if (c < NOTCHAR) 2213 trans[c] = state; 2214 } 2215 } 2216 2217 for (i = 0; i < ngrps; ++i) 2218 free(grps[i].elems); 2219 free(follows.elems); 2220 free(tmp.elems); 2221} 2222 2223/* Some routines for manipulating a compiled dfa's transition tables. 2224 Each state may or may not have a transition table; if it does, and it 2225 is a non-accepting state, then d->trans[state] points to its table. 2226 If it is an accepting state then d->fails[state] points to its table. 2227 If it has no table at all, then d->trans[state] is NULL. 2228 TODO: Improve this comment, get rid of the unnecessary redundancy. */ 2229 2230static void 2231build_state (int s, struct dfa *d) 2232{ 2233 int *trans; /* The new transition table. */ 2234 int i; 2235 2236 /* Set an upper limit on the number of transition tables that will ever 2237 exist at once. 1024 is arbitrary. The idea is that the frequently 2238 used transition tables will be quickly rebuilt, whereas the ones that 2239 were only needed once or twice will be cleared away. */ 2240 if (d->trcount >= 1024) 2241 { 2242 for (i = 0; i < d->tralloc; ++i) 2243 if (d->trans[i]) 2244 { 2245 free((ptr_t) d->trans[i]); 2246 d->trans[i] = NULL; 2247 } 2248 else if (d->fails[i]) 2249 { 2250 free((ptr_t) d->fails[i]); 2251 d->fails[i] = NULL; 2252 } 2253 d->trcount = 0; 2254 } 2255 2256 ++d->trcount; 2257 2258 /* Set up the success bits for this state. */ 2259 d->success[s] = 0; 2260 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, 2261 s, *d)) 2262 d->success[s] |= 4; 2263 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, 2264 s, *d)) 2265 d->success[s] |= 2; 2266 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, 2267 s, *d)) 2268 d->success[s] |= 1; 2269 2270 MALLOC(trans, int, NOTCHAR); 2271 dfastate(s, d, trans); 2272 2273 /* Now go through the new transition table, and make sure that the trans 2274 and fail arrays are allocated large enough to hold a pointer for the 2275 largest state mentioned in the table. */ 2276 for (i = 0; i < NOTCHAR; ++i) 2277 if (trans[i] >= d->tralloc) 2278 { 2279 int oldalloc = d->tralloc; 2280 2281 while (trans[i] >= d->tralloc) 2282 d->tralloc *= 2; 2283 REALLOC(d->realtrans, int *, d->tralloc + 1); 2284 d->trans = d->realtrans + 1; 2285 REALLOC(d->fails, int *, d->tralloc); 2286 REALLOC(d->success, int, d->tralloc); 2287 while (oldalloc < d->tralloc) 2288 { 2289 d->trans[oldalloc] = NULL; 2290 d->fails[oldalloc++] = NULL; 2291 } 2292 } 2293 2294 /* Newline is a sentinel. */ 2295 trans[eolbyte] = -1; 2296 2297 if (ACCEPTING(s, *d)) 2298 d->fails[s] = trans; 2299 else 2300 d->trans[s] = trans; 2301} 2302 2303static void 2304build_state_zero (struct dfa *d) 2305{ 2306 d->tralloc = 1; 2307 d->trcount = 0; 2308 CALLOC(d->realtrans, int *, d->tralloc + 1); 2309 d->trans = d->realtrans + 1; 2310 CALLOC(d->fails, int *, d->tralloc); 2311 MALLOC(d->success, int, d->tralloc); 2312 build_state(0, d); 2313} 2314 2315#ifdef MBS_SUPPORT 2316/* Multibyte character handling sub-routins for dfaexec. */ 2317 2318/* Initial state may encounter the byte which is not a singlebyte character 2319 nor 1st byte of a multibyte character. But it is incorrect for initial 2320 state to accept such a byte. 2321 For example, in sjis encoding the regular expression like "\\" accepts 2322 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint 2323 0x815c. Then Initial state must skip the bytes which are not a singlebyte 2324 character nor 1st byte of a multibyte character. */ 2325#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ 2326 if (s == 0) \ 2327 { \ 2328 while (inputwcs[p - buf_begin] == 0 \ 2329 && mblen_buf[p - buf_begin] > 0 \ 2330 && p < buf_end) \ 2331 ++p; \ 2332 if (p >= end) \ 2333 { \ 2334 free(mblen_buf); \ 2335 free(inputwcs); \ 2336 return (size_t) -1; \ 2337 } \ 2338 } 2339 2340static void 2341realloc_trans_if_necessary(struct dfa *d, int new_state) 2342{ 2343 /* Make sure that the trans and fail arrays are allocated large enough 2344 to hold a pointer for the new state. */ 2345 if (new_state >= d->tralloc) 2346 { 2347 int oldalloc = d->tralloc; 2348 2349 while (new_state >= d->tralloc) 2350 d->tralloc *= 2; 2351 REALLOC(d->realtrans, int *, d->tralloc + 1); 2352 d->trans = d->realtrans + 1; 2353 REALLOC(d->fails, int *, d->tralloc); 2354 REALLOC(d->success, int, d->tralloc); 2355 while (oldalloc < d->tralloc) 2356 { 2357 d->trans[oldalloc] = NULL; 2358 d->fails[oldalloc++] = NULL; 2359 } 2360 } 2361} 2362 2363/* Return values of transit_state_singlebyte(), and 2364 transit_state_consume_1char. */ 2365typedef enum 2366{ 2367 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ 2368 TRANSIT_STATE_DONE, /* State transition has finished. */ 2369 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ 2370} status_transit_state; 2371 2372/* Consume a single byte and transit state from 's' to '*next_state'. 2373 This function is almost same as the state transition routin in dfaexec(). 2374 But state transition is done just once, otherwise matching succeed or 2375 reach the end of the buffer. */ 2376static status_transit_state 2377transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, 2378 int *next_state) 2379{ 2380 int *t; 2381 int works = s; 2382 2383 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; 2384 2385 while (rval == TRANSIT_STATE_IN_PROGRESS) 2386 { 2387 if ((t = d->trans[works]) != NULL) 2388 { 2389 works = t[*p]; 2390 rval = TRANSIT_STATE_DONE; 2391 if (works < 0) 2392 works = 0; 2393 } 2394 else if (works < 0) 2395 { 2396 if (p == buf_end) 2397 /* At the moment, it must not happen. */ 2398 return TRANSIT_STATE_END_BUFFER; 2399 works = 0; 2400 } 2401 else if (d->fails[works]) 2402 { 2403 works = d->fails[works][*p]; 2404 rval = TRANSIT_STATE_DONE; 2405 } 2406 else 2407 { 2408 build_state(works, d); 2409 } 2410 } 2411 *next_state = works; 2412 return rval; 2413} 2414 2415/* Check whether period can match or not in the current context. If it can, 2416 return the amount of the bytes with which period can match, otherwise 2417 return 0. 2418 `pos' is the position of the period. `index' is the index from the 2419 buf_begin, and it is the current position in the buffer. */ 2420static int 2421match_anychar (struct dfa *d, int s, position pos, int index) 2422{ 2423 int newline = 0; 2424 int letter = 0; 2425 wchar_t wc; 2426 int mbclen; 2427 2428 wc = inputwcs[index]; 2429 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2430 2431 /* Check context. */ 2432 if (wc == (wchar_t)eolbyte) 2433 { 2434 if (!(syntax_bits & RE_DOT_NEWLINE)) 2435 return 0; 2436 newline = 1; 2437 } 2438 else if (wc == (wchar_t)'\0') 2439 { 2440 if (syntax_bits & RE_DOT_NOT_NULL) 2441 return 0; 2442 newline = 1; 2443 } 2444 2445 if (iswalnum(wc) || wc == L'_') 2446 letter = 1; 2447 2448 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2449 newline, d->states[s].letter, letter)) 2450 return 0; 2451 2452 return mbclen; 2453} 2454 2455/* Check whether bracket expression can match or not in the current context. 2456 If it can, return the amount of the bytes with which expression can match, 2457 otherwise return 0. 2458 `pos' is the position of the bracket expression. `index' is the index 2459 from the buf_begin, and it is the current position in the buffer. */ 2460int 2461match_mb_charset (struct dfa *d, int s, position pos, int index) 2462{ 2463 int i; 2464 int match; /* Flag which represent that matching succeed. */ 2465 int match_len; /* Length of the character (or collating element) 2466 with which this operator match. */ 2467 size_t op_len; /* Length of the operator. */ 2468 char buffer[128]; 2469 wchar_t wcbuf[6]; 2470 2471 /* Pointer to the structure to which we are currently reffering. */ 2472 struct mb_char_classes *work_mbc; 2473 2474 int newline = 0; 2475 int letter = 0; 2476 wchar_t wc; /* Current reffering character. */ 2477 2478 wc = inputwcs[index]; 2479 2480 /* Check context. */ 2481 if (wc == (wchar_t)eolbyte) 2482 { 2483 if (!(syntax_bits & RE_DOT_NEWLINE)) 2484 return 0; 2485 newline = 1; 2486 } 2487 else if (wc == (wchar_t)'\0') 2488 { 2489 if (syntax_bits & RE_DOT_NOT_NULL) 2490 return 0; 2491 newline = 1; 2492 } 2493 if (iswalnum(wc) || wc == L'_') 2494 letter = 1; 2495 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, 2496 newline, d->states[s].letter, letter)) 2497 return 0; 2498 2499 /* Assign the current reffering operator to work_mbc. */ 2500 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); 2501 match = !work_mbc->invert; 2502 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; 2503 2504 /* match with a character class? */ 2505 for (i = 0; i<work_mbc->nch_classes; i++) 2506 { 2507 if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) 2508 goto charset_matched; 2509 } 2510 2511 strncpy(buffer, buf_begin + index, match_len); 2512 buffer[match_len] = '\0'; 2513 2514 /* match with an equivalent class? */ 2515 for (i = 0; i<work_mbc->nequivs; i++) 2516 { 2517 op_len = strlen(work_mbc->equivs[i]); 2518 strncpy(buffer, buf_begin + index, op_len); 2519 buffer[op_len] = '\0'; 2520 if (strcoll(work_mbc->equivs[i], buffer) == 0) 2521 { 2522 match_len = op_len; 2523 goto charset_matched; 2524 } 2525 } 2526 2527 /* match with a collating element? */ 2528 for (i = 0; i<work_mbc->ncoll_elems; i++) 2529 { 2530 op_len = strlen(work_mbc->coll_elems[i]); 2531 strncpy(buffer, buf_begin + index, op_len); 2532 buffer[op_len] = '\0'; 2533 2534 if (strcoll(work_mbc->coll_elems[i], buffer) == 0) 2535 { 2536 match_len = op_len; 2537 goto charset_matched; 2538 } 2539 } 2540 2541 wcbuf[0] = wc; 2542 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; 2543 2544 /* match with a range? */ 2545 for (i = 0; i<work_mbc->nranges; i++) 2546 { 2547 wcbuf[2] = work_mbc->range_sts[i]; 2548 wcbuf[4] = work_mbc->range_ends[i]; 2549 2550 if (wcscoll(wcbuf, wcbuf+2) >= 0 && 2551 wcscoll(wcbuf+4, wcbuf) >= 0) 2552 goto charset_matched; 2553 } 2554 2555 /* match with a character? */ 2556 if (case_fold) 2557 wc = towlower (wc); 2558 for (i = 0; i<work_mbc->nchars; i++) 2559 { 2560 if (wc == work_mbc->chars[i]) 2561 goto charset_matched; 2562 } 2563 2564 match = !match; 2565 2566 charset_matched: 2567 return match ? match_len : 0; 2568} 2569 2570/* Check each of `d->states[s].mbps.elem' can match or not. Then return the 2571 array which corresponds to `d->states[s].mbps.elem' and each element of 2572 the array contains the amount of the bytes with which the element can 2573 match. 2574 `index' is the index from the buf_begin, and it is the current position 2575 in the buffer. 2576 Caller MUST free the array which this function return. */ 2577static int* 2578check_matching_with_multibyte_ops (struct dfa *d, int s, int index) 2579{ 2580 int i; 2581 int* rarray; 2582 2583 MALLOC(rarray, int, d->states[s].mbps.nelem); 2584 for (i = 0; i < d->states[s].mbps.nelem; ++i) 2585 { 2586 position pos = d->states[s].mbps.elems[i]; 2587 switch(d->tokens[pos.index]) 2588 { 2589 case ANYCHAR: 2590 rarray[i] = match_anychar(d, s, pos, index); 2591 break; 2592 case MBCSET: 2593 rarray[i] = match_mb_charset(d, s, pos, index); 2594 break; 2595 default: 2596 break; /* can not happen. */ 2597 } 2598 } 2599 return rarray; 2600} 2601 2602/* Consume a single character and enumerate all of the positions which can 2603 be next position from the state `s'. 2604 `match_lens' is the input. It can be NULL, but it can also be the output 2605 of check_matching_with_multibyte_ops() for optimization. 2606 `mbclen' and `pps' are the output. `mbclen' is the length of the 2607 character consumed, and `pps' is the set this function enumerate. */ 2608static status_transit_state 2609transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, 2610 int *match_lens, int *mbclen, position_set *pps) 2611{ 2612 int i, j; 2613 int s1, s2; 2614 int* work_mbls; 2615 status_transit_state rs = TRANSIT_STATE_DONE; 2616 2617 /* Calculate the length of the (single/multi byte) character 2618 to which p points. */ 2619 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 2620 : mblen_buf[*pp - buf_begin]; 2621 2622 /* Calculate the state which can be reached from the state `s' by 2623 consuming `*mbclen' single bytes from the buffer. */ 2624 s1 = s; 2625 for (i = 0; i < *mbclen; i++) 2626 { 2627 s2 = s1; 2628 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); 2629 } 2630 /* Copy the positions contained by `s1' to the set `pps'. */ 2631 copy(&(d->states[s1].elems), pps); 2632 2633 /* Check (inputed)match_lens, and initialize if it is NULL. */ 2634 if (match_lens == NULL && d->states[s].mbps.nelem != 0) 2635 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2636 else 2637 work_mbls = match_lens; 2638 2639 /* Add all of the positions which can be reached from `s' by consuming 2640 a single character. */ 2641 for (i = 0; i < d->states[s].mbps.nelem ; i++) 2642 { 2643 if (work_mbls[i] == *mbclen) 2644 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; 2645 j++) 2646 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], 2647 pps); 2648 } 2649 2650 if (match_lens == NULL && work_mbls != NULL) 2651 free(work_mbls); 2652 return rs; 2653} 2654 2655/* Transit state from s, then return new state and update the pointer of the 2656 buffer. This function is for some operator which can match with a multi- 2657 byte character or a collating element(which may be multi characters). */ 2658static int 2659transit_state (struct dfa *d, int s, unsigned char const **pp) 2660{ 2661 int s1; 2662 int mbclen; /* The length of current input multibyte character. */ 2663 int maxlen = 0; 2664 int i, j; 2665 int *match_lens = NULL; 2666 int nelem = d->states[s].mbps.nelem; /* Just a alias. */ 2667 position_set follows; 2668 unsigned char const *p1 = *pp; 2669 status_transit_state rs; 2670 wchar_t wc; 2671 2672 if (nelem > 0) 2673 /* This state has (a) multibyte operator(s). 2674 We check whether each of them can match or not. */ 2675 { 2676 /* Note: caller must free the return value of this function. */ 2677 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); 2678 2679 for (i = 0; i < nelem; i++) 2680 /* Search the operator which match the longest string, 2681 in this state. */ 2682 { 2683 if (match_lens[i] > maxlen) 2684 maxlen = match_lens[i]; 2685 } 2686 } 2687 2688 if (nelem == 0 || maxlen == 0) 2689 /* This state has no multibyte operator which can match. 2690 We need to check only one singlebyte character. */ 2691 { 2692 status_transit_state rs; 2693 rs = transit_state_singlebyte(d, s, *pp, &s1); 2694 2695 /* We must update the pointer if state transition succeeded. */ 2696 if (rs == TRANSIT_STATE_DONE) 2697 ++*pp; 2698 2699 if (match_lens != NULL) 2700 free(match_lens); 2701 return s1; 2702 } 2703 2704 /* This state has some operators which can match a multibyte character. */ 2705 follows.nelem = 0; 2706 MALLOC(follows.elems, position, d->nleaves); 2707 2708 /* `maxlen' may be longer than the length of a character, because it may 2709 not be a character but a (multi character) collating element. 2710 We enumerate all of the positions which `s' can reach by consuming 2711 `maxlen' bytes. */ 2712 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); 2713 2714 wc = inputwcs[*pp - mbclen - buf_begin]; 2715 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2716 realloc_trans_if_necessary(d, s1); 2717 2718 while (*pp - p1 < maxlen) 2719 { 2720 follows.nelem = 0; 2721 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); 2722 2723 for (i = 0; i < nelem ; i++) 2724 { 2725 if (match_lens[i] == *pp - p1) 2726 for (j = 0; 2727 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) 2728 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], 2729 &follows); 2730 } 2731 2732 wc = inputwcs[*pp - mbclen - buf_begin]; 2733 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); 2734 realloc_trans_if_necessary(d, s1); 2735 } 2736 free(match_lens); 2737 free(follows.elems); 2738 return s1; 2739} 2740 2741#endif 2742 2743/* Search through a buffer looking for a match to the given struct dfa. 2744 Find the first occurrence of a string matching the regexp in the buffer, 2745 and the shortest possible version thereof. Return the offset of the first 2746 character after the match, or (size_t) -1 if none is found. BEGIN points to 2747 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE 2748 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place 2749 where we're supposed to store a 1 if backreferencing happened and the 2750 match needs to be verified by a backtracking matcher. Otherwise 2751 we store a 0 in *backref. */ 2752size_t 2753dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) 2754{ 2755 register int s; /* Current state. */ 2756 register unsigned char const *p; /* Current input character. */ 2757 register unsigned char const *end; /* One past the last input character. */ 2758 register int **trans, *t; /* Copy of d->trans so it can be optimized 2759 into a register. */ 2760 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ 2761 static int sbit[NOTCHAR]; /* Table for anding with d->success. */ 2762 static int sbit_init; 2763 2764 if (! sbit_init) 2765 { 2766 int i; 2767 2768 sbit_init = 1; 2769 for (i = 0; i < NOTCHAR; ++i) 2770 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; 2771 sbit[eol] = 4; 2772 } 2773 2774 if (! d->tralloc) 2775 build_state_zero(d); 2776 2777 s = 0; 2778 p = (unsigned char const *) begin; 2779 end = p + size; 2780 trans = d->trans; 2781 2782#ifdef MBS_SUPPORT 2783 if (MB_CUR_MAX > 1) 2784 { 2785 int remain_bytes, i; 2786 buf_begin = begin; 2787 buf_end = end; 2788 2789 /* initialize mblen_buf, and inputwcs. */ 2790 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); 2791 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); 2792 memset(&mbs, 0, sizeof(mbstate_t)); 2793 remain_bytes = 0; 2794 for (i = 0; i < end - (unsigned char const *)begin + 1; i++) 2795 { 2796 if (remain_bytes == 0) 2797 { 2798 remain_bytes 2799 = mbrtowc(inputwcs + i, begin + i, 2800 end - (unsigned char const *)begin - i + 1, &mbs); 2801 if (remain_bytes <= 1) 2802 { 2803 remain_bytes = 0; 2804 inputwcs[i] = (wchar_t)begin[i]; 2805 mblen_buf[i] = 0; 2806 } 2807 else 2808 { 2809 mblen_buf[i] = remain_bytes; 2810 remain_bytes--; 2811 } 2812 } 2813 else 2814 { 2815 mblen_buf[i] = remain_bytes; 2816 inputwcs[i] = 0; 2817 remain_bytes--; 2818 } 2819 } 2820 mblen_buf[i] = 0; 2821 inputwcs[i] = 0; /* sentinel */ 2822 } 2823#endif /* MBS_SUPPORT */ 2824 2825 for (;;) 2826 { 2827#ifdef MBS_SUPPORT 2828 if (MB_CUR_MAX > 1) 2829 while ((t = trans[s])) 2830 { 2831 if (d->states[s].mbps.nelem != 0) 2832 { 2833 /* Can match with a multibyte character( and multi character 2834 collating element). */ 2835 unsigned char const *nextp; 2836 2837 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2838 2839 nextp = p; 2840 s = transit_state(d, s, &nextp); 2841 p = nextp; 2842 2843 /* Trans table might be updated. */ 2844 trans = d->trans; 2845 } 2846 else 2847 { 2848 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2849 s = t[*p++]; 2850 } 2851 } 2852 else 2853#endif /* MBS_SUPPORT */ 2854 while ((t = trans[s])) 2855 s = t[*p++]; 2856 2857 if (s < 0) 2858 { 2859 if (p == end) 2860 { 2861#ifdef MBS_SUPPORT 2862 if (MB_CUR_MAX > 1) 2863 { 2864 free(mblen_buf); 2865 free(inputwcs); 2866 } 2867#endif /* MBS_SUPPORT */ 2868 return (size_t) -1; 2869 } 2870 s = 0; 2871 } 2872 else if ((t = d->fails[s])) 2873 { 2874 if (d->success[s] & sbit[*p]) 2875 { 2876 if (backref) 2877 *backref = (d->states[s].backref != 0); 2878#ifdef MBS_SUPPORT 2879 if (MB_CUR_MAX > 1) 2880 { 2881 free(mblen_buf); 2882 free(inputwcs); 2883 } 2884#endif /* MBS_SUPPORT */ 2885 return (char const *) p - begin; 2886 } 2887 2888#ifdef MBS_SUPPORT 2889 if (MB_CUR_MAX > 1) 2890 { 2891 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); 2892 if (d->states[s].mbps.nelem != 0) 2893 { 2894 /* Can match with a multibyte character( and multi 2895 character collating element). */ 2896 unsigned char const *nextp; 2897 nextp = p; 2898 s = transit_state(d, s, &nextp); 2899 p = nextp; 2900 2901 /* Trans table might be updated. */ 2902 trans = d->trans; 2903 } 2904 else 2905 s = t[*p++]; 2906 } 2907 else 2908#endif /* MBS_SUPPORT */ 2909 s = t[*p++]; 2910 } 2911 else 2912 { 2913 build_state(s, d); 2914 trans = d->trans; 2915 } 2916 } 2917} 2918 2919/* Initialize the components of a dfa that the other routines don't 2920 initialize for themselves. */ 2921void 2922dfainit (struct dfa *d) 2923{ 2924 d->calloc = 1; 2925 MALLOC(d->charclasses, charclass, d->calloc); 2926 d->cindex = 0; 2927 2928 d->talloc = 1; 2929 MALLOC(d->tokens, token, d->talloc); 2930 d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2931#ifdef MBS_SUPPORT 2932 if (MB_CUR_MAX > 1) 2933 { 2934 d->nmultibyte_prop = 1; 2935 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); 2936 d->nmbcsets = 0; 2937 d->mbcsets_alloc = 1; 2938 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); 2939 } 2940#endif 2941 2942 d->searchflag = 0; 2943 d->tralloc = 0; 2944 2945 d->musts = 0; 2946} 2947 2948/* Parse and analyze a single string of the given length. */ 2949void 2950dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) 2951{ 2952 if (case_fold) /* dummy folding in service of dfamust() */ 2953 { 2954 char *lcopy; 2955 int i; 2956 2957 lcopy = malloc(len); 2958 if (!lcopy) 2959 dfaerror(_("out of memory")); 2960 2961 /* This is a kludge. */ 2962 case_fold = 0; 2963 for (i = 0; i < len; ++i) 2964 if (ISUPPER ((unsigned char) s[i])) 2965 lcopy[i] = tolower ((unsigned char) s[i]); 2966 else 2967 lcopy[i] = s[i]; 2968 2969 dfainit(d); 2970 dfaparse(lcopy, len, d); 2971 free(lcopy); 2972 dfamust(d); 2973 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; 2974 case_fold = 1; 2975 dfaparse(s, len, d); 2976 dfaanalyze(d, searchflag); 2977 } 2978 else 2979 { 2980 dfainit(d); 2981 dfaparse(s, len, d); 2982 dfamust(d); 2983 dfaanalyze(d, searchflag); 2984 } 2985} 2986 2987/* Free the storage held by the components of a dfa. */ 2988void 2989dfafree (struct dfa *d) 2990{ 2991 int i; 2992 struct dfamust *dm, *ndm; 2993 2994 free((ptr_t) d->charclasses); 2995 free((ptr_t) d->tokens); 2996 2997#ifdef MBS_SUPPORT 2998 if (MB_CUR_MAX > 1) 2999 { 3000 free((ptr_t) d->multibyte_prop); 3001 for (i = 0; i < d->nmbcsets; ++i) 3002 { 3003 int j; 3004 struct mb_char_classes *p = &(d->mbcsets[i]); 3005 if (p->chars != NULL) 3006 free(p->chars); 3007 if (p->ch_classes != NULL) 3008 free(p->ch_classes); 3009 if (p->range_sts != NULL) 3010 free(p->range_sts); 3011 if (p->range_ends != NULL) 3012 free(p->range_ends); 3013 3014 for (j = 0; j < p->nequivs; ++j) 3015 free(p->equivs[j]); 3016 if (p->equivs != NULL) 3017 free(p->equivs); 3018 3019 for (j = 0; j < p->ncoll_elems; ++j) 3020 free(p->coll_elems[j]); 3021 if (p->coll_elems != NULL) 3022 free(p->coll_elems); 3023 } 3024 free((ptr_t) d->mbcsets); 3025 } 3026#endif /* MBS_SUPPORT */ 3027 3028 for (i = 0; i < d->sindex; ++i) 3029 free((ptr_t) d->states[i].elems.elems); 3030 free((ptr_t) d->states); 3031 for (i = 0; i < d->tindex; ++i) 3032 if (d->follows[i].elems) 3033 free((ptr_t) d->follows[i].elems); 3034 free((ptr_t) d->follows); 3035 for (i = 0; i < d->tralloc; ++i) 3036 if (d->trans[i]) 3037 free((ptr_t) d->trans[i]); 3038 else if (d->fails[i]) 3039 free((ptr_t) d->fails[i]); 3040 if (d->realtrans) free((ptr_t) d->realtrans); 3041 if (d->fails) free((ptr_t) d->fails); 3042 if (d->success) free((ptr_t) d->success); 3043 for (dm = d->musts; dm; dm = ndm) 3044 { 3045 ndm = dm->next; 3046 free(dm->must); 3047 free((ptr_t) dm); 3048 } 3049} 3050 3051/* Having found the postfix representation of the regular expression, 3052 try to find a long sequence of characters that must appear in any line 3053 containing the r.e. 3054 Finding a "longest" sequence is beyond the scope here; 3055 we take an easy way out and hope for the best. 3056 (Take "(ab|a)b"--please.) 3057 3058 We do a bottom-up calculation of sequences of characters that must appear 3059 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3060 representation: 3061 sequences that must appear at the left of the match ("left") 3062 sequences that must appear at the right of the match ("right") 3063 lists of sequences that must appear somewhere in the match ("in") 3064 sequences that must constitute the match ("is") 3065 3066 When we get to the root of the tree, we use one of the longest of its 3067 calculated "in" sequences as our answer. The sequence we find is returned in 3068 d->must (where "d" is the single argument passed to "dfamust"); 3069 the length of the sequence is returned in d->mustn. 3070 3071 The sequences calculated for the various types of node (in pseudo ANSI c) 3072 are shown below. "p" is the operand of unary operators (and the left-hand 3073 operand of binary operators); "q" is the right-hand operand of binary 3074 operators. 3075 3076 "ZERO" means "a zero-length sequence" below. 3077 3078 Type left right is in 3079 ---- ---- ----- -- -- 3080 char c # c # c # c # c 3081 3082 ANYCHAR ZERO ZERO ZERO ZERO 3083 3084 MBCSET ZERO ZERO ZERO ZERO 3085 3086 CSET ZERO ZERO ZERO ZERO 3087 3088 STAR ZERO ZERO ZERO ZERO 3089 3090 QMARK ZERO ZERO ZERO ZERO 3091 3092 PLUS p->left p->right ZERO p->in 3093 3094 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3095 p->left : q->right : q->is!=ZERO) ? q->in plus 3096 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3097 ZERO 3098 3099 OR longest common longest common (do p->is and substrings common to 3100 leading trailing q->is have same p->in and q->in 3101 (sub)sequence (sub)sequence length and 3102 of p->left of p->right content) ? 3103 and q->left and q->right p->is : NULL 3104 3105 If there's anything else we recognize in the tree, all four sequences get set 3106 to zero-length sequences. If there's something we don't recognize in the tree, 3107 we just return a zero-length sequence. 3108 3109 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3110 'aaa')? 3111 3112 And. . .is it here or someplace that we might ponder "optimizations" such as 3113 egrep 'psi|epsilon' -> egrep 'psi' 3114 egrep 'pepsi|epsilon' -> egrep 'epsi' 3115 (Yes, we now find "epsi" as a "string 3116 that must occur", but we might also 3117 simplify the *entire* r.e. being sought) 3118 grep '[c]' -> grep 'c' 3119 grep '(ab|a)b' -> grep 'ab' 3120 grep 'ab*' -> grep 'a' 3121 grep 'a*b' -> grep 'b' 3122 3123 There are several issues: 3124 3125 Is optimization easy (enough)? 3126 3127 Does optimization actually accomplish anything, 3128 or is the automaton you get from "psi|epsilon" (for example) 3129 the same as the one you get from "psi" (for example)? 3130 3131 Are optimizable r.e.'s likely to be used in real-life situations 3132 (something like 'ab*' is probably unlikely; something like is 3133 'psi|epsilon' is likelier)? */ 3134 3135static char * 3136icatalloc (char *old, char *new) 3137{ 3138 char *result; 3139 size_t oldsize, newsize; 3140 3141 newsize = (new == NULL) ? 0 : strlen(new); 3142 if (old == NULL) 3143 oldsize = 0; 3144 else if (newsize == 0) 3145 return old; 3146 else oldsize = strlen(old); 3147 if (old == NULL) 3148 result = (char *) malloc(newsize + 1); 3149 else 3150 result = (char *) realloc((void *) old, oldsize + newsize + 1); 3151 if (result != NULL && new != NULL) 3152 (void) strcpy(result + oldsize, new); 3153 return result; 3154} 3155 3156static char * 3157icpyalloc (char *string) 3158{ 3159 return icatalloc((char *) NULL, string); 3160} 3161 3162static char * 3163istrstr (char *lookin, char *lookfor) 3164{ 3165 char *cp; 3166 size_t len; 3167 3168 len = strlen(lookfor); 3169 for (cp = lookin; *cp != '\0'; ++cp) 3170 if (strncmp(cp, lookfor, len) == 0) 3171 return cp; 3172 return NULL; 3173} 3174 3175static void 3176ifree (char *cp) 3177{ 3178 if (cp != NULL) 3179 free(cp); 3180} 3181 3182static void 3183freelist (char **cpp) 3184{ 3185 int i; 3186 3187 if (cpp == NULL) 3188 return; 3189 for (i = 0; cpp[i] != NULL; ++i) 3190 { 3191 free(cpp[i]); 3192 cpp[i] = NULL; 3193 } 3194} 3195 3196static char ** 3197enlist (char **cpp, char *new, size_t len) 3198{ 3199 int i, j; 3200 3201 if (cpp == NULL) 3202 return NULL; 3203 if ((new = icpyalloc(new)) == NULL) 3204 { 3205 freelist(cpp); 3206 return NULL; 3207 } 3208 new[len] = '\0'; 3209 /* Is there already something in the list that's new (or longer)? */ 3210 for (i = 0; cpp[i] != NULL; ++i) 3211 if (istrstr(cpp[i], new) != NULL) 3212 { 3213 free(new); 3214 return cpp; 3215 } 3216 /* Eliminate any obsoleted strings. */ 3217 j = 0; 3218 while (cpp[j] != NULL) 3219 if (istrstr(new, cpp[j]) == NULL) 3220 ++j; 3221 else 3222 { 3223 free(cpp[j]); 3224 if (--i == j) 3225 break; 3226 cpp[j] = cpp[i]; 3227 cpp[i] = NULL; 3228 } 3229 /* Add the new string. */ 3230 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); 3231 if (cpp == NULL) 3232 return NULL; 3233 cpp[i] = new; 3234 cpp[i + 1] = NULL; 3235 return cpp; 3236} 3237 3238/* Given pointers to two strings, return a pointer to an allocated 3239 list of their distinct common substrings. Return NULL if something 3240 seems wild. */ 3241static char ** 3242comsubs (char *left, char *right) 3243{ 3244 char **cpp; 3245 char *lcp; 3246 char *rcp; 3247 size_t i, len; 3248 3249 if (left == NULL || right == NULL) 3250 return NULL; 3251 cpp = (char **) malloc(sizeof *cpp); 3252 if (cpp == NULL) 3253 return NULL; 3254 cpp[0] = NULL; 3255 for (lcp = left; *lcp != '\0'; ++lcp) 3256 { 3257 len = 0; 3258 rcp = strchr (right, *lcp); 3259 while (rcp != NULL) 3260 { 3261 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3262 continue; 3263 if (i > len) 3264 len = i; 3265 rcp = strchr (rcp + 1, *lcp); 3266 } 3267 if (len == 0) 3268 continue; 3269 if ((cpp = enlist(cpp, lcp, len)) == NULL) 3270 break; 3271 } 3272 return cpp; 3273} 3274 3275static char ** 3276addlists (char **old, char **new) 3277{ 3278 int i; 3279 3280 if (old == NULL || new == NULL) 3281 return NULL; 3282 for (i = 0; new[i] != NULL; ++i) 3283 { 3284 old = enlist(old, new[i], strlen(new[i])); 3285 if (old == NULL) 3286 break; 3287 } 3288 return old; 3289} 3290 3291/* Given two lists of substrings, return a new list giving substrings 3292 common to both. */ 3293static char ** 3294inboth (char **left, char **right) 3295{ 3296 char **both; 3297 char **temp; 3298 int lnum, rnum; 3299 3300 if (left == NULL || right == NULL) 3301 return NULL; 3302 both = (char **) malloc(sizeof *both); 3303 if (both == NULL) 3304 return NULL; 3305 both[0] = NULL; 3306 for (lnum = 0; left[lnum] != NULL; ++lnum) 3307 { 3308 for (rnum = 0; right[rnum] != NULL; ++rnum) 3309 { 3310 temp = comsubs(left[lnum], right[rnum]); 3311 if (temp == NULL) 3312 { 3313 freelist(both); 3314 return NULL; 3315 } 3316 both = addlists(both, temp); 3317 freelist(temp); 3318 free(temp); 3319 if (both == NULL) 3320 return NULL; 3321 } 3322 } 3323 return both; 3324} 3325 3326typedef struct 3327{ 3328 char **in; 3329 char *left; 3330 char *right; 3331 char *is; 3332} must; 3333 3334static void 3335resetmust (must *mp) 3336{ 3337 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 3338 freelist(mp->in); 3339} 3340 3341static void 3342dfamust (struct dfa *dfa) 3343{ 3344 must *musts; 3345 must *mp; 3346 char *result; 3347 int ri; 3348 int i; 3349 int exact; 3350 token t; 3351 static must must0; 3352 struct dfamust *dm; 3353 static char empty_string[] = ""; 3354 3355 result = empty_string; 3356 exact = 0; 3357 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); 3358 if (musts == NULL) 3359 return; 3360 mp = musts; 3361 for (i = 0; i <= dfa->tindex; ++i) 3362 mp[i] = must0; 3363 for (i = 0; i <= dfa->tindex; ++i) 3364 { 3365 mp[i].in = (char **) malloc(sizeof *mp[i].in); 3366 mp[i].left = malloc(2); 3367 mp[i].right = malloc(2); 3368 mp[i].is = malloc(2); 3369 if (mp[i].in == NULL || mp[i].left == NULL || 3370 mp[i].right == NULL || mp[i].is == NULL) 3371 goto done; 3372 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; 3373 mp[i].in[0] = NULL; 3374 } 3375#ifdef DEBUG 3376 fprintf(stderr, "dfamust:\n"); 3377 for (i = 0; i < dfa->tindex; ++i) 3378 { 3379 fprintf(stderr, " %d:", i); 3380 prtok(dfa->tokens[i]); 3381 } 3382 putc('\n', stderr); 3383#endif 3384 for (ri = 0; ri < dfa->tindex; ++ri) 3385 { 3386 switch (t = dfa->tokens[ri]) 3387 { 3388 case LPAREN: 3389 case RPAREN: 3390 goto done; /* "cannot happen" */ 3391 case EMPTY: 3392 case BEGLINE: 3393 case ENDLINE: 3394 case BEGWORD: 3395 case ENDWORD: 3396 case LIMWORD: 3397 case NOTLIMWORD: 3398 case BACKREF: 3399 resetmust(mp); 3400 break; 3401 case STAR: 3402 case QMARK: 3403 if (mp <= musts) 3404 goto done; /* "cannot happen" */ 3405 --mp; 3406 resetmust(mp); 3407 break; 3408 case OR: 3409 case ORTOP: 3410 if (mp < &musts[2]) 3411 goto done; /* "cannot happen" */ 3412 { 3413 char **new; 3414 must *lmp; 3415 must *rmp; 3416 int j, ln, rn, n; 3417 3418 rmp = --mp; 3419 lmp = --mp; 3420 /* Guaranteed to be. Unlikely, but. . . */ 3421 if (strcmp(lmp->is, rmp->is) != 0) 3422 lmp->is[0] = '\0'; 3423 /* Left side--easy */ 3424 i = 0; 3425 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 3426 ++i; 3427 lmp->left[i] = '\0'; 3428 /* Right side */ 3429 ln = strlen(lmp->right); 3430 rn = strlen(rmp->right); 3431 n = ln; 3432 if (n > rn) 3433 n = rn; 3434 for (i = 0; i < n; ++i) 3435 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 3436 break; 3437 for (j = 0; j < i; ++j) 3438 lmp->right[j] = lmp->right[(ln - i) + j]; 3439 lmp->right[j] = '\0'; 3440 new = inboth(lmp->in, rmp->in); 3441 if (new == NULL) 3442 goto done; 3443 freelist(lmp->in); 3444 free((char *) lmp->in); 3445 lmp->in = new; 3446 } 3447 break; 3448 case PLUS: 3449 if (mp <= musts) 3450 goto done; /* "cannot happen" */ 3451 --mp; 3452 mp->is[0] = '\0'; 3453 break; 3454 case END: 3455 if (mp != &musts[1]) 3456 goto done; /* "cannot happen" */ 3457 for (i = 0; musts[0].in[i] != NULL; ++i) 3458 if (strlen(musts[0].in[i]) > strlen(result)) 3459 result = musts[0].in[i]; 3460 if (strcmp(result, musts[0].is) == 0) 3461 exact = 1; 3462 goto done; 3463 case CAT: 3464 if (mp < &musts[2]) 3465 goto done; /* "cannot happen" */ 3466 { 3467 must *lmp; 3468 must *rmp; 3469 3470 rmp = --mp; 3471 lmp = --mp; 3472 /* In. Everything in left, plus everything in 3473 right, plus catenation of 3474 left's right and right's left. */ 3475 lmp->in = addlists(lmp->in, rmp->in); 3476 if (lmp->in == NULL) 3477 goto done; 3478 if (lmp->right[0] != '\0' && 3479 rmp->left[0] != '\0') 3480 { 3481 char *tp; 3482 3483 tp = icpyalloc(lmp->right); 3484 if (tp == NULL) 3485 goto done; 3486 tp = icatalloc(tp, rmp->left); 3487 if (tp == NULL) 3488 goto done; 3489 lmp->in = enlist(lmp->in, tp, 3490 strlen(tp)); 3491 free(tp); 3492 if (lmp->in == NULL) 3493 goto done; 3494 } 3495 /* Left-hand */ 3496 if (lmp->is[0] != '\0') 3497 { 3498 lmp->left = icatalloc(lmp->left, 3499 rmp->left); 3500 if (lmp->left == NULL) 3501 goto done; 3502 } 3503 /* Right-hand */ 3504 if (rmp->is[0] == '\0') 3505 lmp->right[0] = '\0'; 3506 lmp->right = icatalloc(lmp->right, rmp->right); 3507 if (lmp->right == NULL) 3508 goto done; 3509 /* Guaranteed to be */ 3510 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') 3511 { 3512 lmp->is = icatalloc(lmp->is, rmp->is); 3513 if (lmp->is == NULL) 3514 goto done; 3515 } 3516 else 3517 lmp->is[0] = '\0'; 3518 } 3519 break; 3520 default: 3521 if (t < END) 3522 { 3523 /* "cannot happen" */ 3524 goto done; 3525 } 3526 else if (t == '\0') 3527 { 3528 /* not on *my* shift */ 3529 goto done; 3530 } 3531 else if (t >= CSET 3532#ifdef MBS_SUPPORT 3533 || t == ANYCHAR 3534 || t == MBCSET 3535#endif /* MBS_SUPPORT */ 3536 ) 3537 { 3538 /* easy enough */ 3539 resetmust(mp); 3540 } 3541 else 3542 { 3543 /* plain character */ 3544 resetmust(mp); 3545 mp->is[0] = mp->left[0] = mp->right[0] = t; 3546 mp->is[1] = mp->left[1] = mp->right[1] = '\0'; 3547 mp->in = enlist(mp->in, mp->is, (size_t)1); 3548 if (mp->in == NULL) 3549 goto done; 3550 } 3551 break; 3552 } 3553#ifdef DEBUG 3554 fprintf(stderr, " node: %d:", ri); 3555 prtok(dfa->tokens[ri]); 3556 fprintf(stderr, "\n in:"); 3557 for (i = 0; mp->in[i]; ++i) 3558 fprintf(stderr, " \"%s\"", mp->in[i]); 3559 fprintf(stderr, "\n is: \"%s\"\n", mp->is); 3560 fprintf(stderr, " left: \"%s\"\n", mp->left); 3561 fprintf(stderr, " right: \"%s\"\n", mp->right); 3562#endif 3563 ++mp; 3564 } 3565 done: 3566 if (strlen(result)) 3567 { 3568 dm = (struct dfamust *) malloc(sizeof (struct dfamust)); 3569 dm->exact = exact; 3570 dm->must = malloc(strlen(result) + 1); 3571 strcpy(dm->must, result); 3572 dm->next = dfa->musts; 3573 dfa->musts = dm; 3574 } 3575 mp = musts; 3576 for (i = 0; i <= dfa->tindex; ++i) 3577 { 3578 freelist(mp[i].in); 3579 ifree((char *) mp[i].in); 3580 ifree(mp[i].left); 3581 ifree(mp[i].right); 3582 ifree(mp[i].is); 3583 } 3584 free((char *) mp); 3585} 3586/* vim:set shiftwidth=2: */ 3587