1/* -*- buffer-read-only: t -*- vi: set ro: */ 2/* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3#line 1 4/* Extended regular expression matching and search library. 5 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 6 Software Foundation, Inc. 7 This file is part of the GNU C Library. 8 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 9 10 This program is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 3, or (at your option) 13 any later version. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License along 21 with this program; if not, write to the Free Software Foundation, 22 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24#include "verify.h" 25#include "intprops.h" 26static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 27 Idx n) internal_function; 28static void match_ctx_clean (re_match_context_t *mctx) internal_function; 29static void match_ctx_free (re_match_context_t *cache) internal_function; 30static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node, 31 Idx str_idx, Idx from, Idx to) 32 internal_function; 33static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 34 internal_function; 35static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node, 36 Idx str_idx) internal_function; 37static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 38 Idx node, Idx str_idx) 39 internal_function; 40static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 41 re_dfastate_t **limited_sts, Idx last_node, 42 Idx last_str_idx) 43 internal_function; 44static reg_errcode_t re_search_internal (const regex_t *preg, 45 const char *string, Idx length, 46 Idx start, Idx last_start, Idx stop, 47 size_t nmatch, regmatch_t pmatch[], 48 int eflags) internal_function; 49static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp, 50 const char *string1, Idx length1, 51 const char *string2, Idx length2, 52 Idx start, regoff_t range, 53 struct re_registers *regs, 54 Idx stop, bool ret_len) internal_function; 55static regoff_t re_search_stub (struct re_pattern_buffer *bufp, 56 const char *string, Idx length, Idx start, 57 regoff_t range, Idx stop, 58 struct re_registers *regs, 59 bool ret_len) internal_function; 60static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 61 Idx nregs, int regs_allocated) 62 internal_function; 63static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 64 internal_function; 65static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match, 66 Idx *p_match_first) internal_function; 67static Idx check_halt_state_context (const re_match_context_t *mctx, 68 const re_dfastate_t *state, Idx idx) 69 internal_function; 70static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 71 regmatch_t *prev_idx_match, Idx cur_node, 72 Idx cur_idx, Idx nmatch) internal_function; 73static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 74 Idx str_idx, Idx dest_node, Idx nregs, 75 regmatch_t *regs, 76 re_node_set *eps_via_nodes) 77 internal_function; 78static reg_errcode_t set_regs (const regex_t *preg, 79 const re_match_context_t *mctx, 80 size_t nmatch, regmatch_t *pmatch, 81 bool fl_backtrack) internal_function; 82static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 83 internal_function; 84 85#ifdef RE_ENABLE_I18N 86static int sift_states_iter_mb (const re_match_context_t *mctx, 87 re_sift_context_t *sctx, 88 Idx node_idx, Idx str_idx, Idx max_str_idx) 89 internal_function; 90#endif /* RE_ENABLE_I18N */ 91static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 92 re_sift_context_t *sctx) 93 internal_function; 94static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 95 re_sift_context_t *sctx, Idx str_idx, 96 re_node_set *cur_dest) 97 internal_function; 98static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 99 re_sift_context_t *sctx, 100 Idx str_idx, 101 re_node_set *dest_nodes) 102 internal_function; 103static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 104 re_node_set *dest_nodes, 105 const re_node_set *candidates) 106 internal_function; 107static bool check_dst_limits (const re_match_context_t *mctx, 108 const re_node_set *limits, 109 Idx dst_node, Idx dst_idx, Idx src_node, 110 Idx src_idx) internal_function; 111static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 112 int boundaries, Idx subexp_idx, 113 Idx from_node, Idx bkref_idx) 114 internal_function; 115static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 116 Idx limit, Idx subexp_idx, 117 Idx node, Idx str_idx, 118 Idx bkref_idx) internal_function; 119static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 120 re_node_set *dest_nodes, 121 const re_node_set *candidates, 122 re_node_set *limits, 123 struct re_backref_cache_entry *bkref_ents, 124 Idx str_idx) internal_function; 125static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 126 re_sift_context_t *sctx, 127 Idx str_idx, const re_node_set *candidates) 128 internal_function; 129static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 130 re_dfastate_t **dst, 131 re_dfastate_t **src, Idx num) 132 internal_function; 133static re_dfastate_t *find_recover_state (reg_errcode_t *err, 134 re_match_context_t *mctx) internal_function; 135static re_dfastate_t *transit_state (reg_errcode_t *err, 136 re_match_context_t *mctx, 137 re_dfastate_t *state) internal_function; 138static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 139 re_match_context_t *mctx, 140 re_dfastate_t *next_state) 141 internal_function; 142static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 143 re_node_set *cur_nodes, 144 Idx str_idx) internal_function; 145#if 0 146static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 147 re_match_context_t *mctx, 148 re_dfastate_t *pstate) 149 internal_function; 150#endif 151#ifdef RE_ENABLE_I18N 152static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 153 re_dfastate_t *pstate) 154 internal_function; 155#endif /* RE_ENABLE_I18N */ 156static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 157 const re_node_set *nodes) 158 internal_function; 159static reg_errcode_t get_subexp (re_match_context_t *mctx, 160 Idx bkref_node, Idx bkref_str_idx) 161 internal_function; 162static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 163 const re_sub_match_top_t *sub_top, 164 re_sub_match_last_t *sub_last, 165 Idx bkref_node, Idx bkref_str) 166 internal_function; 167static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 168 Idx subexp_idx, int type) internal_function; 169static reg_errcode_t check_arrival (re_match_context_t *mctx, 170 state_array_t *path, Idx top_node, 171 Idx top_str, Idx last_node, Idx last_str, 172 int type) internal_function; 173static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 174 Idx str_idx, 175 re_node_set *cur_nodes, 176 re_node_set *next_nodes) 177 internal_function; 178static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 179 re_node_set *cur_nodes, 180 Idx ex_subexp, int type) 181 internal_function; 182static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 183 re_node_set *dst_nodes, 184 Idx target, Idx ex_subexp, 185 int type) internal_function; 186static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 187 re_node_set *cur_nodes, Idx cur_str, 188 Idx subexp_num, int type) 189 internal_function; 190static bool build_trtable (const re_dfa_t *dfa, 191 re_dfastate_t *state) internal_function; 192#ifdef RE_ENABLE_I18N 193static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 194 const re_string_t *input, Idx idx) 195 internal_function; 196# ifdef _LIBC 197static unsigned int find_collation_sequence_value (const unsigned char *mbs, 198 size_t name_len) 199 internal_function; 200# endif /* _LIBC */ 201#endif /* RE_ENABLE_I18N */ 202static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa, 203 const re_dfastate_t *state, 204 re_node_set *states_node, 205 bitset_t *states_ch) internal_function; 206static bool check_node_accept (const re_match_context_t *mctx, 207 const re_token_t *node, Idx idx) 208 internal_function; 209static reg_errcode_t extend_buffers (re_match_context_t *mctx) 210 internal_function; 211 212/* Entry point for POSIX code. */ 213 214/* regexec searches for a given pattern, specified by PREG, in the 215 string STRING. 216 217 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 218 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 219 least NMATCH elements, and we set them to the offsets of the 220 corresponding matched substrings. 221 222 EFLAGS specifies `execution flags' which affect matching: if 223 REG_NOTBOL is set, then ^ does not match at the beginning of the 224 string; if REG_NOTEOL is set, then $ does not match at the end. 225 226 We return 0 if we find a match and REG_NOMATCH if not. */ 227 228int 229regexec (preg, string, nmatch, pmatch, eflags) 230 const regex_t *_Restrict_ preg; 231 const char *_Restrict_ string; 232 size_t nmatch; 233 regmatch_t pmatch[_Restrict_arr_]; 234 int eflags; 235{ 236 reg_errcode_t err; 237 Idx start, length; 238#ifdef _LIBC 239 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 240#endif 241 242 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 243 return REG_BADPAT; 244 245 if (eflags & REG_STARTEND) 246 { 247 start = pmatch[0].rm_so; 248 length = pmatch[0].rm_eo; 249 } 250 else 251 { 252 start = 0; 253 length = strlen (string); 254 } 255 256 __libc_lock_lock (dfa->lock); 257 if (preg->no_sub) 258 err = re_search_internal (preg, string, length, start, length, 259 length, 0, NULL, eflags); 260 else 261 err = re_search_internal (preg, string, length, start, length, 262 length, nmatch, pmatch, eflags); 263 __libc_lock_unlock (dfa->lock); 264 return err != REG_NOERROR; 265} 266 267#ifdef _LIBC 268# include <shlib-compat.h> 269versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 270 271# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 272__typeof__ (__regexec) __compat_regexec; 273 274int 275attribute_compat_text_section 276__compat_regexec (const regex_t *_Restrict_ preg, 277 const char *_Restrict_ string, size_t nmatch, 278 regmatch_t pmatch[], int eflags) 279{ 280 return regexec (preg, string, nmatch, pmatch, 281 eflags & (REG_NOTBOL | REG_NOTEOL)); 282} 283compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 284# endif 285#endif 286 287/* Entry points for GNU code. */ 288 289/* re_match, re_search, re_match_2, re_search_2 290 291 The former two functions operate on STRING with length LENGTH, 292 while the later two operate on concatenation of STRING1 and STRING2 293 with lengths LENGTH1 and LENGTH2, respectively. 294 295 re_match() matches the compiled pattern in BUFP against the string, 296 starting at index START. 297 298 re_search() first tries matching at index START, then it tries to match 299 starting from index START + 1, and so on. The last start position tried 300 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 301 way as re_match().) 302 303 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 304 the first STOP characters of the concatenation of the strings should be 305 concerned. 306 307 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 308 and all groups is stored in REGS. (For the "_2" variants, the offsets are 309 computed relative to the concatenation, not relative to the individual 310 strings.) 311 312 On success, re_match* functions return the length of the match, re_search* 313 return the position of the start of the match. Return value -1 means no 314 match was found and -2 indicates an internal error. */ 315 316regoff_t 317re_match (bufp, string, length, start, regs) 318 struct re_pattern_buffer *bufp; 319 const char *string; 320 Idx length, start; 321 struct re_registers *regs; 322{ 323 return re_search_stub (bufp, string, length, start, 0, length, regs, true); 324} 325#ifdef _LIBC 326weak_alias (__re_match, re_match) 327#endif 328 329regoff_t 330re_search (bufp, string, length, start, range, regs) 331 struct re_pattern_buffer *bufp; 332 const char *string; 333 Idx length, start; 334 regoff_t range; 335 struct re_registers *regs; 336{ 337 return re_search_stub (bufp, string, length, start, range, length, regs, 338 false); 339} 340#ifdef _LIBC 341weak_alias (__re_search, re_search) 342#endif 343 344regoff_t 345re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 346 struct re_pattern_buffer *bufp; 347 const char *string1, *string2; 348 Idx length1, length2, start, stop; 349 struct re_registers *regs; 350{ 351 return re_search_2_stub (bufp, string1, length1, string2, length2, 352 start, 0, regs, stop, true); 353} 354#ifdef _LIBC 355weak_alias (__re_match_2, re_match_2) 356#endif 357 358regoff_t 359re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 360 struct re_pattern_buffer *bufp; 361 const char *string1, *string2; 362 Idx length1, length2, start, stop; 363 regoff_t range; 364 struct re_registers *regs; 365{ 366 return re_search_2_stub (bufp, string1, length1, string2, length2, 367 start, range, regs, stop, false); 368} 369#ifdef _LIBC 370weak_alias (__re_search_2, re_search_2) 371#endif 372 373static regoff_t 374internal_function 375re_search_2_stub (struct re_pattern_buffer *bufp, 376 const char *string1, Idx length1, 377 const char *string2, Idx length2, 378 Idx start, regoff_t range, struct re_registers *regs, 379 Idx stop, bool ret_len) 380{ 381 const char *str; 382 regoff_t rval; 383 Idx len = length1 + length2; 384 char *s = NULL; 385 386 verify (! TYPE_SIGNED (Idx)); 387 if (BE (len < length1, 0)) 388 return -2; 389 /* if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 390 return -2; */ 391 392 /* Concatenate the strings. */ 393 if (length2 > 0) 394 if (length1 > 0) 395 { 396 s = re_malloc (char, len); 397 398 if (BE (s == NULL, 0)) 399 return -2; 400#ifdef _LIBC 401 memcpy (__mempcpy (s, string1, length1), string2, length2); 402#else 403 memcpy (s, string1, length1); 404 memcpy (s + length1, string2, length2); 405#endif 406 str = s; 407 } 408 else 409 str = string2; 410 else 411 str = string1; 412 413 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 414 ret_len); 415 re_free (s); 416 return rval; 417} 418 419/* The parameters have the same meaning as those of re_search. 420 Additional parameters: 421 If RET_LEN is true the length of the match is returned (re_match style); 422 otherwise the position of the match is returned. */ 423 424static regoff_t 425internal_function 426re_search_stub (struct re_pattern_buffer *bufp, 427 const char *string, Idx length, 428 Idx start, regoff_t range, Idx stop, struct re_registers *regs, 429 bool ret_len) 430{ 431 reg_errcode_t result; 432 regmatch_t *pmatch; 433 Idx nregs; 434 regoff_t rval; 435 int eflags = 0; 436#ifdef _LIBC 437 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 438#endif 439 Idx last_start = start + range; 440 441 /* Check for out-of-range. */ 442 verify (! TYPE_SIGNED (Idx)); 443 /* if (BE (start < 0, 0)) 444 return -1; */ 445 if (BE (start > length, 0)) 446 return -1; 447 if (BE (length < last_start || (0 <= range && last_start < start), 0)) 448 last_start = length; 449 else if (BE (/* last_start < 0 || */ (range < 0 && start <= last_start), 0)) 450 last_start = 0; 451 452 __libc_lock_lock (dfa->lock); 453 454 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 455 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 456 457 /* Compile fastmap if we haven't yet. */ 458 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate) 459 re_compile_fastmap (bufp); 460 461 if (BE (bufp->no_sub, 0)) 462 regs = NULL; 463 464 /* We need at least 1 register. */ 465 if (regs == NULL) 466 nregs = 1; 467 else if (BE (bufp->regs_allocated == REGS_FIXED 468 && regs->num_regs <= bufp->re_nsub, 0)) 469 { 470 nregs = regs->num_regs; 471 if (BE (nregs < 1, 0)) 472 { 473 /* Nothing can be copied to regs. */ 474 regs = NULL; 475 nregs = 1; 476 } 477 } 478 else 479 nregs = bufp->re_nsub + 1; 480 pmatch = re_malloc (regmatch_t, nregs); 481 if (BE (pmatch == NULL, 0)) 482 { 483 rval = -2; 484 goto out; 485 } 486 487 result = re_search_internal (bufp, string, length, start, last_start, stop, 488 nregs, pmatch, eflags); 489 490 rval = 0; 491 492 /* I hope we needn't fill ther regs with -1's when no match was found. */ 493 if (result != REG_NOERROR) 494 rval = -1; 495 else if (regs != NULL) 496 { 497 /* If caller wants register contents data back, copy them. */ 498 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 499 bufp->regs_allocated); 500 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 501 rval = -2; 502 } 503 504 if (BE (rval == 0, 1)) 505 { 506 if (ret_len) 507 { 508 assert (pmatch[0].rm_so == start); 509 rval = pmatch[0].rm_eo - start; 510 } 511 else 512 rval = pmatch[0].rm_so; 513 } 514 re_free (pmatch); 515 out: 516 __libc_lock_unlock (dfa->lock); 517 return rval; 518} 519 520static unsigned int 521internal_function 522re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs, 523 int regs_allocated) 524{ 525 int rval = REGS_REALLOCATE; 526 Idx i; 527 Idx need_regs = nregs + 1; 528 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 529 uses. */ 530 531 /* Have the register data arrays been allocated? */ 532 if (regs_allocated == REGS_UNALLOCATED) 533 { /* No. So allocate them with malloc. */ 534 regs->start = re_malloc (regoff_t, need_regs); 535 if (BE (regs->start == NULL, 0)) 536 return REGS_UNALLOCATED; 537 regs->end = re_malloc (regoff_t, need_regs); 538 if (BE (regs->end == NULL, 0)) 539 { 540 re_free (regs->start); 541 return REGS_UNALLOCATED; 542 } 543 regs->num_regs = need_regs; 544 } 545 else if (regs_allocated == REGS_REALLOCATE) 546 { /* Yes. If we need more elements than were already 547 allocated, reallocate them. If we need fewer, just 548 leave it alone. */ 549 if (BE (need_regs > regs->num_regs, 0)) 550 { 551 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 552 regoff_t *new_end; 553 if (BE (new_start == NULL, 0)) 554 return REGS_UNALLOCATED; 555 new_end = re_realloc (regs->end, regoff_t, need_regs); 556 if (BE (new_end == NULL, 0)) 557 { 558 re_free (new_start); 559 return REGS_UNALLOCATED; 560 } 561 regs->start = new_start; 562 regs->end = new_end; 563 regs->num_regs = need_regs; 564 } 565 } 566 else 567 { 568 assert (regs_allocated == REGS_FIXED); 569 /* This function may not be called with REGS_FIXED and nregs too big. */ 570 assert (regs->num_regs >= nregs); 571 rval = REGS_FIXED; 572 } 573 574 /* Copy the regs. */ 575 for (i = 0; i < nregs; ++i) 576 { 577 regs->start[i] = pmatch[i].rm_so; 578 regs->end[i] = pmatch[i].rm_eo; 579 } 580 for ( ; i < regs->num_regs; ++i) 581 regs->start[i] = regs->end[i] = -1; 582 583 return rval; 584} 585 586/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 587 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 588 this memory for recording register information. STARTS and ENDS 589 must be allocated using the malloc library routine, and must each 590 be at least NUM_REGS * sizeof (regoff_t) bytes long. 591 592 If NUM_REGS == 0, then subsequent matches should allocate their own 593 register data. 594 595 Unless this function is called, the first search or match using 596 PATTERN_BUFFER will allocate its own register data, without 597 freeing the old data. */ 598 599void 600re_set_registers (bufp, regs, num_regs, starts, ends) 601 struct re_pattern_buffer *bufp; 602 struct re_registers *regs; 603 __re_size_t num_regs; 604 regoff_t *starts, *ends; 605{ 606 if (num_regs) 607 { 608 bufp->regs_allocated = REGS_REALLOCATE; 609 regs->num_regs = num_regs; 610 regs->start = starts; 611 regs->end = ends; 612 } 613 else 614 { 615 bufp->regs_allocated = REGS_UNALLOCATED; 616 regs->num_regs = 0; 617 regs->start = regs->end = NULL; 618 } 619} 620#ifdef _LIBC 621weak_alias (__re_set_registers, re_set_registers) 622#endif 623 624/* Entry points compatible with 4.2 BSD regex library. We don't define 625 them unless specifically requested. */ 626 627#if defined _REGEX_RE_COMP || defined _LIBC 628int 629# ifdef _LIBC 630weak_function 631# endif 632re_exec (s) 633 const char *s; 634{ 635 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 636} 637#endif /* _REGEX_RE_COMP */ 638 639/* Internal entry point. */ 640 641/* Searches for a compiled pattern PREG in the string STRING, whose 642 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 643 meaning as with regexec. LAST_START is START + RANGE, where 644 START and RANGE have the same meaning as with re_search. 645 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 646 otherwise return the error code. 647 Note: We assume front end functions already check ranges. 648 (0 <= LAST_START && LAST_START <= LENGTH) */ 649 650static reg_errcode_t 651internal_function 652re_search_internal (const regex_t *preg, 653 const char *string, Idx length, 654 Idx start, Idx last_start, Idx stop, 655 size_t nmatch, regmatch_t pmatch[], 656 int eflags) 657{ 658 reg_errcode_t err; 659 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 660 Idx left_lim, right_lim; 661 int incr; 662 bool fl_longest_match; 663 int match_kind; 664 Idx match_first; 665 Idx match_last = REG_MISSING; 666 Idx extra_nmatch; 667 bool sb; 668 int ch; 669#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 670 re_match_context_t mctx = { .dfa = dfa }; 671#else 672 re_match_context_t mctx; 673#endif 674 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate 675 && start != last_start && !preg->can_be_null) 676 ? preg->fastmap : NULL); 677 RE_TRANSLATE_TYPE t = preg->translate; 678 679#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 680 memset (&mctx, '\0', sizeof (re_match_context_t)); 681 mctx.dfa = dfa; 682#endif 683 684 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 685 nmatch -= extra_nmatch; 686 687 /* Check if the DFA haven't been compiled. */ 688 if (BE (preg->used == 0 || dfa->init_state == NULL 689 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 690 || dfa->init_state_begbuf == NULL, 0)) 691 return REG_NOMATCH; 692 693#ifdef DEBUG 694 /* We assume front-end functions already check them. */ 695 assert (0 <= last_start && last_start <= length); 696#endif 697 698 /* If initial states with non-begbuf contexts have no elements, 699 the regex must be anchored. If preg->newline_anchor is set, 700 we'll never use init_state_nl, so do not check it. */ 701 if (dfa->init_state->nodes.nelem == 0 702 && dfa->init_state_word->nodes.nelem == 0 703 && (dfa->init_state_nl->nodes.nelem == 0 704 || !preg->newline_anchor)) 705 { 706 if (start != 0 && last_start != 0) 707 return REG_NOMATCH; 708 start = last_start = 0; 709 } 710 711 /* We must check the longest matching, if nmatch > 0. */ 712 fl_longest_match = (nmatch != 0 || dfa->nbackref); 713 714 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 715 preg->translate, (preg->syntax & RE_ICASE) != 0, 716 dfa); 717 if (BE (err != REG_NOERROR, 0)) 718 goto free_return; 719 mctx.input.stop = stop; 720 mctx.input.raw_stop = stop; 721 mctx.input.newline_anchor = preg->newline_anchor; 722 723 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 724 if (BE (err != REG_NOERROR, 0)) 725 goto free_return; 726 727 /* We will log all the DFA states through which the dfa pass, 728 if nmatch > 1, or this dfa has "multibyte node", which is a 729 back-reference or a node which can accept multibyte character or 730 multi character collating element. */ 731 if (nmatch > 1 || dfa->has_mb_node) 732 { 733 /* Avoid overflow. */ 734 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) 735 { 736 err = REG_ESPACE; 737 goto free_return; 738 } 739 740 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 741 if (BE (mctx.state_log == NULL, 0)) 742 { 743 err = REG_ESPACE; 744 goto free_return; 745 } 746 } 747 else 748 mctx.state_log = NULL; 749 750 match_first = start; 751 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 752 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 753 754 /* Check incrementally whether of not the input string match. */ 755 incr = (last_start < start) ? -1 : 1; 756 left_lim = (last_start < start) ? last_start : start; 757 right_lim = (last_start < start) ? start : last_start; 758 sb = dfa->mb_cur_max == 1; 759 match_kind = 760 (fastmap 761 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 762 | (start <= last_start ? 2 : 0) 763 | (t != NULL ? 1 : 0)) 764 : 8); 765 766 for (;; match_first += incr) 767 { 768 err = REG_NOMATCH; 769 if (match_first < left_lim || right_lim < match_first) 770 goto free_return; 771 772 /* Advance as rapidly as possible through the string, until we 773 find a plausible place to start matching. This may be done 774 with varying efficiency, so there are various possibilities: 775 only the most common of them are specialized, in order to 776 save on code size. We use a switch statement for speed. */ 777 switch (match_kind) 778 { 779 case 8: 780 /* No fastmap. */ 781 break; 782 783 case 7: 784 /* Fastmap with single-byte translation, match forward. */ 785 while (BE (match_first < right_lim, 1) 786 && !fastmap[t[(unsigned char) string[match_first]]]) 787 ++match_first; 788 goto forward_match_found_start_or_reached_end; 789 790 case 6: 791 /* Fastmap without translation, match forward. */ 792 while (BE (match_first < right_lim, 1) 793 && !fastmap[(unsigned char) string[match_first]]) 794 ++match_first; 795 796 forward_match_found_start_or_reached_end: 797 if (BE (match_first == right_lim, 0)) 798 { 799 ch = match_first >= length 800 ? 0 : (unsigned char) string[match_first]; 801 if (!fastmap[t ? t[ch] : ch]) 802 goto free_return; 803 } 804 break; 805 806 case 4: 807 case 5: 808 /* Fastmap without multi-byte translation, match backwards. */ 809 while (match_first >= left_lim) 810 { 811 ch = match_first >= length 812 ? 0 : (unsigned char) string[match_first]; 813 if (fastmap[t ? t[ch] : ch]) 814 break; 815 --match_first; 816 } 817 if (match_first < left_lim) 818 goto free_return; 819 break; 820 821 default: 822 /* In this case, we can't determine easily the current byte, 823 since it might be a component byte of a multibyte 824 character. Then we use the constructed buffer instead. */ 825 for (;;) 826 { 827 /* If MATCH_FIRST is out of the valid range, reconstruct the 828 buffers. */ 829 __re_size_t offset = match_first - mctx.input.raw_mbs_idx; 830 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0)) 831 { 832 err = re_string_reconstruct (&mctx.input, match_first, 833 eflags); 834 if (BE (err != REG_NOERROR, 0)) 835 goto free_return; 836 837 offset = match_first - mctx.input.raw_mbs_idx; 838 } 839 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 840 Note that MATCH_FIRST must not be smaller than 0. */ 841 ch = (match_first >= length 842 ? 0 : re_string_byte_at (&mctx.input, offset)); 843 if (fastmap[ch]) 844 break; 845 match_first += incr; 846 if (match_first < left_lim || match_first > right_lim) 847 { 848 err = REG_NOMATCH; 849 goto free_return; 850 } 851 } 852 break; 853 } 854 855 /* Reconstruct the buffers so that the matcher can assume that 856 the matching starts from the beginning of the buffer. */ 857 err = re_string_reconstruct (&mctx.input, match_first, eflags); 858 if (BE (err != REG_NOERROR, 0)) 859 goto free_return; 860 861#ifdef RE_ENABLE_I18N 862 /* Don't consider this char as a possible match start if it part, 863 yet isn't the head, of a multibyte character. */ 864 if (!sb && !re_string_first_byte (&mctx.input, 0)) 865 continue; 866#endif 867 868 /* It seems to be appropriate one, then use the matcher. */ 869 /* We assume that the matching starts from 0. */ 870 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 871 match_last = check_matching (&mctx, fl_longest_match, 872 start <= last_start ? &match_first : NULL); 873 if (match_last != REG_MISSING) 874 { 875 if (BE (match_last == REG_ERROR, 0)) 876 { 877 err = REG_ESPACE; 878 goto free_return; 879 } 880 else 881 { 882 mctx.match_last = match_last; 883 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 884 { 885 re_dfastate_t *pstate = mctx.state_log[match_last]; 886 mctx.last_node = check_halt_state_context (&mctx, pstate, 887 match_last); 888 } 889 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 890 || dfa->nbackref) 891 { 892 err = prune_impossible_nodes (&mctx); 893 if (err == REG_NOERROR) 894 break; 895 if (BE (err != REG_NOMATCH, 0)) 896 goto free_return; 897 match_last = REG_MISSING; 898 } 899 else 900 break; /* We found a match. */ 901 } 902 } 903 904 match_ctx_clean (&mctx); 905 } 906 907#ifdef DEBUG 908 assert (match_last != REG_MISSING); 909 assert (err == REG_NOERROR); 910#endif 911 912 /* Set pmatch[] if we need. */ 913 if (nmatch > 0) 914 { 915 Idx reg_idx; 916 917 /* Initialize registers. */ 918 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 919 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 920 921 /* Set the points where matching start/end. */ 922 pmatch[0].rm_so = 0; 923 pmatch[0].rm_eo = mctx.match_last; 924 /* FIXME: This function should fail if mctx.match_last exceeds 925 the maximum possible regoff_t value. We need a new error 926 code REG_OVERFLOW. */ 927 928 if (!preg->no_sub && nmatch > 1) 929 { 930 err = set_regs (preg, &mctx, nmatch, pmatch, 931 dfa->has_plural_match && dfa->nbackref > 0); 932 if (BE (err != REG_NOERROR, 0)) 933 goto free_return; 934 } 935 936 /* At last, add the offset to the each registers, since we slided 937 the buffers so that we could assume that the matching starts 938 from 0. */ 939 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 940 if (pmatch[reg_idx].rm_so != -1) 941 { 942#ifdef RE_ENABLE_I18N 943 if (BE (mctx.input.offsets_needed != 0, 0)) 944 { 945 pmatch[reg_idx].rm_so = 946 (pmatch[reg_idx].rm_so == mctx.input.valid_len 947 ? mctx.input.valid_raw_len 948 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 949 pmatch[reg_idx].rm_eo = 950 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 951 ? mctx.input.valid_raw_len 952 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 953 } 954#else 955 assert (mctx.input.offsets_needed == 0); 956#endif 957 pmatch[reg_idx].rm_so += match_first; 958 pmatch[reg_idx].rm_eo += match_first; 959 } 960 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 961 { 962 pmatch[nmatch + reg_idx].rm_so = -1; 963 pmatch[nmatch + reg_idx].rm_eo = -1; 964 } 965 966 if (dfa->subexp_map) 967 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 968 if (dfa->subexp_map[reg_idx] != reg_idx) 969 { 970 pmatch[reg_idx + 1].rm_so 971 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 972 pmatch[reg_idx + 1].rm_eo 973 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 974 } 975 } 976 977 free_return: 978 re_free (mctx.state_log); 979 if (dfa->nbackref) 980 match_ctx_free (&mctx); 981 re_string_destruct (&mctx.input); 982 return err; 983} 984 985static reg_errcode_t 986internal_function 987prune_impossible_nodes (re_match_context_t *mctx) 988{ 989 const re_dfa_t *const dfa = mctx->dfa; 990 Idx halt_node, match_last; 991 reg_errcode_t ret; 992 re_dfastate_t **sifted_states; 993 re_dfastate_t **lim_states = NULL; 994 re_sift_context_t sctx; 995#ifdef DEBUG 996 assert (mctx->state_log != NULL); 997#endif 998 match_last = mctx->match_last; 999 halt_node = mctx->last_node; 1000 1001 /* Avoid overflow. */ 1002 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) 1003 return REG_ESPACE; 1004 1005 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 1006 if (BE (sifted_states == NULL, 0)) 1007 { 1008 ret = REG_ESPACE; 1009 goto free_return; 1010 } 1011 if (dfa->nbackref) 1012 { 1013 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 1014 if (BE (lim_states == NULL, 0)) 1015 { 1016 ret = REG_ESPACE; 1017 goto free_return; 1018 } 1019 while (1) 1020 { 1021 memset (lim_states, '\0', 1022 sizeof (re_dfastate_t *) * (match_last + 1)); 1023 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 1024 match_last); 1025 ret = sift_states_backward (mctx, &sctx); 1026 re_node_set_free (&sctx.limits); 1027 if (BE (ret != REG_NOERROR, 0)) 1028 goto free_return; 1029 if (sifted_states[0] != NULL || lim_states[0] != NULL) 1030 break; 1031 do 1032 { 1033 --match_last; 1034 if (! REG_VALID_INDEX (match_last)) 1035 { 1036 ret = REG_NOMATCH; 1037 goto free_return; 1038 } 1039 } while (mctx->state_log[match_last] == NULL 1040 || !mctx->state_log[match_last]->halt); 1041 halt_node = check_halt_state_context (mctx, 1042 mctx->state_log[match_last], 1043 match_last); 1044 } 1045 ret = merge_state_array (dfa, sifted_states, lim_states, 1046 match_last + 1); 1047 re_free (lim_states); 1048 lim_states = NULL; 1049 if (BE (ret != REG_NOERROR, 0)) 1050 goto free_return; 1051 } 1052 else 1053 { 1054 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1055 ret = sift_states_backward (mctx, &sctx); 1056 re_node_set_free (&sctx.limits); 1057 if (BE (ret != REG_NOERROR, 0)) 1058 goto free_return; 1059 if (sifted_states[0] == NULL) 1060 { 1061 ret = REG_NOMATCH; 1062 goto free_return; 1063 } 1064 } 1065 re_free (mctx->state_log); 1066 mctx->state_log = sifted_states; 1067 sifted_states = NULL; 1068 mctx->last_node = halt_node; 1069 mctx->match_last = match_last; 1070 ret = REG_NOERROR; 1071 free_return: 1072 re_free (sifted_states); 1073 re_free (lim_states); 1074 return ret; 1075} 1076 1077/* Acquire an initial state and return it. 1078 We must select appropriate initial state depending on the context, 1079 since initial states may have constraints like "\<", "^", etc.. */ 1080 1081static inline re_dfastate_t * 1082__attribute ((always_inline)) internal_function 1083acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1084 Idx idx) 1085{ 1086 const re_dfa_t *const dfa = mctx->dfa; 1087 if (dfa->init_state->has_constraint) 1088 { 1089 unsigned int context; 1090 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1091 if (IS_WORD_CONTEXT (context)) 1092 return dfa->init_state_word; 1093 else if (IS_ORDINARY_CONTEXT (context)) 1094 return dfa->init_state; 1095 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1096 return dfa->init_state_begbuf; 1097 else if (IS_NEWLINE_CONTEXT (context)) 1098 return dfa->init_state_nl; 1099 else if (IS_BEGBUF_CONTEXT (context)) 1100 { 1101 /* It is relatively rare case, then calculate on demand. */ 1102 return re_acquire_state_context (err, dfa, 1103 dfa->init_state->entrance_nodes, 1104 context); 1105 } 1106 else 1107 /* Must not happen? */ 1108 return dfa->init_state; 1109 } 1110 else 1111 return dfa->init_state; 1112} 1113 1114/* Check whether the regular expression match input string INPUT or not, 1115 and return the index where the matching end. Return REG_MISSING if 1116 there is no match, and return REG_ERROR in case of an error. 1117 FL_LONGEST_MATCH means we want the POSIX longest matching. 1118 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1119 next place where we may want to try matching. 1120 Note that the matcher assume that the maching starts from the current 1121 index of the buffer. */ 1122 1123static Idx 1124internal_function 1125check_matching (re_match_context_t *mctx, bool fl_longest_match, 1126 Idx *p_match_first) 1127{ 1128 const re_dfa_t *const dfa = mctx->dfa; 1129 reg_errcode_t err; 1130 Idx match = 0; 1131 Idx match_last = REG_MISSING; 1132 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 1133 re_dfastate_t *cur_state; 1134 bool at_init_state = p_match_first != NULL; 1135 Idx next_start_idx = cur_str_idx; 1136 1137 err = REG_NOERROR; 1138 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1139 /* An initial state must not be NULL (invalid). */ 1140 if (BE (cur_state == NULL, 0)) 1141 { 1142 assert (err == REG_ESPACE); 1143 return REG_ERROR; 1144 } 1145 1146 if (mctx->state_log != NULL) 1147 { 1148 mctx->state_log[cur_str_idx] = cur_state; 1149 1150 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1151 later. E.g. Processing back references. */ 1152 if (BE (dfa->nbackref, 0)) 1153 { 1154 at_init_state = false; 1155 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1156 if (BE (err != REG_NOERROR, 0)) 1157 return err; 1158 1159 if (cur_state->has_backref) 1160 { 1161 err = transit_state_bkref (mctx, &cur_state->nodes); 1162 if (BE (err != REG_NOERROR, 0)) 1163 return err; 1164 } 1165 } 1166 } 1167 1168 /* If the RE accepts NULL string. */ 1169 if (BE (cur_state->halt, 0)) 1170 { 1171 if (!cur_state->has_constraint 1172 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1173 { 1174 if (!fl_longest_match) 1175 return cur_str_idx; 1176 else 1177 { 1178 match_last = cur_str_idx; 1179 match = 1; 1180 } 1181 } 1182 } 1183 1184 while (!re_string_eoi (&mctx->input)) 1185 { 1186 re_dfastate_t *old_state = cur_state; 1187 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1188 1189 if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1190 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1191 && mctx->input.valid_len < mctx->input.len)) 1192 { 1193 err = extend_buffers (mctx); 1194 if (BE (err != REG_NOERROR, 0)) 1195 { 1196 assert (err == REG_ESPACE); 1197 return REG_ERROR; 1198 } 1199 } 1200 1201 cur_state = transit_state (&err, mctx, cur_state); 1202 if (mctx->state_log != NULL) 1203 cur_state = merge_state_with_log (&err, mctx, cur_state); 1204 1205 if (cur_state == NULL) 1206 { 1207 /* Reached the invalid state or an error. Try to recover a valid 1208 state using the state log, if available and if we have not 1209 already found a valid (even if not the longest) match. */ 1210 if (BE (err != REG_NOERROR, 0)) 1211 return REG_ERROR; 1212 1213 if (mctx->state_log == NULL 1214 || (match && !fl_longest_match) 1215 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1216 break; 1217 } 1218 1219 if (BE (at_init_state, 0)) 1220 { 1221 if (old_state == cur_state) 1222 next_start_idx = next_char_idx; 1223 else 1224 at_init_state = false; 1225 } 1226 1227 if (cur_state->halt) 1228 { 1229 /* Reached a halt state. 1230 Check the halt state can satisfy the current context. */ 1231 if (!cur_state->has_constraint 1232 || check_halt_state_context (mctx, cur_state, 1233 re_string_cur_idx (&mctx->input))) 1234 { 1235 /* We found an appropriate halt state. */ 1236 match_last = re_string_cur_idx (&mctx->input); 1237 match = 1; 1238 1239 /* We found a match, do not modify match_first below. */ 1240 p_match_first = NULL; 1241 if (!fl_longest_match) 1242 break; 1243 } 1244 } 1245 } 1246 1247 if (p_match_first) 1248 *p_match_first += next_start_idx; 1249 1250 return match_last; 1251} 1252 1253/* Check NODE match the current context. */ 1254 1255static bool 1256internal_function 1257check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context) 1258{ 1259 re_token_type_t type = dfa->nodes[node].type; 1260 unsigned int constraint = dfa->nodes[node].constraint; 1261 if (type != END_OF_RE) 1262 return false; 1263 if (!constraint) 1264 return true; 1265 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1266 return false; 1267 return true; 1268} 1269 1270/* Check the halt state STATE match the current context. 1271 Return 0 if not match, if the node, STATE has, is a halt node and 1272 match the context, return the node. */ 1273 1274static Idx 1275internal_function 1276check_halt_state_context (const re_match_context_t *mctx, 1277 const re_dfastate_t *state, Idx idx) 1278{ 1279 Idx i; 1280 unsigned int context; 1281#ifdef DEBUG 1282 assert (state->halt); 1283#endif 1284 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1285 for (i = 0; i < state->nodes.nelem; ++i) 1286 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1287 return state->nodes.elems[i]; 1288 return 0; 1289} 1290 1291/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1292 corresponding to the DFA). 1293 Return the destination node, and update EPS_VIA_NODES; 1294 return REG_MISSING in case of errors. */ 1295 1296static Idx 1297internal_function 1298proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, 1299 Idx *pidx, Idx node, re_node_set *eps_via_nodes, 1300 struct re_fail_stack_t *fs) 1301{ 1302 const re_dfa_t *const dfa = mctx->dfa; 1303 Idx i; 1304 bool ok; 1305 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1306 { 1307 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1308 re_node_set *edests = &dfa->edests[node]; 1309 Idx dest_node; 1310 ok = re_node_set_insert (eps_via_nodes, node); 1311 if (BE (! ok, 0)) 1312 return REG_ERROR; 1313 /* Pick up a valid destination, or return REG_MISSING if none 1314 is found. */ 1315 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i) 1316 { 1317 Idx candidate = edests->elems[i]; 1318 if (!re_node_set_contains (cur_nodes, candidate)) 1319 continue; 1320 if (dest_node == REG_MISSING) 1321 dest_node = candidate; 1322 1323 else 1324 { 1325 /* In order to avoid infinite loop like "(a*)*", return the second 1326 epsilon-transition if the first was already considered. */ 1327 if (re_node_set_contains (eps_via_nodes, dest_node)) 1328 return candidate; 1329 1330 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1331 else if (fs != NULL 1332 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1333 eps_via_nodes)) 1334 return REG_ERROR; 1335 1336 /* We know we are going to exit. */ 1337 break; 1338 } 1339 } 1340 return dest_node; 1341 } 1342 else 1343 { 1344 Idx naccepted = 0; 1345 re_token_type_t type = dfa->nodes[node].type; 1346 1347#ifdef RE_ENABLE_I18N 1348 if (dfa->nodes[node].accept_mb) 1349 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1350 else 1351#endif /* RE_ENABLE_I18N */ 1352 if (type == OP_BACK_REF) 1353 { 1354 Idx subexp_idx = dfa->nodes[node].opr.idx + 1; 1355 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1356 if (fs != NULL) 1357 { 1358 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1359 return REG_MISSING; 1360 else if (naccepted) 1361 { 1362 char *buf = (char *) re_string_get_buffer (&mctx->input); 1363 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1364 naccepted) != 0) 1365 return REG_MISSING; 1366 } 1367 } 1368 1369 if (naccepted == 0) 1370 { 1371 Idx dest_node; 1372 ok = re_node_set_insert (eps_via_nodes, node); 1373 if (BE (! ok, 0)) 1374 return REG_ERROR; 1375 dest_node = dfa->edests[node].elems[0]; 1376 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1377 dest_node)) 1378 return dest_node; 1379 } 1380 } 1381 1382 if (naccepted != 0 1383 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1384 { 1385 Idx dest_node = dfa->nexts[node]; 1386 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1387 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1388 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1389 dest_node))) 1390 return REG_MISSING; 1391 re_node_set_empty (eps_via_nodes); 1392 return dest_node; 1393 } 1394 } 1395 return REG_MISSING; 1396} 1397 1398static reg_errcode_t 1399internal_function 1400push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, 1401 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1402{ 1403 reg_errcode_t err; 1404 Idx num = fs->num++; 1405 if (fs->num == fs->alloc) 1406 { 1407 struct re_fail_stack_ent_t *new_array; 1408 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1409 * fs->alloc * 2)); 1410 if (new_array == NULL) 1411 return REG_ESPACE; 1412 fs->alloc *= 2; 1413 fs->stack = new_array; 1414 } 1415 fs->stack[num].idx = str_idx; 1416 fs->stack[num].node = dest_node; 1417 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1418 if (fs->stack[num].regs == NULL) 1419 return REG_ESPACE; 1420 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1421 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1422 return err; 1423} 1424 1425static Idx 1426internal_function 1427pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs, 1428 regmatch_t *regs, re_node_set *eps_via_nodes) 1429{ 1430 Idx num = --fs->num; 1431 assert (REG_VALID_INDEX (num)); 1432 *pidx = fs->stack[num].idx; 1433 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1434 re_node_set_free (eps_via_nodes); 1435 re_free (fs->stack[num].regs); 1436 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1437 return fs->stack[num].node; 1438} 1439 1440/* Set the positions where the subexpressions are starts/ends to registers 1441 PMATCH. 1442 Note: We assume that pmatch[0] is already set, and 1443 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1444 1445static reg_errcode_t 1446internal_function 1447set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1448 regmatch_t *pmatch, bool fl_backtrack) 1449{ 1450 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 1451 Idx idx, cur_node; 1452 re_node_set eps_via_nodes; 1453 struct re_fail_stack_t *fs; 1454 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1455 regmatch_t *prev_idx_match; 1456 bool prev_idx_match_malloced = false; 1457 1458#ifdef DEBUG 1459 assert (nmatch > 1); 1460 assert (mctx->state_log != NULL); 1461#endif 1462 if (fl_backtrack) 1463 { 1464 fs = &fs_body; 1465 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1466 if (fs->stack == NULL) 1467 return REG_ESPACE; 1468 } 1469 else 1470 fs = NULL; 1471 1472 cur_node = dfa->init_node; 1473 re_node_set_init_empty (&eps_via_nodes); 1474 1475 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1476 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1477 else 1478 { 1479 prev_idx_match = re_malloc (regmatch_t, nmatch); 1480 if (prev_idx_match == NULL) 1481 { 1482 free_fail_stack_return (fs); 1483 return REG_ESPACE; 1484 } 1485 prev_idx_match_malloced = true; 1486 } 1487 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1488 1489 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1490 { 1491 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1492 1493 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1494 { 1495 Idx reg_idx; 1496 if (fs) 1497 { 1498 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1499 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1500 break; 1501 if (reg_idx == nmatch) 1502 { 1503 re_node_set_free (&eps_via_nodes); 1504 if (prev_idx_match_malloced) 1505 re_free (prev_idx_match); 1506 return free_fail_stack_return (fs); 1507 } 1508 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1509 &eps_via_nodes); 1510 } 1511 else 1512 { 1513 re_node_set_free (&eps_via_nodes); 1514 if (prev_idx_match_malloced) 1515 re_free (prev_idx_match); 1516 return REG_NOERROR; 1517 } 1518 } 1519 1520 /* Proceed to next node. */ 1521 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1522 &eps_via_nodes, fs); 1523 1524 if (BE (! REG_VALID_INDEX (cur_node), 0)) 1525 { 1526 if (BE (cur_node == REG_ERROR, 0)) 1527 { 1528 re_node_set_free (&eps_via_nodes); 1529 if (prev_idx_match_malloced) 1530 re_free (prev_idx_match); 1531 free_fail_stack_return (fs); 1532 return REG_ESPACE; 1533 } 1534 if (fs) 1535 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1536 &eps_via_nodes); 1537 else 1538 { 1539 re_node_set_free (&eps_via_nodes); 1540 if (prev_idx_match_malloced) 1541 re_free (prev_idx_match); 1542 return REG_NOMATCH; 1543 } 1544 } 1545 } 1546 re_node_set_free (&eps_via_nodes); 1547 if (prev_idx_match_malloced) 1548 re_free (prev_idx_match); 1549 return free_fail_stack_return (fs); 1550} 1551 1552static reg_errcode_t 1553internal_function 1554free_fail_stack_return (struct re_fail_stack_t *fs) 1555{ 1556 if (fs) 1557 { 1558 Idx fs_idx; 1559 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1560 { 1561 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1562 re_free (fs->stack[fs_idx].regs); 1563 } 1564 re_free (fs->stack); 1565 } 1566 return REG_NOERROR; 1567} 1568 1569static void 1570internal_function 1571update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1572 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch) 1573{ 1574 int type = dfa->nodes[cur_node].type; 1575 if (type == OP_OPEN_SUBEXP) 1576 { 1577 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1578 1579 /* We are at the first node of this sub expression. */ 1580 if (reg_num < nmatch) 1581 { 1582 pmatch[reg_num].rm_so = cur_idx; 1583 pmatch[reg_num].rm_eo = -1; 1584 } 1585 } 1586 else if (type == OP_CLOSE_SUBEXP) 1587 { 1588 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1589 if (reg_num < nmatch) 1590 { 1591 /* We are at the last node of this sub expression. */ 1592 if (pmatch[reg_num].rm_so < cur_idx) 1593 { 1594 pmatch[reg_num].rm_eo = cur_idx; 1595 /* This is a non-empty match or we are not inside an optional 1596 subexpression. Accept this right away. */ 1597 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1598 } 1599 else 1600 { 1601 if (dfa->nodes[cur_node].opt_subexp 1602 && prev_idx_match[reg_num].rm_so != -1) 1603 /* We transited through an empty match for an optional 1604 subexpression, like (a?)*, and this is not the subexp's 1605 first match. Copy back the old content of the registers 1606 so that matches of an inner subexpression are undone as 1607 well, like in ((a?))*. */ 1608 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1609 else 1610 /* We completed a subexpression, but it may be part of 1611 an optional one, so do not update PREV_IDX_MATCH. */ 1612 pmatch[reg_num].rm_eo = cur_idx; 1613 } 1614 } 1615 } 1616} 1617 1618/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1619 and sift the nodes in each states according to the following rules. 1620 Updated state_log will be wrote to STATE_LOG. 1621 1622 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1623 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1624 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1625 the LAST_NODE, we throw away the node `a'. 1626 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1627 string `s' and transit to `b': 1628 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1629 away the node `a'. 1630 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1631 thrown away, we throw away the node `a'. 1632 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1633 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1634 node `a'. 1635 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1636 we throw away the node `a'. */ 1637 1638#define STATE_NODE_CONTAINS(state,node) \ 1639 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1640 1641static reg_errcode_t 1642internal_function 1643sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1644{ 1645 reg_errcode_t err; 1646 int null_cnt = 0; 1647 Idx str_idx = sctx->last_str_idx; 1648 re_node_set cur_dest; 1649 1650#ifdef DEBUG 1651 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1652#endif 1653 1654 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1655 transit to the last_node and the last_node itself. */ 1656 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1657 if (BE (err != REG_NOERROR, 0)) 1658 return err; 1659 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1660 if (BE (err != REG_NOERROR, 0)) 1661 goto free_return; 1662 1663 /* Then check each states in the state_log. */ 1664 while (str_idx > 0) 1665 { 1666 /* Update counters. */ 1667 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1668 if (null_cnt > mctx->max_mb_elem_len) 1669 { 1670 memset (sctx->sifted_states, '\0', 1671 sizeof (re_dfastate_t *) * str_idx); 1672 re_node_set_free (&cur_dest); 1673 return REG_NOERROR; 1674 } 1675 re_node_set_empty (&cur_dest); 1676 --str_idx; 1677 1678 if (mctx->state_log[str_idx]) 1679 { 1680 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1681 if (BE (err != REG_NOERROR, 0)) 1682 goto free_return; 1683 } 1684 1685 /* Add all the nodes which satisfy the following conditions: 1686 - It can epsilon transit to a node in CUR_DEST. 1687 - It is in CUR_SRC. 1688 And update state_log. */ 1689 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1690 if (BE (err != REG_NOERROR, 0)) 1691 goto free_return; 1692 } 1693 err = REG_NOERROR; 1694 free_return: 1695 re_node_set_free (&cur_dest); 1696 return err; 1697} 1698 1699static reg_errcode_t 1700internal_function 1701build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1702 Idx str_idx, re_node_set *cur_dest) 1703{ 1704 const re_dfa_t *const dfa = mctx->dfa; 1705 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1706 Idx i; 1707 1708 /* Then build the next sifted state. 1709 We build the next sifted state on `cur_dest', and update 1710 `sifted_states[str_idx]' with `cur_dest'. 1711 Note: 1712 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1713 `cur_src' points the node_set of the old `state_log[str_idx]' 1714 (with the epsilon nodes pre-filtered out). */ 1715 for (i = 0; i < cur_src->nelem; i++) 1716 { 1717 Idx prev_node = cur_src->elems[i]; 1718 int naccepted = 0; 1719 bool ok; 1720 1721#ifdef DEBUG 1722 re_token_type_t type = dfa->nodes[prev_node].type; 1723 assert (!IS_EPSILON_NODE (type)); 1724#endif 1725#ifdef RE_ENABLE_I18N 1726 /* If the node may accept `multi byte'. */ 1727 if (dfa->nodes[prev_node].accept_mb) 1728 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1729 str_idx, sctx->last_str_idx); 1730#endif /* RE_ENABLE_I18N */ 1731 1732 /* We don't check backreferences here. 1733 See update_cur_sifted_state(). */ 1734 if (!naccepted 1735 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1736 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1737 dfa->nexts[prev_node])) 1738 naccepted = 1; 1739 1740 if (naccepted == 0) 1741 continue; 1742 1743 if (sctx->limits.nelem) 1744 { 1745 Idx to_idx = str_idx + naccepted; 1746 if (check_dst_limits (mctx, &sctx->limits, 1747 dfa->nexts[prev_node], to_idx, 1748 prev_node, str_idx)) 1749 continue; 1750 } 1751 ok = re_node_set_insert (cur_dest, prev_node); 1752 if (BE (! ok, 0)) 1753 return REG_ESPACE; 1754 } 1755 1756 return REG_NOERROR; 1757} 1758 1759/* Helper functions. */ 1760 1761static reg_errcode_t 1762internal_function 1763clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx) 1764{ 1765 Idx top = mctx->state_log_top; 1766 1767 if (next_state_log_idx >= mctx->input.bufs_len 1768 || (next_state_log_idx >= mctx->input.valid_len 1769 && mctx->input.valid_len < mctx->input.len)) 1770 { 1771 reg_errcode_t err; 1772 err = extend_buffers (mctx); 1773 if (BE (err != REG_NOERROR, 0)) 1774 return err; 1775 } 1776 1777 if (top < next_state_log_idx) 1778 { 1779 memset (mctx->state_log + top + 1, '\0', 1780 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1781 mctx->state_log_top = next_state_log_idx; 1782 } 1783 return REG_NOERROR; 1784} 1785 1786static reg_errcode_t 1787internal_function 1788merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1789 re_dfastate_t **src, Idx num) 1790{ 1791 Idx st_idx; 1792 reg_errcode_t err; 1793 for (st_idx = 0; st_idx < num; ++st_idx) 1794 { 1795 if (dst[st_idx] == NULL) 1796 dst[st_idx] = src[st_idx]; 1797 else if (src[st_idx] != NULL) 1798 { 1799 re_node_set merged_set; 1800 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1801 &src[st_idx]->nodes); 1802 if (BE (err != REG_NOERROR, 0)) 1803 return err; 1804 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1805 re_node_set_free (&merged_set); 1806 if (BE (err != REG_NOERROR, 0)) 1807 return err; 1808 } 1809 } 1810 return REG_NOERROR; 1811} 1812 1813static reg_errcode_t 1814internal_function 1815update_cur_sifted_state (const re_match_context_t *mctx, 1816 re_sift_context_t *sctx, Idx str_idx, 1817 re_node_set *dest_nodes) 1818{ 1819 const re_dfa_t *const dfa = mctx->dfa; 1820 reg_errcode_t err = REG_NOERROR; 1821 const re_node_set *candidates; 1822 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1823 : &mctx->state_log[str_idx]->nodes); 1824 1825 if (dest_nodes->nelem == 0) 1826 sctx->sifted_states[str_idx] = NULL; 1827 else 1828 { 1829 if (candidates) 1830 { 1831 /* At first, add the nodes which can epsilon transit to a node in 1832 DEST_NODE. */ 1833 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1834 if (BE (err != REG_NOERROR, 0)) 1835 return err; 1836 1837 /* Then, check the limitations in the current sift_context. */ 1838 if (sctx->limits.nelem) 1839 { 1840 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1841 mctx->bkref_ents, str_idx); 1842 if (BE (err != REG_NOERROR, 0)) 1843 return err; 1844 } 1845 } 1846 1847 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1848 if (BE (err != REG_NOERROR, 0)) 1849 return err; 1850 } 1851 1852 if (candidates && mctx->state_log[str_idx]->has_backref) 1853 { 1854 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1855 if (BE (err != REG_NOERROR, 0)) 1856 return err; 1857 } 1858 return REG_NOERROR; 1859} 1860 1861static reg_errcode_t 1862internal_function 1863add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1864 const re_node_set *candidates) 1865{ 1866 reg_errcode_t err = REG_NOERROR; 1867 Idx i; 1868 1869 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1870 if (BE (err != REG_NOERROR, 0)) 1871 return err; 1872 1873 if (!state->inveclosure.alloc) 1874 { 1875 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1876 if (BE (err != REG_NOERROR, 0)) 1877 return REG_ESPACE; 1878 for (i = 0; i < dest_nodes->nelem; i++) 1879 re_node_set_merge (&state->inveclosure, 1880 dfa->inveclosures + dest_nodes->elems[i]); 1881 } 1882 return re_node_set_add_intersect (dest_nodes, candidates, 1883 &state->inveclosure); 1884} 1885 1886static reg_errcode_t 1887internal_function 1888sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes, 1889 const re_node_set *candidates) 1890{ 1891 Idx ecl_idx; 1892 reg_errcode_t err; 1893 re_node_set *inv_eclosure = dfa->inveclosures + node; 1894 re_node_set except_nodes; 1895 re_node_set_init_empty (&except_nodes); 1896 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1897 { 1898 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1899 if (cur_node == node) 1900 continue; 1901 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1902 { 1903 Idx edst1 = dfa->edests[cur_node].elems[0]; 1904 Idx edst2 = ((dfa->edests[cur_node].nelem > 1) 1905 ? dfa->edests[cur_node].elems[1] : REG_MISSING); 1906 if ((!re_node_set_contains (inv_eclosure, edst1) 1907 && re_node_set_contains (dest_nodes, edst1)) 1908 || (REG_VALID_NONZERO_INDEX (edst2) 1909 && !re_node_set_contains (inv_eclosure, edst2) 1910 && re_node_set_contains (dest_nodes, edst2))) 1911 { 1912 err = re_node_set_add_intersect (&except_nodes, candidates, 1913 dfa->inveclosures + cur_node); 1914 if (BE (err != REG_NOERROR, 0)) 1915 { 1916 re_node_set_free (&except_nodes); 1917 return err; 1918 } 1919 } 1920 } 1921 } 1922 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1923 { 1924 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1925 if (!re_node_set_contains (&except_nodes, cur_node)) 1926 { 1927 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1928 re_node_set_remove_at (dest_nodes, idx); 1929 } 1930 } 1931 re_node_set_free (&except_nodes); 1932 return REG_NOERROR; 1933} 1934 1935static bool 1936internal_function 1937check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits, 1938 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx) 1939{ 1940 const re_dfa_t *const dfa = mctx->dfa; 1941 Idx lim_idx, src_pos, dst_pos; 1942 1943 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1944 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1945 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1946 { 1947 Idx subexp_idx; 1948 struct re_backref_cache_entry *ent; 1949 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1950 subexp_idx = dfa->nodes[ent->node].opr.idx; 1951 1952 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1953 subexp_idx, dst_node, dst_idx, 1954 dst_bkref_idx); 1955 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1956 subexp_idx, src_node, src_idx, 1957 src_bkref_idx); 1958 1959 /* In case of: 1960 <src> <dst> ( <subexp> ) 1961 ( <subexp> ) <src> <dst> 1962 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1963 if (src_pos == dst_pos) 1964 continue; /* This is unrelated limitation. */ 1965 else 1966 return true; 1967 } 1968 return false; 1969} 1970 1971static int 1972internal_function 1973check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1974 Idx subexp_idx, Idx from_node, Idx bkref_idx) 1975{ 1976 const re_dfa_t *const dfa = mctx->dfa; 1977 const re_node_set *eclosures = dfa->eclosures + from_node; 1978 Idx node_idx; 1979 1980 /* Else, we are on the boundary: examine the nodes on the epsilon 1981 closure. */ 1982 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1983 { 1984 Idx node = eclosures->elems[node_idx]; 1985 switch (dfa->nodes[node].type) 1986 { 1987 case OP_BACK_REF: 1988 if (bkref_idx != REG_MISSING) 1989 { 1990 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1991 do 1992 { 1993 Idx dst; 1994 int cpos; 1995 1996 if (ent->node != node) 1997 continue; 1998 1999 if (subexp_idx < BITSET_WORD_BITS 2000 && !(ent->eps_reachable_subexps_map 2001 & ((bitset_word_t) 1 << subexp_idx))) 2002 continue; 2003 2004 /* Recurse trying to reach the OP_OPEN_SUBEXP and 2005 OP_CLOSE_SUBEXP cases below. But, if the 2006 destination node is the same node as the source 2007 node, don't recurse because it would cause an 2008 infinite loop: a regex that exhibits this behavior 2009 is ()\1*\1* */ 2010 dst = dfa->edests[node].elems[0]; 2011 if (dst == from_node) 2012 { 2013 if (boundaries & 1) 2014 return -1; 2015 else /* if (boundaries & 2) */ 2016 return 0; 2017 } 2018 2019 cpos = 2020 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2021 dst, bkref_idx); 2022 if (cpos == -1 /* && (boundaries & 1) */) 2023 return -1; 2024 if (cpos == 0 && (boundaries & 2)) 2025 return 0; 2026 2027 if (subexp_idx < BITSET_WORD_BITS) 2028 ent->eps_reachable_subexps_map 2029 &= ~((bitset_word_t) 1 << subexp_idx); 2030 } 2031 while (ent++->more); 2032 } 2033 break; 2034 2035 case OP_OPEN_SUBEXP: 2036 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 2037 return -1; 2038 break; 2039 2040 case OP_CLOSE_SUBEXP: 2041 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 2042 return 0; 2043 break; 2044 2045 default: 2046 break; 2047 } 2048 } 2049 2050 return (boundaries & 2) ? 1 : 0; 2051} 2052 2053static int 2054internal_function 2055check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit, 2056 Idx subexp_idx, Idx from_node, Idx str_idx, 2057 Idx bkref_idx) 2058{ 2059 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2060 int boundaries; 2061 2062 /* If we are outside the range of the subexpression, return -1 or 1. */ 2063 if (str_idx < lim->subexp_from) 2064 return -1; 2065 2066 if (lim->subexp_to < str_idx) 2067 return 1; 2068 2069 /* If we are within the subexpression, return 0. */ 2070 boundaries = (str_idx == lim->subexp_from); 2071 boundaries |= (str_idx == lim->subexp_to) << 1; 2072 if (boundaries == 0) 2073 return 0; 2074 2075 /* Else, examine epsilon closure. */ 2076 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2077 from_node, bkref_idx); 2078} 2079 2080/* Check the limitations of sub expressions LIMITS, and remove the nodes 2081 which are against limitations from DEST_NODES. */ 2082 2083static reg_errcode_t 2084internal_function 2085check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2086 const re_node_set *candidates, re_node_set *limits, 2087 struct re_backref_cache_entry *bkref_ents, Idx str_idx) 2088{ 2089 reg_errcode_t err; 2090 Idx node_idx, lim_idx; 2091 2092 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2093 { 2094 Idx subexp_idx; 2095 struct re_backref_cache_entry *ent; 2096 ent = bkref_ents + limits->elems[lim_idx]; 2097 2098 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2099 continue; /* This is unrelated limitation. */ 2100 2101 subexp_idx = dfa->nodes[ent->node].opr.idx; 2102 if (ent->subexp_to == str_idx) 2103 { 2104 Idx ops_node = REG_MISSING; 2105 Idx cls_node = REG_MISSING; 2106 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2107 { 2108 Idx node = dest_nodes->elems[node_idx]; 2109 re_token_type_t type = dfa->nodes[node].type; 2110 if (type == OP_OPEN_SUBEXP 2111 && subexp_idx == dfa->nodes[node].opr.idx) 2112 ops_node = node; 2113 else if (type == OP_CLOSE_SUBEXP 2114 && subexp_idx == dfa->nodes[node].opr.idx) 2115 cls_node = node; 2116 } 2117 2118 /* Check the limitation of the open subexpression. */ 2119 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2120 if (REG_VALID_INDEX (ops_node)) 2121 { 2122 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2123 candidates); 2124 if (BE (err != REG_NOERROR, 0)) 2125 return err; 2126 } 2127 2128 /* Check the limitation of the close subexpression. */ 2129 if (REG_VALID_INDEX (cls_node)) 2130 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2131 { 2132 Idx node = dest_nodes->elems[node_idx]; 2133 if (!re_node_set_contains (dfa->inveclosures + node, 2134 cls_node) 2135 && !re_node_set_contains (dfa->eclosures + node, 2136 cls_node)) 2137 { 2138 /* It is against this limitation. 2139 Remove it form the current sifted state. */ 2140 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2141 candidates); 2142 if (BE (err != REG_NOERROR, 0)) 2143 return err; 2144 --node_idx; 2145 } 2146 } 2147 } 2148 else /* (ent->subexp_to != str_idx) */ 2149 { 2150 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2151 { 2152 Idx node = dest_nodes->elems[node_idx]; 2153 re_token_type_t type = dfa->nodes[node].type; 2154 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2155 { 2156 if (subexp_idx != dfa->nodes[node].opr.idx) 2157 continue; 2158 /* It is against this limitation. 2159 Remove it form the current sifted state. */ 2160 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2161 candidates); 2162 if (BE (err != REG_NOERROR, 0)) 2163 return err; 2164 } 2165 } 2166 } 2167 } 2168 return REG_NOERROR; 2169} 2170 2171static reg_errcode_t 2172internal_function 2173sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2174 Idx str_idx, const re_node_set *candidates) 2175{ 2176 const re_dfa_t *const dfa = mctx->dfa; 2177 reg_errcode_t err; 2178 Idx node_idx, node; 2179 re_sift_context_t local_sctx; 2180 Idx first_idx = search_cur_bkref_entry (mctx, str_idx); 2181 2182 if (first_idx == REG_MISSING) 2183 return REG_NOERROR; 2184 2185 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2186 2187 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2188 { 2189 Idx enabled_idx; 2190 re_token_type_t type; 2191 struct re_backref_cache_entry *entry; 2192 node = candidates->elems[node_idx]; 2193 type = dfa->nodes[node].type; 2194 /* Avoid infinite loop for the REs like "()\1+". */ 2195 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2196 continue; 2197 if (type != OP_BACK_REF) 2198 continue; 2199 2200 entry = mctx->bkref_ents + first_idx; 2201 enabled_idx = first_idx; 2202 do 2203 { 2204 Idx subexp_len; 2205 Idx to_idx; 2206 Idx dst_node; 2207 bool ok; 2208 re_dfastate_t *cur_state; 2209 2210 if (entry->node != node) 2211 continue; 2212 subexp_len = entry->subexp_to - entry->subexp_from; 2213 to_idx = str_idx + subexp_len; 2214 dst_node = (subexp_len ? dfa->nexts[node] 2215 : dfa->edests[node].elems[0]); 2216 2217 if (to_idx > sctx->last_str_idx 2218 || sctx->sifted_states[to_idx] == NULL 2219 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2220 || check_dst_limits (mctx, &sctx->limits, node, 2221 str_idx, dst_node, to_idx)) 2222 continue; 2223 2224 if (local_sctx.sifted_states == NULL) 2225 { 2226 local_sctx = *sctx; 2227 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2228 if (BE (err != REG_NOERROR, 0)) 2229 goto free_return; 2230 } 2231 local_sctx.last_node = node; 2232 local_sctx.last_str_idx = str_idx; 2233 ok = re_node_set_insert (&local_sctx.limits, enabled_idx); 2234 if (BE (! ok, 0)) 2235 { 2236 err = REG_ESPACE; 2237 goto free_return; 2238 } 2239 cur_state = local_sctx.sifted_states[str_idx]; 2240 err = sift_states_backward (mctx, &local_sctx); 2241 if (BE (err != REG_NOERROR, 0)) 2242 goto free_return; 2243 if (sctx->limited_states != NULL) 2244 { 2245 err = merge_state_array (dfa, sctx->limited_states, 2246 local_sctx.sifted_states, 2247 str_idx + 1); 2248 if (BE (err != REG_NOERROR, 0)) 2249 goto free_return; 2250 } 2251 local_sctx.sifted_states[str_idx] = cur_state; 2252 re_node_set_remove (&local_sctx.limits, enabled_idx); 2253 2254 /* mctx->bkref_ents may have changed, reload the pointer. */ 2255 entry = mctx->bkref_ents + enabled_idx; 2256 } 2257 while (enabled_idx++, entry++->more); 2258 } 2259 err = REG_NOERROR; 2260 free_return: 2261 if (local_sctx.sifted_states != NULL) 2262 { 2263 re_node_set_free (&local_sctx.limits); 2264 } 2265 2266 return err; 2267} 2268 2269 2270#ifdef RE_ENABLE_I18N 2271static int 2272internal_function 2273sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2274 Idx node_idx, Idx str_idx, Idx max_str_idx) 2275{ 2276 const re_dfa_t *const dfa = mctx->dfa; 2277 int naccepted; 2278 /* Check the node can accept `multi byte'. */ 2279 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2280 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2281 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2282 dfa->nexts[node_idx])) 2283 /* The node can't accept the `multi byte', or the 2284 destination was already thrown away, then the node 2285 could't accept the current input `multi byte'. */ 2286 naccepted = 0; 2287 /* Otherwise, it is sure that the node could accept 2288 `naccepted' bytes input. */ 2289 return naccepted; 2290} 2291#endif /* RE_ENABLE_I18N */ 2292 2293 2294/* Functions for state transition. */ 2295 2296/* Return the next state to which the current state STATE will transit by 2297 accepting the current input byte, and update STATE_LOG if necessary. 2298 If STATE can accept a multibyte char/collating element/back reference 2299 update the destination of STATE_LOG. */ 2300 2301static re_dfastate_t * 2302internal_function 2303transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2304 re_dfastate_t *state) 2305{ 2306 re_dfastate_t **trtable; 2307 unsigned char ch; 2308 2309#ifdef RE_ENABLE_I18N 2310 /* If the current state can accept multibyte. */ 2311 if (BE (state->accept_mb, 0)) 2312 { 2313 *err = transit_state_mb (mctx, state); 2314 if (BE (*err != REG_NOERROR, 0)) 2315 return NULL; 2316 } 2317#endif /* RE_ENABLE_I18N */ 2318 2319 /* Then decide the next state with the single byte. */ 2320#if 0 2321 if (0) 2322 /* don't use transition table */ 2323 return transit_state_sb (err, mctx, state); 2324#endif 2325 2326 /* Use transition table */ 2327 ch = re_string_fetch_byte (&mctx->input); 2328 for (;;) 2329 { 2330 trtable = state->trtable; 2331 if (BE (trtable != NULL, 1)) 2332 return trtable[ch]; 2333 2334 trtable = state->word_trtable; 2335 if (BE (trtable != NULL, 1)) 2336 { 2337 unsigned int context; 2338 context 2339 = re_string_context_at (&mctx->input, 2340 re_string_cur_idx (&mctx->input) - 1, 2341 mctx->eflags); 2342 if (IS_WORD_CONTEXT (context)) 2343 return trtable[ch + SBC_MAX]; 2344 else 2345 return trtable[ch]; 2346 } 2347 2348 if (!build_trtable (mctx->dfa, state)) 2349 { 2350 *err = REG_ESPACE; 2351 return NULL; 2352 } 2353 2354 /* Retry, we now have a transition table. */ 2355 } 2356} 2357 2358/* Update the state_log if we need */ 2359static re_dfastate_t * 2360internal_function 2361merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2362 re_dfastate_t *next_state) 2363{ 2364 const re_dfa_t *const dfa = mctx->dfa; 2365 Idx cur_idx = re_string_cur_idx (&mctx->input); 2366 2367 if (cur_idx > mctx->state_log_top) 2368 { 2369 mctx->state_log[cur_idx] = next_state; 2370 mctx->state_log_top = cur_idx; 2371 } 2372 else if (mctx->state_log[cur_idx] == 0) 2373 { 2374 mctx->state_log[cur_idx] = next_state; 2375 } 2376 else 2377 { 2378 re_dfastate_t *pstate; 2379 unsigned int context; 2380 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2381 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2382 the destination of a multibyte char/collating element/ 2383 back reference. Then the next state is the union set of 2384 these destinations and the results of the transition table. */ 2385 pstate = mctx->state_log[cur_idx]; 2386 log_nodes = pstate->entrance_nodes; 2387 if (next_state != NULL) 2388 { 2389 table_nodes = next_state->entrance_nodes; 2390 *err = re_node_set_init_union (&next_nodes, table_nodes, 2391 log_nodes); 2392 if (BE (*err != REG_NOERROR, 0)) 2393 return NULL; 2394 } 2395 else 2396 next_nodes = *log_nodes; 2397 /* Note: We already add the nodes of the initial state, 2398 then we don't need to add them here. */ 2399 2400 context = re_string_context_at (&mctx->input, 2401 re_string_cur_idx (&mctx->input) - 1, 2402 mctx->eflags); 2403 next_state = mctx->state_log[cur_idx] 2404 = re_acquire_state_context (err, dfa, &next_nodes, context); 2405 /* We don't need to check errors here, since the return value of 2406 this function is next_state and ERR is already set. */ 2407 2408 if (table_nodes != NULL) 2409 re_node_set_free (&next_nodes); 2410 } 2411 2412 if (BE (dfa->nbackref, 0) && next_state != NULL) 2413 { 2414 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2415 later. We must check them here, since the back references in the 2416 next state might use them. */ 2417 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2418 cur_idx); 2419 if (BE (*err != REG_NOERROR, 0)) 2420 return NULL; 2421 2422 /* If the next state has back references. */ 2423 if (next_state->has_backref) 2424 { 2425 *err = transit_state_bkref (mctx, &next_state->nodes); 2426 if (BE (*err != REG_NOERROR, 0)) 2427 return NULL; 2428 next_state = mctx->state_log[cur_idx]; 2429 } 2430 } 2431 2432 return next_state; 2433} 2434 2435/* Skip bytes in the input that correspond to part of a 2436 multi-byte match, then look in the log for a state 2437 from which to restart matching. */ 2438static re_dfastate_t * 2439internal_function 2440find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2441{ 2442 re_dfastate_t *cur_state; 2443 do 2444 { 2445 Idx max = mctx->state_log_top; 2446 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2447 2448 do 2449 { 2450 if (++cur_str_idx > max) 2451 return NULL; 2452 re_string_skip_bytes (&mctx->input, 1); 2453 } 2454 while (mctx->state_log[cur_str_idx] == NULL); 2455 2456 cur_state = merge_state_with_log (err, mctx, NULL); 2457 } 2458 while (*err == REG_NOERROR && cur_state == NULL); 2459 return cur_state; 2460} 2461 2462/* Helper functions for transit_state. */ 2463 2464/* From the node set CUR_NODES, pick up the nodes whose types are 2465 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2466 expression. And register them to use them later for evaluating the 2467 correspoding back references. */ 2468 2469static reg_errcode_t 2470internal_function 2471check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2472 Idx str_idx) 2473{ 2474 const re_dfa_t *const dfa = mctx->dfa; 2475 Idx node_idx; 2476 reg_errcode_t err; 2477 2478 /* TODO: This isn't efficient. 2479 Because there might be more than one nodes whose types are 2480 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2481 nodes. 2482 E.g. RE: (a){2} */ 2483 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2484 { 2485 Idx node = cur_nodes->elems[node_idx]; 2486 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2487 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2488 && (dfa->used_bkref_map 2489 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2490 { 2491 err = match_ctx_add_subtop (mctx, node, str_idx); 2492 if (BE (err != REG_NOERROR, 0)) 2493 return err; 2494 } 2495 } 2496 return REG_NOERROR; 2497} 2498 2499#if 0 2500/* Return the next state to which the current state STATE will transit by 2501 accepting the current input byte. */ 2502 2503static re_dfastate_t * 2504transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2505 re_dfastate_t *state) 2506{ 2507 const re_dfa_t *const dfa = mctx->dfa; 2508 re_node_set next_nodes; 2509 re_dfastate_t *next_state; 2510 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2511 unsigned int context; 2512 2513 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2514 if (BE (*err != REG_NOERROR, 0)) 2515 return NULL; 2516 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2517 { 2518 Idx cur_node = state->nodes.elems[node_cnt]; 2519 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2520 { 2521 *err = re_node_set_merge (&next_nodes, 2522 dfa->eclosures + dfa->nexts[cur_node]); 2523 if (BE (*err != REG_NOERROR, 0)) 2524 { 2525 re_node_set_free (&next_nodes); 2526 return NULL; 2527 } 2528 } 2529 } 2530 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2531 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2532 /* We don't need to check errors here, since the return value of 2533 this function is next_state and ERR is already set. */ 2534 2535 re_node_set_free (&next_nodes); 2536 re_string_skip_bytes (&mctx->input, 1); 2537 return next_state; 2538} 2539#endif 2540 2541#ifdef RE_ENABLE_I18N 2542static reg_errcode_t 2543internal_function 2544transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2545{ 2546 const re_dfa_t *const dfa = mctx->dfa; 2547 reg_errcode_t err; 2548 Idx i; 2549 2550 for (i = 0; i < pstate->nodes.nelem; ++i) 2551 { 2552 re_node_set dest_nodes, *new_nodes; 2553 Idx cur_node_idx = pstate->nodes.elems[i]; 2554 int naccepted; 2555 Idx dest_idx; 2556 unsigned int context; 2557 re_dfastate_t *dest_state; 2558 2559 if (!dfa->nodes[cur_node_idx].accept_mb) 2560 continue; 2561 2562 if (dfa->nodes[cur_node_idx].constraint) 2563 { 2564 context = re_string_context_at (&mctx->input, 2565 re_string_cur_idx (&mctx->input), 2566 mctx->eflags); 2567 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2568 context)) 2569 continue; 2570 } 2571 2572 /* How many bytes the node can accept? */ 2573 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2574 re_string_cur_idx (&mctx->input)); 2575 if (naccepted == 0) 2576 continue; 2577 2578 /* The node can accepts `naccepted' bytes. */ 2579 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2580 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2581 : mctx->max_mb_elem_len); 2582 err = clean_state_log_if_needed (mctx, dest_idx); 2583 if (BE (err != REG_NOERROR, 0)) 2584 return err; 2585#ifdef DEBUG 2586 assert (dfa->nexts[cur_node_idx] != REG_MISSING); 2587#endif 2588 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2589 2590 dest_state = mctx->state_log[dest_idx]; 2591 if (dest_state == NULL) 2592 dest_nodes = *new_nodes; 2593 else 2594 { 2595 err = re_node_set_init_union (&dest_nodes, 2596 dest_state->entrance_nodes, new_nodes); 2597 if (BE (err != REG_NOERROR, 0)) 2598 return err; 2599 } 2600 context = re_string_context_at (&mctx->input, dest_idx - 1, 2601 mctx->eflags); 2602 mctx->state_log[dest_idx] 2603 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2604 if (dest_state != NULL) 2605 re_node_set_free (&dest_nodes); 2606 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2607 return err; 2608 } 2609 return REG_NOERROR; 2610} 2611#endif /* RE_ENABLE_I18N */ 2612 2613static reg_errcode_t 2614internal_function 2615transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2616{ 2617 const re_dfa_t *const dfa = mctx->dfa; 2618 reg_errcode_t err; 2619 Idx i; 2620 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2621 2622 for (i = 0; i < nodes->nelem; ++i) 2623 { 2624 Idx dest_str_idx, prev_nelem, bkc_idx; 2625 Idx node_idx = nodes->elems[i]; 2626 unsigned int context; 2627 const re_token_t *node = dfa->nodes + node_idx; 2628 re_node_set *new_dest_nodes; 2629 2630 /* Check whether `node' is a backreference or not. */ 2631 if (node->type != OP_BACK_REF) 2632 continue; 2633 2634 if (node->constraint) 2635 { 2636 context = re_string_context_at (&mctx->input, cur_str_idx, 2637 mctx->eflags); 2638 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2639 continue; 2640 } 2641 2642 /* `node' is a backreference. 2643 Check the substring which the substring matched. */ 2644 bkc_idx = mctx->nbkref_ents; 2645 err = get_subexp (mctx, node_idx, cur_str_idx); 2646 if (BE (err != REG_NOERROR, 0)) 2647 goto free_return; 2648 2649 /* And add the epsilon closures (which is `new_dest_nodes') of 2650 the backreference to appropriate state_log. */ 2651#ifdef DEBUG 2652 assert (dfa->nexts[node_idx] != REG_MISSING); 2653#endif 2654 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2655 { 2656 Idx subexp_len; 2657 re_dfastate_t *dest_state; 2658 struct re_backref_cache_entry *bkref_ent; 2659 bkref_ent = mctx->bkref_ents + bkc_idx; 2660 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2661 continue; 2662 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2663 new_dest_nodes = (subexp_len == 0 2664 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2665 : dfa->eclosures + dfa->nexts[node_idx]); 2666 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2667 - bkref_ent->subexp_from); 2668 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2669 mctx->eflags); 2670 dest_state = mctx->state_log[dest_str_idx]; 2671 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2672 : mctx->state_log[cur_str_idx]->nodes.nelem); 2673 /* Add `new_dest_node' to state_log. */ 2674 if (dest_state == NULL) 2675 { 2676 mctx->state_log[dest_str_idx] 2677 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2678 context); 2679 if (BE (mctx->state_log[dest_str_idx] == NULL 2680 && err != REG_NOERROR, 0)) 2681 goto free_return; 2682 } 2683 else 2684 { 2685 re_node_set dest_nodes; 2686 err = re_node_set_init_union (&dest_nodes, 2687 dest_state->entrance_nodes, 2688 new_dest_nodes); 2689 if (BE (err != REG_NOERROR, 0)) 2690 { 2691 re_node_set_free (&dest_nodes); 2692 goto free_return; 2693 } 2694 mctx->state_log[dest_str_idx] 2695 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2696 re_node_set_free (&dest_nodes); 2697 if (BE (mctx->state_log[dest_str_idx] == NULL 2698 && err != REG_NOERROR, 0)) 2699 goto free_return; 2700 } 2701 /* We need to check recursively if the backreference can epsilon 2702 transit. */ 2703 if (subexp_len == 0 2704 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2705 { 2706 err = check_subexp_matching_top (mctx, new_dest_nodes, 2707 cur_str_idx); 2708 if (BE (err != REG_NOERROR, 0)) 2709 goto free_return; 2710 err = transit_state_bkref (mctx, new_dest_nodes); 2711 if (BE (err != REG_NOERROR, 0)) 2712 goto free_return; 2713 } 2714 } 2715 } 2716 err = REG_NOERROR; 2717 free_return: 2718 return err; 2719} 2720 2721/* Enumerate all the candidates which the backreference BKREF_NODE can match 2722 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2723 Note that we might collect inappropriate candidates here. 2724 However, the cost of checking them strictly here is too high, then we 2725 delay these checking for prune_impossible_nodes(). */ 2726 2727static reg_errcode_t 2728internal_function 2729get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx) 2730{ 2731 const re_dfa_t *const dfa = mctx->dfa; 2732 Idx subexp_num, sub_top_idx; 2733 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2734 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2735 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2736 if (cache_idx != REG_MISSING) 2737 { 2738 const struct re_backref_cache_entry *entry 2739 = mctx->bkref_ents + cache_idx; 2740 do 2741 if (entry->node == bkref_node) 2742 return REG_NOERROR; /* We already checked it. */ 2743 while (entry++->more); 2744 } 2745 2746 subexp_num = dfa->nodes[bkref_node].opr.idx; 2747 2748 /* For each sub expression */ 2749 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2750 { 2751 reg_errcode_t err; 2752 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2753 re_sub_match_last_t *sub_last; 2754 Idx sub_last_idx, sl_str, bkref_str_off; 2755 2756 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2757 continue; /* It isn't related. */ 2758 2759 sl_str = sub_top->str_idx; 2760 bkref_str_off = bkref_str_idx; 2761 /* At first, check the last node of sub expressions we already 2762 evaluated. */ 2763 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2764 { 2765 regoff_t sl_str_diff; 2766 sub_last = sub_top->lasts[sub_last_idx]; 2767 sl_str_diff = sub_last->str_idx - sl_str; 2768 /* The matched string by the sub expression match with the substring 2769 at the back reference? */ 2770 if (sl_str_diff > 0) 2771 { 2772 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2773 { 2774 /* Not enough chars for a successful match. */ 2775 if (bkref_str_off + sl_str_diff > mctx->input.len) 2776 break; 2777 2778 err = clean_state_log_if_needed (mctx, 2779 bkref_str_off 2780 + sl_str_diff); 2781 if (BE (err != REG_NOERROR, 0)) 2782 return err; 2783 buf = (const char *) re_string_get_buffer (&mctx->input); 2784 } 2785 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2786 /* We don't need to search this sub expression any more. */ 2787 break; 2788 } 2789 bkref_str_off += sl_str_diff; 2790 sl_str += sl_str_diff; 2791 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2792 bkref_str_idx); 2793 2794 /* Reload buf, since the preceding call might have reallocated 2795 the buffer. */ 2796 buf = (const char *) re_string_get_buffer (&mctx->input); 2797 2798 if (err == REG_NOMATCH) 2799 continue; 2800 if (BE (err != REG_NOERROR, 0)) 2801 return err; 2802 } 2803 2804 if (sub_last_idx < sub_top->nlasts) 2805 continue; 2806 if (sub_last_idx > 0) 2807 ++sl_str; 2808 /* Then, search for the other last nodes of the sub expression. */ 2809 for (; sl_str <= bkref_str_idx; ++sl_str) 2810 { 2811 Idx cls_node; 2812 regoff_t sl_str_off; 2813 const re_node_set *nodes; 2814 sl_str_off = sl_str - sub_top->str_idx; 2815 /* The matched string by the sub expression match with the substring 2816 at the back reference? */ 2817 if (sl_str_off > 0) 2818 { 2819 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2820 { 2821 /* If we are at the end of the input, we cannot match. */ 2822 if (bkref_str_off >= mctx->input.len) 2823 break; 2824 2825 err = extend_buffers (mctx); 2826 if (BE (err != REG_NOERROR, 0)) 2827 return err; 2828 2829 buf = (const char *) re_string_get_buffer (&mctx->input); 2830 } 2831 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2832 break; /* We don't need to search this sub expression 2833 any more. */ 2834 } 2835 if (mctx->state_log[sl_str] == NULL) 2836 continue; 2837 /* Does this state have a ')' of the sub expression? */ 2838 nodes = &mctx->state_log[sl_str]->nodes; 2839 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2840 OP_CLOSE_SUBEXP); 2841 if (cls_node == REG_MISSING) 2842 continue; /* No. */ 2843 if (sub_top->path == NULL) 2844 { 2845 sub_top->path = calloc (sizeof (state_array_t), 2846 sl_str - sub_top->str_idx + 1); 2847 if (sub_top->path == NULL) 2848 return REG_ESPACE; 2849 } 2850 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2851 in the current context? */ 2852 err = check_arrival (mctx, sub_top->path, sub_top->node, 2853 sub_top->str_idx, cls_node, sl_str, 2854 OP_CLOSE_SUBEXP); 2855 if (err == REG_NOMATCH) 2856 continue; 2857 if (BE (err != REG_NOERROR, 0)) 2858 return err; 2859 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2860 if (BE (sub_last == NULL, 0)) 2861 return REG_ESPACE; 2862 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2863 bkref_str_idx); 2864 if (err == REG_NOMATCH) 2865 continue; 2866 } 2867 } 2868 return REG_NOERROR; 2869} 2870 2871/* Helper functions for get_subexp(). */ 2872 2873/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2874 If it can arrive, register the sub expression expressed with SUB_TOP 2875 and SUB_LAST. */ 2876 2877static reg_errcode_t 2878internal_function 2879get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2880 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str) 2881{ 2882 reg_errcode_t err; 2883 Idx to_idx; 2884 /* Can the subexpression arrive the back reference? */ 2885 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2886 sub_last->str_idx, bkref_node, bkref_str, 2887 OP_OPEN_SUBEXP); 2888 if (err != REG_NOERROR) 2889 return err; 2890 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2891 sub_last->str_idx); 2892 if (BE (err != REG_NOERROR, 0)) 2893 return err; 2894 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2895 return clean_state_log_if_needed (mctx, to_idx); 2896} 2897 2898/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2899 Search '(' if FL_OPEN, or search ')' otherwise. 2900 TODO: This function isn't efficient... 2901 Because there might be more than one nodes whose types are 2902 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2903 nodes. 2904 E.g. RE: (a){2} */ 2905 2906static Idx 2907internal_function 2908find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2909 Idx subexp_idx, int type) 2910{ 2911 Idx cls_idx; 2912 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2913 { 2914 Idx cls_node = nodes->elems[cls_idx]; 2915 const re_token_t *node = dfa->nodes + cls_node; 2916 if (node->type == type 2917 && node->opr.idx == subexp_idx) 2918 return cls_node; 2919 } 2920 return REG_MISSING; 2921} 2922 2923/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2924 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2925 heavily reused. 2926 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2927 2928static reg_errcode_t 2929internal_function 2930check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node, 2931 Idx top_str, Idx last_node, Idx last_str, int type) 2932{ 2933 const re_dfa_t *const dfa = mctx->dfa; 2934 reg_errcode_t err = REG_NOERROR; 2935 Idx subexp_num, backup_cur_idx, str_idx, null_cnt; 2936 re_dfastate_t *cur_state = NULL; 2937 re_node_set *cur_nodes, next_nodes; 2938 re_dfastate_t **backup_state_log; 2939 unsigned int context; 2940 2941 subexp_num = dfa->nodes[top_node].opr.idx; 2942 /* Extend the buffer if we need. */ 2943 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2944 { 2945 re_dfastate_t **new_array; 2946 Idx old_alloc = path->alloc; 2947 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1; 2948 if (BE (new_alloc < old_alloc, 0) 2949 || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0)) 2950 return REG_ESPACE; 2951 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc); 2952 if (BE (new_array == NULL, 0)) 2953 return REG_ESPACE; 2954 path->array = new_array; 2955 path->alloc = new_alloc; 2956 memset (new_array + old_alloc, '\0', 2957 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2958 } 2959 2960 str_idx = path->next_idx ? path->next_idx : top_str; 2961 2962 /* Temporary modify MCTX. */ 2963 backup_state_log = mctx->state_log; 2964 backup_cur_idx = mctx->input.cur_idx; 2965 mctx->state_log = path->array; 2966 mctx->input.cur_idx = str_idx; 2967 2968 /* Setup initial node set. */ 2969 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2970 if (str_idx == top_str) 2971 { 2972 err = re_node_set_init_1 (&next_nodes, top_node); 2973 if (BE (err != REG_NOERROR, 0)) 2974 return err; 2975 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2976 if (BE (err != REG_NOERROR, 0)) 2977 { 2978 re_node_set_free (&next_nodes); 2979 return err; 2980 } 2981 } 2982 else 2983 { 2984 cur_state = mctx->state_log[str_idx]; 2985 if (cur_state && cur_state->has_backref) 2986 { 2987 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2988 if (BE (err != REG_NOERROR, 0)) 2989 return err; 2990 } 2991 else 2992 re_node_set_init_empty (&next_nodes); 2993 } 2994 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2995 { 2996 if (next_nodes.nelem) 2997 { 2998 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2999 subexp_num, type); 3000 if (BE (err != REG_NOERROR, 0)) 3001 { 3002 re_node_set_free (&next_nodes); 3003 return err; 3004 } 3005 } 3006 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3007 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3008 { 3009 re_node_set_free (&next_nodes); 3010 return err; 3011 } 3012 mctx->state_log[str_idx] = cur_state; 3013 } 3014 3015 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 3016 { 3017 re_node_set_empty (&next_nodes); 3018 if (mctx->state_log[str_idx + 1]) 3019 { 3020 err = re_node_set_merge (&next_nodes, 3021 &mctx->state_log[str_idx + 1]->nodes); 3022 if (BE (err != REG_NOERROR, 0)) 3023 { 3024 re_node_set_free (&next_nodes); 3025 return err; 3026 } 3027 } 3028 if (cur_state) 3029 { 3030 err = check_arrival_add_next_nodes (mctx, str_idx, 3031 &cur_state->non_eps_nodes, 3032 &next_nodes); 3033 if (BE (err != REG_NOERROR, 0)) 3034 { 3035 re_node_set_free (&next_nodes); 3036 return err; 3037 } 3038 } 3039 ++str_idx; 3040 if (next_nodes.nelem) 3041 { 3042 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 3043 if (BE (err != REG_NOERROR, 0)) 3044 { 3045 re_node_set_free (&next_nodes); 3046 return err; 3047 } 3048 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3049 subexp_num, type); 3050 if (BE (err != REG_NOERROR, 0)) 3051 { 3052 re_node_set_free (&next_nodes); 3053 return err; 3054 } 3055 } 3056 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 3057 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3058 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3059 { 3060 re_node_set_free (&next_nodes); 3061 return err; 3062 } 3063 mctx->state_log[str_idx] = cur_state; 3064 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3065 } 3066 re_node_set_free (&next_nodes); 3067 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3068 : &mctx->state_log[last_str]->nodes); 3069 path->next_idx = str_idx; 3070 3071 /* Fix MCTX. */ 3072 mctx->state_log = backup_state_log; 3073 mctx->input.cur_idx = backup_cur_idx; 3074 3075 /* Then check the current node set has the node LAST_NODE. */ 3076 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3077 return REG_NOERROR; 3078 3079 return REG_NOMATCH; 3080} 3081 3082/* Helper functions for check_arrival. */ 3083 3084/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3085 to NEXT_NODES. 3086 TODO: This function is similar to the functions transit_state*(), 3087 however this function has many additional works. 3088 Can't we unify them? */ 3089 3090static reg_errcode_t 3091internal_function 3092check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, 3093 re_node_set *cur_nodes, re_node_set *next_nodes) 3094{ 3095 const re_dfa_t *const dfa = mctx->dfa; 3096 bool ok; 3097 Idx cur_idx; 3098#ifdef RE_ENABLE_I18N 3099 reg_errcode_t err = REG_NOERROR; 3100#endif 3101 re_node_set union_set; 3102 re_node_set_init_empty (&union_set); 3103 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3104 { 3105 int naccepted = 0; 3106 Idx cur_node = cur_nodes->elems[cur_idx]; 3107#ifdef DEBUG 3108 re_token_type_t type = dfa->nodes[cur_node].type; 3109 assert (!IS_EPSILON_NODE (type)); 3110#endif 3111#ifdef RE_ENABLE_I18N 3112 /* If the node may accept `multi byte'. */ 3113 if (dfa->nodes[cur_node].accept_mb) 3114 { 3115 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3116 str_idx); 3117 if (naccepted > 1) 3118 { 3119 re_dfastate_t *dest_state; 3120 Idx next_node = dfa->nexts[cur_node]; 3121 Idx next_idx = str_idx + naccepted; 3122 dest_state = mctx->state_log[next_idx]; 3123 re_node_set_empty (&union_set); 3124 if (dest_state) 3125 { 3126 err = re_node_set_merge (&union_set, &dest_state->nodes); 3127 if (BE (err != REG_NOERROR, 0)) 3128 { 3129 re_node_set_free (&union_set); 3130 return err; 3131 } 3132 } 3133 ok = re_node_set_insert (&union_set, next_node); 3134 if (BE (! ok, 0)) 3135 { 3136 re_node_set_free (&union_set); 3137 return REG_ESPACE; 3138 } 3139 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3140 &union_set); 3141 if (BE (mctx->state_log[next_idx] == NULL 3142 && err != REG_NOERROR, 0)) 3143 { 3144 re_node_set_free (&union_set); 3145 return err; 3146 } 3147 } 3148 } 3149#endif /* RE_ENABLE_I18N */ 3150 if (naccepted 3151 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3152 { 3153 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3154 if (BE (! ok, 0)) 3155 { 3156 re_node_set_free (&union_set); 3157 return REG_ESPACE; 3158 } 3159 } 3160 } 3161 re_node_set_free (&union_set); 3162 return REG_NOERROR; 3163} 3164 3165/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3166 CUR_NODES, however exclude the nodes which are: 3167 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3168 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3169*/ 3170 3171static reg_errcode_t 3172internal_function 3173check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3174 Idx ex_subexp, int type) 3175{ 3176 reg_errcode_t err; 3177 Idx idx, outside_node; 3178 re_node_set new_nodes; 3179#ifdef DEBUG 3180 assert (cur_nodes->nelem); 3181#endif 3182 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3183 if (BE (err != REG_NOERROR, 0)) 3184 return err; 3185 /* Create a new node set NEW_NODES with the nodes which are epsilon 3186 closures of the node in CUR_NODES. */ 3187 3188 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3189 { 3190 Idx cur_node = cur_nodes->elems[idx]; 3191 const re_node_set *eclosure = dfa->eclosures + cur_node; 3192 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3193 if (outside_node == REG_MISSING) 3194 { 3195 /* There are no problematic nodes, just merge them. */ 3196 err = re_node_set_merge (&new_nodes, eclosure); 3197 if (BE (err != REG_NOERROR, 0)) 3198 { 3199 re_node_set_free (&new_nodes); 3200 return err; 3201 } 3202 } 3203 else 3204 { 3205 /* There are problematic nodes, re-calculate incrementally. */ 3206 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3207 ex_subexp, type); 3208 if (BE (err != REG_NOERROR, 0)) 3209 { 3210 re_node_set_free (&new_nodes); 3211 return err; 3212 } 3213 } 3214 } 3215 re_node_set_free (cur_nodes); 3216 *cur_nodes = new_nodes; 3217 return REG_NOERROR; 3218} 3219 3220/* Helper function for check_arrival_expand_ecl. 3221 Check incrementally the epsilon closure of TARGET, and if it isn't 3222 problematic append it to DST_NODES. */ 3223 3224static reg_errcode_t 3225internal_function 3226check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3227 Idx target, Idx ex_subexp, int type) 3228{ 3229 Idx cur_node; 3230 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3231 { 3232 bool ok; 3233 3234 if (dfa->nodes[cur_node].type == type 3235 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3236 { 3237 if (type == OP_CLOSE_SUBEXP) 3238 { 3239 ok = re_node_set_insert (dst_nodes, cur_node); 3240 if (BE (! ok, 0)) 3241 return REG_ESPACE; 3242 } 3243 break; 3244 } 3245 ok = re_node_set_insert (dst_nodes, cur_node); 3246 if (BE (! ok, 0)) 3247 return REG_ESPACE; 3248 if (dfa->edests[cur_node].nelem == 0) 3249 break; 3250 if (dfa->edests[cur_node].nelem == 2) 3251 { 3252 reg_errcode_t err; 3253 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3254 dfa->edests[cur_node].elems[1], 3255 ex_subexp, type); 3256 if (BE (err != REG_NOERROR, 0)) 3257 return err; 3258 } 3259 cur_node = dfa->edests[cur_node].elems[0]; 3260 } 3261 return REG_NOERROR; 3262} 3263 3264 3265/* For all the back references in the current state, calculate the 3266 destination of the back references by the appropriate entry 3267 in MCTX->BKREF_ENTS. */ 3268 3269static reg_errcode_t 3270internal_function 3271expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3272 Idx cur_str, Idx subexp_num, int type) 3273{ 3274 const re_dfa_t *const dfa = mctx->dfa; 3275 reg_errcode_t err; 3276 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3277 struct re_backref_cache_entry *ent; 3278 3279 if (cache_idx_start == REG_MISSING) 3280 return REG_NOERROR; 3281 3282 restart: 3283 ent = mctx->bkref_ents + cache_idx_start; 3284 do 3285 { 3286 Idx to_idx, next_node; 3287 3288 /* Is this entry ENT is appropriate? */ 3289 if (!re_node_set_contains (cur_nodes, ent->node)) 3290 continue; /* No. */ 3291 3292 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3293 /* Calculate the destination of the back reference, and append it 3294 to MCTX->STATE_LOG. */ 3295 if (to_idx == cur_str) 3296 { 3297 /* The backreference did epsilon transit, we must re-check all the 3298 node in the current state. */ 3299 re_node_set new_dests; 3300 reg_errcode_t err2, err3; 3301 next_node = dfa->edests[ent->node].elems[0]; 3302 if (re_node_set_contains (cur_nodes, next_node)) 3303 continue; 3304 err = re_node_set_init_1 (&new_dests, next_node); 3305 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3306 err3 = re_node_set_merge (cur_nodes, &new_dests); 3307 re_node_set_free (&new_dests); 3308 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3309 || err3 != REG_NOERROR, 0)) 3310 { 3311 err = (err != REG_NOERROR ? err 3312 : (err2 != REG_NOERROR ? err2 : err3)); 3313 return err; 3314 } 3315 /* TODO: It is still inefficient... */ 3316 goto restart; 3317 } 3318 else 3319 { 3320 re_node_set union_set; 3321 next_node = dfa->nexts[ent->node]; 3322 if (mctx->state_log[to_idx]) 3323 { 3324 bool ok; 3325 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3326 next_node)) 3327 continue; 3328 err = re_node_set_init_copy (&union_set, 3329 &mctx->state_log[to_idx]->nodes); 3330 ok = re_node_set_insert (&union_set, next_node); 3331 if (BE (err != REG_NOERROR || ! ok, 0)) 3332 { 3333 re_node_set_free (&union_set); 3334 err = err != REG_NOERROR ? err : REG_ESPACE; 3335 return err; 3336 } 3337 } 3338 else 3339 { 3340 err = re_node_set_init_1 (&union_set, next_node); 3341 if (BE (err != REG_NOERROR, 0)) 3342 return err; 3343 } 3344 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3345 re_node_set_free (&union_set); 3346 if (BE (mctx->state_log[to_idx] == NULL 3347 && err != REG_NOERROR, 0)) 3348 return err; 3349 } 3350 } 3351 while (ent++->more); 3352 return REG_NOERROR; 3353} 3354 3355/* Build transition table for the state. 3356 Return true if successful. */ 3357 3358static bool 3359internal_function 3360build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3361{ 3362 reg_errcode_t err; 3363 Idx i, j; 3364 int ch; 3365 bool need_word_trtable = false; 3366 bitset_word_t elem, mask; 3367 bool dests_node_malloced = false; 3368 bool dest_states_malloced = false; 3369 Idx ndests; /* Number of the destination states from `state'. */ 3370 re_dfastate_t **trtable; 3371 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3372 re_node_set follows, *dests_node; 3373 bitset_t *dests_ch; 3374 bitset_t acceptable; 3375 3376 struct dests_alloc 3377 { 3378 re_node_set dests_node[SBC_MAX]; 3379 bitset_t dests_ch[SBC_MAX]; 3380 } *dests_alloc; 3381 3382 /* We build DFA states which corresponds to the destination nodes 3383 from `state'. `dests_node[i]' represents the nodes which i-th 3384 destination state contains, and `dests_ch[i]' represents the 3385 characters which i-th destination state accepts. */ 3386 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3387 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3388 else 3389 { 3390 dests_alloc = re_malloc (struct dests_alloc, 1); 3391 if (BE (dests_alloc == NULL, 0)) 3392 return false; 3393 dests_node_malloced = true; 3394 } 3395 dests_node = dests_alloc->dests_node; 3396 dests_ch = dests_alloc->dests_ch; 3397 3398 /* Initialize transiton table. */ 3399 state->word_trtable = state->trtable = NULL; 3400 3401 /* At first, group all nodes belonging to `state' into several 3402 destinations. */ 3403 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3404 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0)) 3405 { 3406 if (dests_node_malloced) 3407 free (dests_alloc); 3408 if (ndests == 0) 3409 { 3410 state->trtable = (re_dfastate_t **) 3411 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3412 return true; 3413 } 3414 return false; 3415 } 3416 3417 err = re_node_set_alloc (&follows, ndests + 1); 3418 if (BE (err != REG_NOERROR, 0)) 3419 goto out_free; 3420 3421 /* Avoid arithmetic overflow in size calculation. */ 3422 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) 3423 / (3 * sizeof (re_dfastate_t *))) 3424 < ndests), 3425 0)) 3426 goto out_free; 3427 3428 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3429 + ndests * 3 * sizeof (re_dfastate_t *))) 3430 dest_states = (re_dfastate_t **) 3431 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3432 else 3433 { 3434 dest_states = (re_dfastate_t **) 3435 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3436 if (BE (dest_states == NULL, 0)) 3437 { 3438out_free: 3439 if (dest_states_malloced) 3440 free (dest_states); 3441 re_node_set_free (&follows); 3442 for (i = 0; i < ndests; ++i) 3443 re_node_set_free (dests_node + i); 3444 if (dests_node_malloced) 3445 free (dests_alloc); 3446 return false; 3447 } 3448 dest_states_malloced = true; 3449 } 3450 dest_states_word = dest_states + ndests; 3451 dest_states_nl = dest_states_word + ndests; 3452 bitset_empty (acceptable); 3453 3454 /* Then build the states for all destinations. */ 3455 for (i = 0; i < ndests; ++i) 3456 { 3457 Idx next_node; 3458 re_node_set_empty (&follows); 3459 /* Merge the follows of this destination states. */ 3460 for (j = 0; j < dests_node[i].nelem; ++j) 3461 { 3462 next_node = dfa->nexts[dests_node[i].elems[j]]; 3463 if (next_node != REG_MISSING) 3464 { 3465 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3466 if (BE (err != REG_NOERROR, 0)) 3467 goto out_free; 3468 } 3469 } 3470 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3471 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3472 goto out_free; 3473 /* If the new state has context constraint, 3474 build appropriate states for these contexts. */ 3475 if (dest_states[i]->has_constraint) 3476 { 3477 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3478 CONTEXT_WORD); 3479 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3480 goto out_free; 3481 3482 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3483 need_word_trtable = true; 3484 3485 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3486 CONTEXT_NEWLINE); 3487 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3488 goto out_free; 3489 } 3490 else 3491 { 3492 dest_states_word[i] = dest_states[i]; 3493 dest_states_nl[i] = dest_states[i]; 3494 } 3495 bitset_merge (acceptable, dests_ch[i]); 3496 } 3497 3498 if (!BE (need_word_trtable, 0)) 3499 { 3500 /* We don't care about whether the following character is a word 3501 character, or we are in a single-byte character set so we can 3502 discern by looking at the character code: allocate a 3503 256-entry transition table. */ 3504 trtable = state->trtable = 3505 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3506 if (BE (trtable == NULL, 0)) 3507 goto out_free; 3508 3509 /* For all characters ch...: */ 3510 for (i = 0; i < BITSET_WORDS; ++i) 3511 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3512 elem; 3513 mask <<= 1, elem >>= 1, ++ch) 3514 if (BE (elem & 1, 0)) 3515 { 3516 /* There must be exactly one destination which accepts 3517 character ch. See group_nodes_into_DFAstates. */ 3518 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3519 ; 3520 3521 /* j-th destination accepts the word character ch. */ 3522 if (dfa->word_char[i] & mask) 3523 trtable[ch] = dest_states_word[j]; 3524 else 3525 trtable[ch] = dest_states[j]; 3526 } 3527 } 3528 else 3529 { 3530 /* We care about whether the following character is a word 3531 character, and we are in a multi-byte character set: discern 3532 by looking at the character code: build two 256-entry 3533 transition tables, one starting at trtable[0] and one 3534 starting at trtable[SBC_MAX]. */ 3535 trtable = state->word_trtable = 3536 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3537 if (BE (trtable == NULL, 0)) 3538 goto out_free; 3539 3540 /* For all characters ch...: */ 3541 for (i = 0; i < BITSET_WORDS; ++i) 3542 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3543 elem; 3544 mask <<= 1, elem >>= 1, ++ch) 3545 if (BE (elem & 1, 0)) 3546 { 3547 /* There must be exactly one destination which accepts 3548 character ch. See group_nodes_into_DFAstates. */ 3549 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3550 ; 3551 3552 /* j-th destination accepts the word character ch. */ 3553 trtable[ch] = dest_states[j]; 3554 trtable[ch + SBC_MAX] = dest_states_word[j]; 3555 } 3556 } 3557 3558 /* new line */ 3559 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3560 { 3561 /* The current state accepts newline character. */ 3562 for (j = 0; j < ndests; ++j) 3563 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3564 { 3565 /* k-th destination accepts newline character. */ 3566 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3567 if (need_word_trtable) 3568 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3569 /* There must be only one destination which accepts 3570 newline. See group_nodes_into_DFAstates. */ 3571 break; 3572 } 3573 } 3574 3575 if (dest_states_malloced) 3576 free (dest_states); 3577 3578 re_node_set_free (&follows); 3579 for (i = 0; i < ndests; ++i) 3580 re_node_set_free (dests_node + i); 3581 3582 if (dests_node_malloced) 3583 free (dests_alloc); 3584 3585 return true; 3586} 3587 3588/* Group all nodes belonging to STATE into several destinations. 3589 Then for all destinations, set the nodes belonging to the destination 3590 to DESTS_NODE[i] and set the characters accepted by the destination 3591 to DEST_CH[i]. This function return the number of destinations. */ 3592 3593static Idx 3594internal_function 3595group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3596 re_node_set *dests_node, bitset_t *dests_ch) 3597{ 3598 reg_errcode_t err; 3599 bool ok; 3600 Idx i, j, k; 3601 Idx ndests; /* Number of the destinations from `state'. */ 3602 bitset_t accepts; /* Characters a node can accept. */ 3603 const re_node_set *cur_nodes = &state->nodes; 3604 bitset_empty (accepts); 3605 ndests = 0; 3606 3607 /* For all the nodes belonging to `state', */ 3608 for (i = 0; i < cur_nodes->nelem; ++i) 3609 { 3610 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3611 re_token_type_t type = node->type; 3612 unsigned int constraint = node->constraint; 3613 3614 /* Enumerate all single byte character this node can accept. */ 3615 if (type == CHARACTER) 3616 bitset_set (accepts, node->opr.c); 3617 else if (type == SIMPLE_BRACKET) 3618 { 3619 bitset_merge (accepts, node->opr.sbcset); 3620 } 3621 else if (type == OP_PERIOD) 3622 { 3623#ifdef RE_ENABLE_I18N 3624 if (dfa->mb_cur_max > 1) 3625 bitset_merge (accepts, dfa->sb_char); 3626 else 3627#endif 3628 bitset_set_all (accepts); 3629 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3630 bitset_clear (accepts, '\n'); 3631 if (dfa->syntax & RE_DOT_NOT_NULL) 3632 bitset_clear (accepts, '\0'); 3633 } 3634#ifdef RE_ENABLE_I18N 3635 else if (type == OP_UTF8_PERIOD) 3636 { 3637 if (ASCII_CHARS % BITSET_WORD_BITS == 0) 3638 memset (accepts, -1, ASCII_CHARS / CHAR_BIT); 3639 else 3640 bitset_merge (accepts, utf8_sb_map); 3641 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3642 bitset_clear (accepts, '\n'); 3643 if (dfa->syntax & RE_DOT_NOT_NULL) 3644 bitset_clear (accepts, '\0'); 3645 } 3646#endif 3647 else 3648 continue; 3649 3650 /* Check the `accepts' and sift the characters which are not 3651 match it the context. */ 3652 if (constraint) 3653 { 3654 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3655 { 3656 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3657 bitset_empty (accepts); 3658 if (accepts_newline) 3659 bitset_set (accepts, NEWLINE_CHAR); 3660 else 3661 continue; 3662 } 3663 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3664 { 3665 bitset_empty (accepts); 3666 continue; 3667 } 3668 3669 if (constraint & NEXT_WORD_CONSTRAINT) 3670 { 3671 bitset_word_t any_set = 0; 3672 if (type == CHARACTER && !node->word_char) 3673 { 3674 bitset_empty (accepts); 3675 continue; 3676 } 3677#ifdef RE_ENABLE_I18N 3678 if (dfa->mb_cur_max > 1) 3679 for (j = 0; j < BITSET_WORDS; ++j) 3680 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3681 else 3682#endif 3683 for (j = 0; j < BITSET_WORDS; ++j) 3684 any_set |= (accepts[j] &= dfa->word_char[j]); 3685 if (!any_set) 3686 continue; 3687 } 3688 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3689 { 3690 bitset_word_t any_set = 0; 3691 if (type == CHARACTER && node->word_char) 3692 { 3693 bitset_empty (accepts); 3694 continue; 3695 } 3696#ifdef RE_ENABLE_I18N 3697 if (dfa->mb_cur_max > 1) 3698 for (j = 0; j < BITSET_WORDS; ++j) 3699 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3700 else 3701#endif 3702 for (j = 0; j < BITSET_WORDS; ++j) 3703 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3704 if (!any_set) 3705 continue; 3706 } 3707 } 3708 3709 /* Then divide `accepts' into DFA states, or create a new 3710 state. Above, we make sure that accepts is not empty. */ 3711 for (j = 0; j < ndests; ++j) 3712 { 3713 bitset_t intersec; /* Intersection sets, see below. */ 3714 bitset_t remains; 3715 /* Flags, see below. */ 3716 bitset_word_t has_intersec, not_subset, not_consumed; 3717 3718 /* Optimization, skip if this state doesn't accept the character. */ 3719 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3720 continue; 3721 3722 /* Enumerate the intersection set of this state and `accepts'. */ 3723 has_intersec = 0; 3724 for (k = 0; k < BITSET_WORDS; ++k) 3725 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3726 /* And skip if the intersection set is empty. */ 3727 if (!has_intersec) 3728 continue; 3729 3730 /* Then check if this state is a subset of `accepts'. */ 3731 not_subset = not_consumed = 0; 3732 for (k = 0; k < BITSET_WORDS; ++k) 3733 { 3734 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3735 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3736 } 3737 3738 /* If this state isn't a subset of `accepts', create a 3739 new group state, which has the `remains'. */ 3740 if (not_subset) 3741 { 3742 bitset_copy (dests_ch[ndests], remains); 3743 bitset_copy (dests_ch[j], intersec); 3744 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3745 if (BE (err != REG_NOERROR, 0)) 3746 goto error_return; 3747 ++ndests; 3748 } 3749 3750 /* Put the position in the current group. */ 3751 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3752 if (BE (! ok, 0)) 3753 goto error_return; 3754 3755 /* If all characters are consumed, go to next node. */ 3756 if (!not_consumed) 3757 break; 3758 } 3759 /* Some characters remain, create a new group. */ 3760 if (j == ndests) 3761 { 3762 bitset_copy (dests_ch[ndests], accepts); 3763 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3764 if (BE (err != REG_NOERROR, 0)) 3765 goto error_return; 3766 ++ndests; 3767 bitset_empty (accepts); 3768 } 3769 } 3770 return ndests; 3771 error_return: 3772 for (j = 0; j < ndests; ++j) 3773 re_node_set_free (dests_node + j); 3774 return REG_MISSING; 3775} 3776 3777#ifdef RE_ENABLE_I18N 3778/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3779 Return the number of the bytes the node accepts. 3780 STR_IDX is the current index of the input string. 3781 3782 This function handles the nodes which can accept one character, or 3783 one collating element like '.', '[a-z]', opposite to the other nodes 3784 can only accept one byte. */ 3785 3786static int 3787internal_function 3788check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 3789 const re_string_t *input, Idx str_idx) 3790{ 3791 const re_token_t *node = dfa->nodes + node_idx; 3792 int char_len, elem_len; 3793 Idx i; 3794 3795 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3796 { 3797 unsigned char c = re_string_byte_at (input, str_idx), d; 3798 if (BE (c < 0xc2, 1)) 3799 return 0; 3800 3801 if (str_idx + 2 > input->len) 3802 return 0; 3803 3804 d = re_string_byte_at (input, str_idx + 1); 3805 if (c < 0xe0) 3806 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3807 else if (c < 0xf0) 3808 { 3809 char_len = 3; 3810 if (c == 0xe0 && d < 0xa0) 3811 return 0; 3812 } 3813 else if (c < 0xf8) 3814 { 3815 char_len = 4; 3816 if (c == 0xf0 && d < 0x90) 3817 return 0; 3818 } 3819 else if (c < 0xfc) 3820 { 3821 char_len = 5; 3822 if (c == 0xf8 && d < 0x88) 3823 return 0; 3824 } 3825 else if (c < 0xfe) 3826 { 3827 char_len = 6; 3828 if (c == 0xfc && d < 0x84) 3829 return 0; 3830 } 3831 else 3832 return 0; 3833 3834 if (str_idx + char_len > input->len) 3835 return 0; 3836 3837 for (i = 1; i < char_len; ++i) 3838 { 3839 d = re_string_byte_at (input, str_idx + i); 3840 if (d < 0x80 || d > 0xbf) 3841 return 0; 3842 } 3843 return char_len; 3844 } 3845 3846 char_len = re_string_char_size_at (input, str_idx); 3847 if (node->type == OP_PERIOD) 3848 { 3849 if (char_len <= 1) 3850 return 0; 3851 /* FIXME: I don't think this if is needed, as both '\n' 3852 and '\0' are char_len == 1. */ 3853 /* '.' accepts any one character except the following two cases. */ 3854 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3855 re_string_byte_at (input, str_idx) == '\n') || 3856 ((dfa->syntax & RE_DOT_NOT_NULL) && 3857 re_string_byte_at (input, str_idx) == '\0')) 3858 return 0; 3859 return char_len; 3860 } 3861 3862 elem_len = re_string_elem_size_at (input, str_idx); 3863 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3864 return 0; 3865 3866 if (node->type == COMPLEX_BRACKET) 3867 { 3868 const re_charset_t *cset = node->opr.mbcset; 3869# ifdef _LIBC 3870 const unsigned char *pin 3871 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3872 Idx j; 3873 uint32_t nrules; 3874# endif /* _LIBC */ 3875 int match_len = 0; 3876 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3877 ? re_string_wchar_at (input, str_idx) : 0); 3878 3879 /* match with multibyte character? */ 3880 for (i = 0; i < cset->nmbchars; ++i) 3881 if (wc == cset->mbchars[i]) 3882 { 3883 match_len = char_len; 3884 goto check_node_accept_bytes_match; 3885 } 3886 /* match with character_class? */ 3887 for (i = 0; i < cset->nchar_classes; ++i) 3888 { 3889 wctype_t wt = cset->char_classes[i]; 3890 if (__iswctype (wc, wt)) 3891 { 3892 match_len = char_len; 3893 goto check_node_accept_bytes_match; 3894 } 3895 } 3896 3897# ifdef _LIBC 3898 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3899 if (nrules != 0) 3900 { 3901 unsigned int in_collseq = 0; 3902 const int32_t *table, *indirect; 3903 const unsigned char *weights, *extra; 3904 const char *collseqwc; 3905 int32_t idx; 3906 /* This #include defines a local function! */ 3907# include <locale/weight.h> 3908 3909 /* match with collating_symbol? */ 3910 if (cset->ncoll_syms) 3911 extra = (const unsigned char *) 3912 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3913 for (i = 0; i < cset->ncoll_syms; ++i) 3914 { 3915 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3916 /* Compare the length of input collating element and 3917 the length of current collating element. */ 3918 if (*coll_sym != elem_len) 3919 continue; 3920 /* Compare each bytes. */ 3921 for (j = 0; j < *coll_sym; j++) 3922 if (pin[j] != coll_sym[1 + j]) 3923 break; 3924 if (j == *coll_sym) 3925 { 3926 /* Match if every bytes is equal. */ 3927 match_len = j; 3928 goto check_node_accept_bytes_match; 3929 } 3930 } 3931 3932 if (cset->nranges) 3933 { 3934 if (elem_len <= char_len) 3935 { 3936 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3937 in_collseq = __collseq_table_lookup (collseqwc, wc); 3938 } 3939 else 3940 in_collseq = find_collation_sequence_value (pin, elem_len); 3941 } 3942 /* match with range expression? */ 3943 for (i = 0; i < cset->nranges; ++i) 3944 if (cset->range_starts[i] <= in_collseq 3945 && in_collseq <= cset->range_ends[i]) 3946 { 3947 match_len = elem_len; 3948 goto check_node_accept_bytes_match; 3949 } 3950 3951 /* match with equivalence_class? */ 3952 if (cset->nequiv_classes) 3953 { 3954 const unsigned char *cp = pin; 3955 table = (const int32_t *) 3956 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3957 weights = (const unsigned char *) 3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3959 extra = (const unsigned char *) 3960 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3961 indirect = (const int32_t *) 3962 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3963 int32_t idx = findidx (&cp); 3964 if (idx > 0) 3965 for (i = 0; i < cset->nequiv_classes; ++i) 3966 { 3967 int32_t equiv_class_idx = cset->equiv_classes[i]; 3968 size_t weight_len = weights[idx & 0xffffff]; 3969 if (weight_len == weights[equiv_class_idx & 0xffffff] 3970 && (idx >> 24) == (equiv_class_idx >> 24)) 3971 { 3972 Idx cnt = 0; 3973 3974 idx &= 0xffffff; 3975 equiv_class_idx &= 0xffffff; 3976 3977 while (cnt <= weight_len 3978 && (weights[equiv_class_idx + 1 + cnt] 3979 == weights[idx + 1 + cnt])) 3980 ++cnt; 3981 if (cnt > weight_len) 3982 { 3983 match_len = elem_len; 3984 goto check_node_accept_bytes_match; 3985 } 3986 } 3987 } 3988 } 3989 } 3990 else 3991# endif /* _LIBC */ 3992 { 3993 /* match with range expression? */ 3994#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__) 3995 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3996#else 3997 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3998 cmp_buf[2] = wc; 3999#endif 4000 for (i = 0; i < cset->nranges; ++i) 4001 { 4002 cmp_buf[0] = cset->range_starts[i]; 4003 cmp_buf[4] = cset->range_ends[i]; 4004 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 4005 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 4006 { 4007 match_len = char_len; 4008 goto check_node_accept_bytes_match; 4009 } 4010 } 4011 } 4012 check_node_accept_bytes_match: 4013 if (!cset->non_match) 4014 return match_len; 4015 else 4016 { 4017 if (match_len > 0) 4018 return 0; 4019 else 4020 return (elem_len > char_len) ? elem_len : char_len; 4021 } 4022 } 4023 return 0; 4024} 4025 4026# ifdef _LIBC 4027static unsigned int 4028internal_function 4029find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 4030{ 4031 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4032 if (nrules == 0) 4033 { 4034 if (mbs_len == 1) 4035 { 4036 /* No valid character. Match it as a single byte character. */ 4037 const unsigned char *collseq = (const unsigned char *) 4038 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 4039 return collseq[mbs[0]]; 4040 } 4041 return UINT_MAX; 4042 } 4043 else 4044 { 4045 int32_t idx; 4046 const unsigned char *extra = (const unsigned char *) 4047 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4048 int32_t extrasize = (const unsigned char *) 4049 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 4050 4051 for (idx = 0; idx < extrasize;) 4052 { 4053 int mbs_cnt; 4054 bool found = false; 4055 int32_t elem_mbs_len; 4056 /* Skip the name of collating element name. */ 4057 idx = idx + extra[idx] + 1; 4058 elem_mbs_len = extra[idx++]; 4059 if (mbs_len == elem_mbs_len) 4060 { 4061 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 4062 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 4063 break; 4064 if (mbs_cnt == elem_mbs_len) 4065 /* Found the entry. */ 4066 found = true; 4067 } 4068 /* Skip the byte sequence of the collating element. */ 4069 idx += elem_mbs_len; 4070 /* Adjust for the alignment. */ 4071 idx = (idx + 3) & ~3; 4072 /* Skip the collation sequence value. */ 4073 idx += sizeof (uint32_t); 4074 /* Skip the wide char sequence of the collating element. */ 4075 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 4076 /* If we found the entry, return the sequence value. */ 4077 if (found) 4078 return *(uint32_t *) (extra + idx); 4079 /* Skip the collation sequence value. */ 4080 idx += sizeof (uint32_t); 4081 } 4082 return UINT_MAX; 4083 } 4084} 4085# endif /* _LIBC */ 4086#endif /* RE_ENABLE_I18N */ 4087 4088/* Check whether the node accepts the byte which is IDX-th 4089 byte of the INPUT. */ 4090 4091static bool 4092internal_function 4093check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4094 Idx idx) 4095{ 4096 unsigned char ch; 4097 ch = re_string_byte_at (&mctx->input, idx); 4098 switch (node->type) 4099 { 4100 case CHARACTER: 4101 if (node->opr.c != ch) 4102 return false; 4103 break; 4104 4105 case SIMPLE_BRACKET: 4106 if (!bitset_contain (node->opr.sbcset, ch)) 4107 return false; 4108 break; 4109 4110#ifdef RE_ENABLE_I18N 4111 case OP_UTF8_PERIOD: 4112 if (ch >= ASCII_CHARS) 4113 return false; 4114 /* FALLTHROUGH */ 4115#endif 4116 case OP_PERIOD: 4117 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4118 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4119 return false; 4120 break; 4121 4122 default: 4123 return false; 4124 } 4125 4126 if (node->constraint) 4127 { 4128 /* The node has constraints. Check whether the current context 4129 satisfies the constraints. */ 4130 unsigned int context = re_string_context_at (&mctx->input, idx, 4131 mctx->eflags); 4132 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4133 return false; 4134 } 4135 4136 return true; 4137} 4138 4139/* Extend the buffers, if the buffers have run out. */ 4140 4141static reg_errcode_t 4142internal_function 4143extend_buffers (re_match_context_t *mctx) 4144{ 4145 reg_errcode_t ret; 4146 re_string_t *pstr = &mctx->input; 4147 4148 /* Avoid overflow. */ 4149 if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) 4150 return REG_ESPACE; 4151 4152 /* Double the lengthes of the buffers. */ 4153 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4154 if (BE (ret != REG_NOERROR, 0)) 4155 return ret; 4156 4157 if (mctx->state_log != NULL) 4158 { 4159 /* And double the length of state_log. */ 4160 /* XXX We have no indication of the size of this buffer. If this 4161 allocation fail we have no indication that the state_log array 4162 does not have the right size. */ 4163 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4164 pstr->bufs_len + 1); 4165 if (BE (new_array == NULL, 0)) 4166 return REG_ESPACE; 4167 mctx->state_log = new_array; 4168 } 4169 4170 /* Then reconstruct the buffers. */ 4171 if (pstr->icase) 4172 { 4173#ifdef RE_ENABLE_I18N 4174 if (pstr->mb_cur_max > 1) 4175 { 4176 ret = build_wcs_upper_buffer (pstr); 4177 if (BE (ret != REG_NOERROR, 0)) 4178 return ret; 4179 } 4180 else 4181#endif /* RE_ENABLE_I18N */ 4182 build_upper_buffer (pstr); 4183 } 4184 else 4185 { 4186#ifdef RE_ENABLE_I18N 4187 if (pstr->mb_cur_max > 1) 4188 build_wcs_buffer (pstr); 4189 else 4190#endif /* RE_ENABLE_I18N */ 4191 { 4192 if (pstr->trans != NULL) 4193 re_string_translate_buffer (pstr); 4194 } 4195 } 4196 return REG_NOERROR; 4197} 4198 4199 4200/* Functions for matching context. */ 4201 4202/* Initialize MCTX. */ 4203 4204static reg_errcode_t 4205internal_function 4206match_ctx_init (re_match_context_t *mctx, int eflags, Idx n) 4207{ 4208 mctx->eflags = eflags; 4209 mctx->match_last = REG_MISSING; 4210 if (n > 0) 4211 { 4212 /* Avoid overflow. */ 4213 size_t max_object_size = 4214 MAX (sizeof (struct re_backref_cache_entry), 4215 sizeof (re_sub_match_top_t *)); 4216 if (BE (SIZE_MAX / max_object_size < n, 0)) 4217 return REG_ESPACE; 4218 4219 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4220 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4221 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4222 return REG_ESPACE; 4223 } 4224 /* Already zero-ed by the caller. 4225 else 4226 mctx->bkref_ents = NULL; 4227 mctx->nbkref_ents = 0; 4228 mctx->nsub_tops = 0; */ 4229 mctx->abkref_ents = n; 4230 mctx->max_mb_elem_len = 1; 4231 mctx->asub_tops = n; 4232 return REG_NOERROR; 4233} 4234 4235/* Clean the entries which depend on the current input in MCTX. 4236 This function must be invoked when the matcher changes the start index 4237 of the input, or changes the input string. */ 4238 4239static void 4240internal_function 4241match_ctx_clean (re_match_context_t *mctx) 4242{ 4243 Idx st_idx; 4244 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4245 { 4246 Idx sl_idx; 4247 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4248 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4249 { 4250 re_sub_match_last_t *last = top->lasts[sl_idx]; 4251 re_free (last->path.array); 4252 re_free (last); 4253 } 4254 re_free (top->lasts); 4255 if (top->path) 4256 { 4257 re_free (top->path->array); 4258 re_free (top->path); 4259 } 4260 free (top); 4261 } 4262 4263 mctx->nsub_tops = 0; 4264 mctx->nbkref_ents = 0; 4265} 4266 4267/* Free all the memory associated with MCTX. */ 4268 4269static void 4270internal_function 4271match_ctx_free (re_match_context_t *mctx) 4272{ 4273 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4274 match_ctx_clean (mctx); 4275 re_free (mctx->sub_tops); 4276 re_free (mctx->bkref_ents); 4277} 4278 4279/* Add a new backreference entry to MCTX. 4280 Note that we assume that caller never call this function with duplicate 4281 entry, and call with STR_IDX which isn't smaller than any existing entry. 4282*/ 4283 4284static reg_errcode_t 4285internal_function 4286match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from, 4287 Idx to) 4288{ 4289 if (mctx->nbkref_ents >= mctx->abkref_ents) 4290 { 4291 struct re_backref_cache_entry* new_entry; 4292 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4293 mctx->abkref_ents * 2); 4294 if (BE (new_entry == NULL, 0)) 4295 { 4296 re_free (mctx->bkref_ents); 4297 return REG_ESPACE; 4298 } 4299 mctx->bkref_ents = new_entry; 4300 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4301 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4302 mctx->abkref_ents *= 2; 4303 } 4304 if (mctx->nbkref_ents > 0 4305 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4306 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4307 4308 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4309 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4310 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4311 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4312 4313 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4314 If bit N is clear, means that this entry won't epsilon-transition to 4315 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4316 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4317 such node. 4318 4319 A backreference does not epsilon-transition unless it is empty, so set 4320 to all zeros if FROM != TO. */ 4321 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4322 = (from == to ? -1 : 0); 4323 4324 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4325 if (mctx->max_mb_elem_len < to - from) 4326 mctx->max_mb_elem_len = to - from; 4327 return REG_NOERROR; 4328} 4329 4330/* Return the first entry with the same str_idx, or REG_MISSING if none is 4331 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4332 4333static Idx 4334internal_function 4335search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 4336{ 4337 Idx left, right, mid, last; 4338 last = right = mctx->nbkref_ents; 4339 for (left = 0; left < right;) 4340 { 4341 mid = (left + right) / 2; 4342 if (mctx->bkref_ents[mid].str_idx < str_idx) 4343 left = mid + 1; 4344 else 4345 right = mid; 4346 } 4347 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4348 return left; 4349 else 4350 return REG_MISSING; 4351} 4352 4353/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4354 at STR_IDX. */ 4355 4356static reg_errcode_t 4357internal_function 4358match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx) 4359{ 4360#ifdef DEBUG 4361 assert (mctx->sub_tops != NULL); 4362 assert (mctx->asub_tops > 0); 4363#endif 4364 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4365 { 4366 Idx new_asub_tops = mctx->asub_tops * 2; 4367 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4368 re_sub_match_top_t *, 4369 new_asub_tops); 4370 if (BE (new_array == NULL, 0)) 4371 return REG_ESPACE; 4372 mctx->sub_tops = new_array; 4373 mctx->asub_tops = new_asub_tops; 4374 } 4375 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4376 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4377 return REG_ESPACE; 4378 mctx->sub_tops[mctx->nsub_tops]->node = node; 4379 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4380 return REG_NOERROR; 4381} 4382 4383/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4384 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4385 4386static re_sub_match_last_t * 4387internal_function 4388match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx) 4389{ 4390 re_sub_match_last_t *new_entry; 4391 if (BE (subtop->nlasts == subtop->alasts, 0)) 4392 { 4393 Idx new_alasts = 2 * subtop->alasts + 1; 4394 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4395 re_sub_match_last_t *, 4396 new_alasts); 4397 if (BE (new_array == NULL, 0)) 4398 return NULL; 4399 subtop->lasts = new_array; 4400 subtop->alasts = new_alasts; 4401 } 4402 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4403 if (BE (new_entry != NULL, 1)) 4404 { 4405 subtop->lasts[subtop->nlasts] = new_entry; 4406 new_entry->node = node; 4407 new_entry->str_idx = str_idx; 4408 ++subtop->nlasts; 4409 } 4410 return new_entry; 4411} 4412 4413static void 4414internal_function 4415sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4416 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx) 4417{ 4418 sctx->sifted_states = sifted_sts; 4419 sctx->limited_states = limited_sts; 4420 sctx->last_node = last_node; 4421 sctx->last_str_idx = last_str_idx; 4422 re_node_set_init_empty (&sctx->limits); 4423} 4424