1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2014 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU General Public 8 License as published by the Free Software Foundation; either 9 version 3 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 21 Idx n) internal_function; 22static void match_ctx_clean (re_match_context_t *mctx) internal_function; 23static void match_ctx_free (re_match_context_t *cache) internal_function; 24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node, 25 Idx str_idx, Idx from, Idx to) 26 internal_function; 27static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 28 internal_function; 29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node, 30 Idx str_idx) internal_function; 31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 32 Idx node, Idx str_idx) 33 internal_function; 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 35 re_dfastate_t **limited_sts, Idx last_node, 36 Idx last_str_idx) 37 internal_function; 38static reg_errcode_t re_search_internal (const regex_t *preg, 39 const char *string, Idx length, 40 Idx start, Idx last_start, Idx stop, 41 size_t nmatch, regmatch_t pmatch[], 42 int eflags) internal_function; 43static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp, 44 const char *string1, Idx length1, 45 const char *string2, Idx length2, 46 Idx start, regoff_t range, 47 struct re_registers *regs, 48 Idx stop, bool ret_len) internal_function; 49static regoff_t re_search_stub (struct re_pattern_buffer *bufp, 50 const char *string, Idx length, Idx start, 51 regoff_t range, Idx stop, 52 struct re_registers *regs, 53 bool ret_len) internal_function; 54static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 55 Idx nregs, int regs_allocated) internal_function; 56static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 57 internal_function; 58static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match, 59 Idx *p_match_first) internal_function; 60static Idx check_halt_state_context (const re_match_context_t *mctx, 61 const re_dfastate_t *state, Idx idx) 62 internal_function; 63static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 64 regmatch_t *prev_idx_match, Idx cur_node, 65 Idx cur_idx, Idx nmatch) internal_function; 66static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 67 Idx str_idx, Idx dest_node, Idx nregs, 68 regmatch_t *regs, 69 re_node_set *eps_via_nodes) 70 internal_function; 71static reg_errcode_t set_regs (const regex_t *preg, 72 const re_match_context_t *mctx, 73 size_t nmatch, regmatch_t *pmatch, 74 bool fl_backtrack) internal_function; 75static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 76 internal_function; 77 78#ifdef RE_ENABLE_I18N 79static int sift_states_iter_mb (const re_match_context_t *mctx, 80 re_sift_context_t *sctx, 81 Idx node_idx, Idx str_idx, Idx max_str_idx) 82 internal_function; 83#endif /* RE_ENABLE_I18N */ 84static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 85 re_sift_context_t *sctx) 86 internal_function; 87static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 88 re_sift_context_t *sctx, Idx str_idx, 89 re_node_set *cur_dest) 90 internal_function; 91static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 92 re_sift_context_t *sctx, 93 Idx str_idx, 94 re_node_set *dest_nodes) 95 internal_function; 96static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 97 re_node_set *dest_nodes, 98 const re_node_set *candidates) 99 internal_function; 100static bool check_dst_limits (const re_match_context_t *mctx, 101 const re_node_set *limits, 102 Idx dst_node, Idx dst_idx, Idx src_node, 103 Idx src_idx) internal_function; 104static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 105 int boundaries, Idx subexp_idx, 106 Idx from_node, Idx bkref_idx) 107 internal_function; 108static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 109 Idx limit, Idx subexp_idx, 110 Idx node, Idx str_idx, 111 Idx bkref_idx) internal_function; 112static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 113 re_node_set *dest_nodes, 114 const re_node_set *candidates, 115 re_node_set *limits, 116 struct re_backref_cache_entry *bkref_ents, 117 Idx str_idx) internal_function; 118static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 119 re_sift_context_t *sctx, 120 Idx str_idx, const re_node_set *candidates) 121 internal_function; 122static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 123 re_dfastate_t **dst, 124 re_dfastate_t **src, Idx num) 125 internal_function; 126static re_dfastate_t *find_recover_state (reg_errcode_t *err, 127 re_match_context_t *mctx) internal_function; 128static re_dfastate_t *transit_state (reg_errcode_t *err, 129 re_match_context_t *mctx, 130 re_dfastate_t *state) internal_function; 131static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 132 re_match_context_t *mctx, 133 re_dfastate_t *next_state) 134 internal_function; 135static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 136 re_node_set *cur_nodes, 137 Idx str_idx) internal_function; 138#if 0 139static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 140 re_match_context_t *mctx, 141 re_dfastate_t *pstate) 142 internal_function; 143#endif 144#ifdef RE_ENABLE_I18N 145static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 146 re_dfastate_t *pstate) 147 internal_function; 148#endif /* RE_ENABLE_I18N */ 149static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 150 const re_node_set *nodes) 151 internal_function; 152static reg_errcode_t get_subexp (re_match_context_t *mctx, 153 Idx bkref_node, Idx bkref_str_idx) 154 internal_function; 155static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 156 const re_sub_match_top_t *sub_top, 157 re_sub_match_last_t *sub_last, 158 Idx bkref_node, Idx bkref_str) 159 internal_function; 160static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 161 Idx subexp_idx, int type) internal_function; 162static reg_errcode_t check_arrival (re_match_context_t *mctx, 163 state_array_t *path, Idx top_node, 164 Idx top_str, Idx last_node, Idx last_str, 165 int type) internal_function; 166static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 167 Idx str_idx, 168 re_node_set *cur_nodes, 169 re_node_set *next_nodes) 170 internal_function; 171static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 172 re_node_set *cur_nodes, 173 Idx ex_subexp, int type) 174 internal_function; 175static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 176 re_node_set *dst_nodes, 177 Idx target, Idx ex_subexp, 178 int type) internal_function; 179static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 180 re_node_set *cur_nodes, Idx cur_str, 181 Idx subexp_num, int type) 182 internal_function; 183static bool build_trtable (const re_dfa_t *dfa, 184 re_dfastate_t *state) internal_function; 185#ifdef RE_ENABLE_I18N 186static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 187 const re_string_t *input, Idx idx) 188 internal_function; 189# ifdef _LIBC 190static unsigned int find_collation_sequence_value (const unsigned char *mbs, 191 size_t name_len) 192 internal_function; 193# endif /* _LIBC */ 194#endif /* RE_ENABLE_I18N */ 195static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa, 196 const re_dfastate_t *state, 197 re_node_set *states_node, 198 bitset_t *states_ch) internal_function; 199static bool check_node_accept (const re_match_context_t *mctx, 200 const re_token_t *node, Idx idx) 201 internal_function; 202static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len) 203 internal_function; 204 205/* Entry point for POSIX code. */ 206 207/* regexec searches for a given pattern, specified by PREG, in the 208 string STRING. 209 210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 211 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 212 least NMATCH elements, and we set them to the offsets of the 213 corresponding matched substrings. 214 215 EFLAGS specifies "execution flags" which affect matching: if 216 REG_NOTBOL is set, then ^ does not match at the beginning of the 217 string; if REG_NOTEOL is set, then $ does not match at the end. 218 219 We return 0 if we find a match and REG_NOMATCH if not. */ 220 221int 222regexec (preg, string, nmatch, pmatch, eflags) 223 const regex_t *_Restrict_ preg; 224 const char *_Restrict_ string; 225 size_t nmatch; 226 regmatch_t pmatch[_Restrict_arr_]; 227 int eflags; 228{ 229 reg_errcode_t err; 230 Idx start, length; 231 re_dfa_t *dfa = preg->buffer; 232 233 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 234 return REG_BADPAT; 235 236 if (eflags & REG_STARTEND) 237 { 238 start = pmatch[0].rm_so; 239 length = pmatch[0].rm_eo; 240 } 241 else 242 { 243 start = 0; 244 length = strlen (string); 245 } 246 247 lock_lock (dfa->lock); 248 if (preg->no_sub) 249 err = re_search_internal (preg, string, length, start, length, 250 length, 0, NULL, eflags); 251 else 252 err = re_search_internal (preg, string, length, start, length, 253 length, nmatch, pmatch, eflags); 254 lock_unlock (dfa->lock); 255 return err != REG_NOERROR; 256} 257 258#ifdef _LIBC 259# include <shlib-compat.h> 260versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 261 262# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 263__typeof__ (__regexec) __compat_regexec; 264 265int 266attribute_compat_text_section 267__compat_regexec (const regex_t *_Restrict_ preg, 268 const char *_Restrict_ string, size_t nmatch, 269 regmatch_t pmatch[], int eflags) 270{ 271 return regexec (preg, string, nmatch, pmatch, 272 eflags & (REG_NOTBOL | REG_NOTEOL)); 273} 274compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 275# endif 276#endif 277 278/* Entry points for GNU code. */ 279 280/* re_match, re_search, re_match_2, re_search_2 281 282 The former two functions operate on STRING with length LENGTH, 283 while the later two operate on concatenation of STRING1 and STRING2 284 with lengths LENGTH1 and LENGTH2, respectively. 285 286 re_match() matches the compiled pattern in BUFP against the string, 287 starting at index START. 288 289 re_search() first tries matching at index START, then it tries to match 290 starting from index START + 1, and so on. The last start position tried 291 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 292 way as re_match().) 293 294 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 295 the first STOP characters of the concatenation of the strings should be 296 concerned. 297 298 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 299 and all groups is stored in REGS. (For the "_2" variants, the offsets are 300 computed relative to the concatenation, not relative to the individual 301 strings.) 302 303 On success, re_match* functions return the length of the match, re_search* 304 return the position of the start of the match. Return value -1 means no 305 match was found and -2 indicates an internal error. */ 306 307regoff_t 308re_match (bufp, string, length, start, regs) 309 struct re_pattern_buffer *bufp; 310 const char *string; 311 Idx length, start; 312 struct re_registers *regs; 313{ 314 return re_search_stub (bufp, string, length, start, 0, length, regs, true); 315} 316#ifdef _LIBC 317weak_alias (__re_match, re_match) 318#endif 319 320regoff_t 321re_search (bufp, string, length, start, range, regs) 322 struct re_pattern_buffer *bufp; 323 const char *string; 324 Idx length, start; 325 regoff_t range; 326 struct re_registers *regs; 327{ 328 return re_search_stub (bufp, string, length, start, range, length, regs, 329 false); 330} 331#ifdef _LIBC 332weak_alias (__re_search, re_search) 333#endif 334 335regoff_t 336re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 337 struct re_pattern_buffer *bufp; 338 const char *string1, *string2; 339 Idx length1, length2, start, stop; 340 struct re_registers *regs; 341{ 342 return re_search_2_stub (bufp, string1, length1, string2, length2, 343 start, 0, regs, stop, true); 344} 345#ifdef _LIBC 346weak_alias (__re_match_2, re_match_2) 347#endif 348 349regoff_t 350re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 351 struct re_pattern_buffer *bufp; 352 const char *string1, *string2; 353 Idx length1, length2, start, stop; 354 regoff_t range; 355 struct re_registers *regs; 356{ 357 return re_search_2_stub (bufp, string1, length1, string2, length2, 358 start, range, regs, stop, false); 359} 360#ifdef _LIBC 361weak_alias (__re_search_2, re_search_2) 362#endif 363 364static regoff_t 365re_search_2_stub (struct re_pattern_buffer *bufp, 366 const char *string1, Idx length1, 367 const char *string2, Idx length2, 368 Idx start, regoff_t range, struct re_registers *regs, 369 Idx stop, bool ret_len) 370{ 371 const char *str; 372 regoff_t rval; 373 Idx len = length1 + length2; 374 char *s = NULL; 375 376 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0)) 377 return -2; 378 379 /* Concatenate the strings. */ 380 if (length2 > 0) 381 if (length1 > 0) 382 { 383 s = re_malloc (char, len); 384 385 if (BE (s == NULL, 0)) 386 return -2; 387#ifdef _LIBC 388 memcpy (__mempcpy (s, string1, length1), string2, length2); 389#else 390 memcpy (s, string1, length1); 391 memcpy (s + length1, string2, length2); 392#endif 393 str = s; 394 } 395 else 396 str = string2; 397 else 398 str = string1; 399 400 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 401 ret_len); 402 re_free (s); 403 return rval; 404} 405 406/* The parameters have the same meaning as those of re_search. 407 Additional parameters: 408 If RET_LEN is true the length of the match is returned (re_match style); 409 otherwise the position of the match is returned. */ 410 411static regoff_t 412re_search_stub (struct re_pattern_buffer *bufp, 413 const char *string, Idx length, 414 Idx start, regoff_t range, Idx stop, struct re_registers *regs, 415 bool ret_len) 416{ 417 reg_errcode_t result; 418 regmatch_t *pmatch; 419 Idx nregs; 420 regoff_t rval; 421 int eflags = 0; 422 re_dfa_t *dfa = bufp->buffer; 423 Idx last_start = start + range; 424 425 /* Check for out-of-range. */ 426 if (BE (start < 0 || start > length, 0)) 427 return -1; 428 if (BE (length < last_start || (0 <= range && last_start < start), 0)) 429 last_start = length; 430 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0)) 431 last_start = 0; 432 433 lock_lock (dfa->lock); 434 435 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 436 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 437 438 /* Compile fastmap if we haven't yet. */ 439 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate) 440 re_compile_fastmap (bufp); 441 442 if (BE (bufp->no_sub, 0)) 443 regs = NULL; 444 445 /* We need at least 1 register. */ 446 if (regs == NULL) 447 nregs = 1; 448 else if (BE (bufp->regs_allocated == REGS_FIXED 449 && regs->num_regs <= bufp->re_nsub, 0)) 450 { 451 nregs = regs->num_regs; 452 if (BE (nregs < 1, 0)) 453 { 454 /* Nothing can be copied to regs. */ 455 regs = NULL; 456 nregs = 1; 457 } 458 } 459 else 460 nregs = bufp->re_nsub + 1; 461 pmatch = re_malloc (regmatch_t, nregs); 462 if (BE (pmatch == NULL, 0)) 463 { 464 rval = -2; 465 goto out; 466 } 467 468 result = re_search_internal (bufp, string, length, start, last_start, stop, 469 nregs, pmatch, eflags); 470 471 rval = 0; 472 473 /* I hope we needn't fill their regs with -1's when no match was found. */ 474 if (result != REG_NOERROR) 475 rval = result == REG_NOMATCH ? -1 : -2; 476 else if (regs != NULL) 477 { 478 /* If caller wants register contents data back, copy them. */ 479 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 480 bufp->regs_allocated); 481 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 482 rval = -2; 483 } 484 485 if (BE (rval == 0, 1)) 486 { 487 if (ret_len) 488 { 489 assert (pmatch[0].rm_so == start); 490 rval = pmatch[0].rm_eo - start; 491 } 492 else 493 rval = pmatch[0].rm_so; 494 } 495 re_free (pmatch); 496 out: 497 lock_unlock (dfa->lock); 498 return rval; 499} 500 501static unsigned 502re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs, 503 int regs_allocated) 504{ 505 int rval = REGS_REALLOCATE; 506 Idx i; 507 Idx need_regs = nregs + 1; 508 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code 509 uses. */ 510 511 /* Have the register data arrays been allocated? */ 512 if (regs_allocated == REGS_UNALLOCATED) 513 { /* No. So allocate them with malloc. */ 514 regs->start = re_malloc (regoff_t, need_regs); 515 if (BE (regs->start == NULL, 0)) 516 return REGS_UNALLOCATED; 517 regs->end = re_malloc (regoff_t, need_regs); 518 if (BE (regs->end == NULL, 0)) 519 { 520 re_free (regs->start); 521 return REGS_UNALLOCATED; 522 } 523 regs->num_regs = need_regs; 524 } 525 else if (regs_allocated == REGS_REALLOCATE) 526 { /* Yes. If we need more elements than were already 527 allocated, reallocate them. If we need fewer, just 528 leave it alone. */ 529 if (BE (need_regs > regs->num_regs, 0)) 530 { 531 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 532 regoff_t *new_end; 533 if (BE (new_start == NULL, 0)) 534 return REGS_UNALLOCATED; 535 new_end = re_realloc (regs->end, regoff_t, need_regs); 536 if (BE (new_end == NULL, 0)) 537 { 538 re_free (new_start); 539 return REGS_UNALLOCATED; 540 } 541 regs->start = new_start; 542 regs->end = new_end; 543 regs->num_regs = need_regs; 544 } 545 } 546 else 547 { 548 assert (regs_allocated == REGS_FIXED); 549 /* This function may not be called with REGS_FIXED and nregs too big. */ 550 assert (regs->num_regs >= nregs); 551 rval = REGS_FIXED; 552 } 553 554 /* Copy the regs. */ 555 for (i = 0; i < nregs; ++i) 556 { 557 regs->start[i] = pmatch[i].rm_so; 558 regs->end[i] = pmatch[i].rm_eo; 559 } 560 for ( ; i < regs->num_regs; ++i) 561 regs->start[i] = regs->end[i] = -1; 562 563 return rval; 564} 565 566/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 567 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 568 this memory for recording register information. STARTS and ENDS 569 must be allocated using the malloc library routine, and must each 570 be at least NUM_REGS * sizeof (regoff_t) bytes long. 571 572 If NUM_REGS == 0, then subsequent matches should allocate their own 573 register data. 574 575 Unless this function is called, the first search or match using 576 PATTERN_BUFFER will allocate its own register data, without 577 freeing the old data. */ 578 579void 580re_set_registers (bufp, regs, num_regs, starts, ends) 581 struct re_pattern_buffer *bufp; 582 struct re_registers *regs; 583 __re_size_t num_regs; 584 regoff_t *starts, *ends; 585{ 586 if (num_regs) 587 { 588 bufp->regs_allocated = REGS_REALLOCATE; 589 regs->num_regs = num_regs; 590 regs->start = starts; 591 regs->end = ends; 592 } 593 else 594 { 595 bufp->regs_allocated = REGS_UNALLOCATED; 596 regs->num_regs = 0; 597 regs->start = regs->end = NULL; 598 } 599} 600#ifdef _LIBC 601weak_alias (__re_set_registers, re_set_registers) 602#endif 603 604/* Entry points compatible with 4.2 BSD regex library. We don't define 605 them unless specifically requested. */ 606 607#if defined _REGEX_RE_COMP || defined _LIBC 608int 609# ifdef _LIBC 610weak_function 611# endif 612re_exec (s) 613 const char *s; 614{ 615 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 616} 617#endif /* _REGEX_RE_COMP */ 618 619/* Internal entry point. */ 620 621/* Searches for a compiled pattern PREG in the string STRING, whose 622 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 623 meaning as with regexec. LAST_START is START + RANGE, where 624 START and RANGE have the same meaning as with re_search. 625 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 626 otherwise return the error code. 627 Note: We assume front end functions already check ranges. 628 (0 <= LAST_START && LAST_START <= LENGTH) */ 629 630static reg_errcode_t 631__attribute_warn_unused_result__ 632re_search_internal (const regex_t *preg, 633 const char *string, Idx length, 634 Idx start, Idx last_start, Idx stop, 635 size_t nmatch, regmatch_t pmatch[], 636 int eflags) 637{ 638 reg_errcode_t err; 639 const re_dfa_t *dfa = preg->buffer; 640 Idx left_lim, right_lim; 641 int incr; 642 bool fl_longest_match; 643 int match_kind; 644 Idx match_first; 645 Idx match_last = REG_MISSING; 646 Idx extra_nmatch; 647 bool sb; 648 int ch; 649#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 650 re_match_context_t mctx = { .dfa = dfa }; 651#else 652 re_match_context_t mctx; 653#endif 654 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate 655 && start != last_start && !preg->can_be_null) 656 ? preg->fastmap : NULL); 657 RE_TRANSLATE_TYPE t = preg->translate; 658 659#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 660 memset (&mctx, '\0', sizeof (re_match_context_t)); 661 mctx.dfa = dfa; 662#endif 663 664 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 665 nmatch -= extra_nmatch; 666 667 /* Check if the DFA haven't been compiled. */ 668 if (BE (preg->used == 0 || dfa->init_state == NULL 669 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 670 || dfa->init_state_begbuf == NULL, 0)) 671 return REG_NOMATCH; 672 673#ifdef DEBUG 674 /* We assume front-end functions already check them. */ 675 assert (0 <= last_start && last_start <= length); 676#endif 677 678 /* If initial states with non-begbuf contexts have no elements, 679 the regex must be anchored. If preg->newline_anchor is set, 680 we'll never use init_state_nl, so do not check it. */ 681 if (dfa->init_state->nodes.nelem == 0 682 && dfa->init_state_word->nodes.nelem == 0 683 && (dfa->init_state_nl->nodes.nelem == 0 684 || !preg->newline_anchor)) 685 { 686 if (start != 0 && last_start != 0) 687 return REG_NOMATCH; 688 start = last_start = 0; 689 } 690 691 /* We must check the longest matching, if nmatch > 0. */ 692 fl_longest_match = (nmatch != 0 || dfa->nbackref); 693 694 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 695 preg->translate, (preg->syntax & RE_ICASE) != 0, 696 dfa); 697 if (BE (err != REG_NOERROR, 0)) 698 goto free_return; 699 mctx.input.stop = stop; 700 mctx.input.raw_stop = stop; 701 mctx.input.newline_anchor = preg->newline_anchor; 702 703 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 704 if (BE (err != REG_NOERROR, 0)) 705 goto free_return; 706 707 /* We will log all the DFA states through which the dfa pass, 708 if nmatch > 1, or this dfa has "multibyte node", which is a 709 back-reference or a node which can accept multibyte character or 710 multi character collating element. */ 711 if (nmatch > 1 || dfa->has_mb_node) 712 { 713 /* Avoid overflow. */ 714 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) 715 <= mctx.input.bufs_len), 0)) 716 { 717 err = REG_ESPACE; 718 goto free_return; 719 } 720 721 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 722 if (BE (mctx.state_log == NULL, 0)) 723 { 724 err = REG_ESPACE; 725 goto free_return; 726 } 727 } 728 else 729 mctx.state_log = NULL; 730 731 match_first = start; 732 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 733 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 734 735 /* Check incrementally whether the input string matches. */ 736 incr = (last_start < start) ? -1 : 1; 737 left_lim = (last_start < start) ? last_start : start; 738 right_lim = (last_start < start) ? start : last_start; 739 sb = dfa->mb_cur_max == 1; 740 match_kind = 741 (fastmap 742 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 743 | (start <= last_start ? 2 : 0) 744 | (t != NULL ? 1 : 0)) 745 : 8); 746 747 for (;; match_first += incr) 748 { 749 err = REG_NOMATCH; 750 if (match_first < left_lim || right_lim < match_first) 751 goto free_return; 752 753 /* Advance as rapidly as possible through the string, until we 754 find a plausible place to start matching. This may be done 755 with varying efficiency, so there are various possibilities: 756 only the most common of them are specialized, in order to 757 save on code size. We use a switch statement for speed. */ 758 switch (match_kind) 759 { 760 case 8: 761 /* No fastmap. */ 762 break; 763 764 case 7: 765 /* Fastmap with single-byte translation, match forward. */ 766 while (BE (match_first < right_lim, 1) 767 && !fastmap[t[(unsigned char) string[match_first]]]) 768 ++match_first; 769 goto forward_match_found_start_or_reached_end; 770 771 case 6: 772 /* Fastmap without translation, match forward. */ 773 while (BE (match_first < right_lim, 1) 774 && !fastmap[(unsigned char) string[match_first]]) 775 ++match_first; 776 777 forward_match_found_start_or_reached_end: 778 if (BE (match_first == right_lim, 0)) 779 { 780 ch = match_first >= length 781 ? 0 : (unsigned char) string[match_first]; 782 if (!fastmap[t ? t[ch] : ch]) 783 goto free_return; 784 } 785 break; 786 787 case 4: 788 case 5: 789 /* Fastmap without multi-byte translation, match backwards. */ 790 while (match_first >= left_lim) 791 { 792 ch = match_first >= length 793 ? 0 : (unsigned char) string[match_first]; 794 if (fastmap[t ? t[ch] : ch]) 795 break; 796 --match_first; 797 } 798 if (match_first < left_lim) 799 goto free_return; 800 break; 801 802 default: 803 /* In this case, we can't determine easily the current byte, 804 since it might be a component byte of a multibyte 805 character. Then we use the constructed buffer instead. */ 806 for (;;) 807 { 808 /* If MATCH_FIRST is out of the valid range, reconstruct the 809 buffers. */ 810 __re_size_t offset = match_first - mctx.input.raw_mbs_idx; 811 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0)) 812 { 813 err = re_string_reconstruct (&mctx.input, match_first, 814 eflags); 815 if (BE (err != REG_NOERROR, 0)) 816 goto free_return; 817 818 offset = match_first - mctx.input.raw_mbs_idx; 819 } 820 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 821 Note that MATCH_FIRST must not be smaller than 0. */ 822 ch = (match_first >= length 823 ? 0 : re_string_byte_at (&mctx.input, offset)); 824 if (fastmap[ch]) 825 break; 826 match_first += incr; 827 if (match_first < left_lim || match_first > right_lim) 828 { 829 err = REG_NOMATCH; 830 goto free_return; 831 } 832 } 833 break; 834 } 835 836 /* Reconstruct the buffers so that the matcher can assume that 837 the matching starts from the beginning of the buffer. */ 838 err = re_string_reconstruct (&mctx.input, match_first, eflags); 839 if (BE (err != REG_NOERROR, 0)) 840 goto free_return; 841 842#ifdef RE_ENABLE_I18N 843 /* Don't consider this char as a possible match start if it part, 844 yet isn't the head, of a multibyte character. */ 845 if (!sb && !re_string_first_byte (&mctx.input, 0)) 846 continue; 847#endif 848 849 /* It seems to be appropriate one, then use the matcher. */ 850 /* We assume that the matching starts from 0. */ 851 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 852 match_last = check_matching (&mctx, fl_longest_match, 853 start <= last_start ? &match_first : NULL); 854 if (match_last != REG_MISSING) 855 { 856 if (BE (match_last == REG_ERROR, 0)) 857 { 858 err = REG_ESPACE; 859 goto free_return; 860 } 861 else 862 { 863 mctx.match_last = match_last; 864 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 865 { 866 re_dfastate_t *pstate = mctx.state_log[match_last]; 867 mctx.last_node = check_halt_state_context (&mctx, pstate, 868 match_last); 869 } 870 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 871 || dfa->nbackref) 872 { 873 err = prune_impossible_nodes (&mctx); 874 if (err == REG_NOERROR) 875 break; 876 if (BE (err != REG_NOMATCH, 0)) 877 goto free_return; 878 match_last = REG_MISSING; 879 } 880 else 881 break; /* We found a match. */ 882 } 883 } 884 885 match_ctx_clean (&mctx); 886 } 887 888#ifdef DEBUG 889 assert (match_last != REG_MISSING); 890 assert (err == REG_NOERROR); 891#endif 892 893 /* Set pmatch[] if we need. */ 894 if (nmatch > 0) 895 { 896 Idx reg_idx; 897 898 /* Initialize registers. */ 899 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 900 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 901 902 /* Set the points where matching start/end. */ 903 pmatch[0].rm_so = 0; 904 pmatch[0].rm_eo = mctx.match_last; 905 /* FIXME: This function should fail if mctx.match_last exceeds 906 the maximum possible regoff_t value. We need a new error 907 code REG_OVERFLOW. */ 908 909 if (!preg->no_sub && nmatch > 1) 910 { 911 err = set_regs (preg, &mctx, nmatch, pmatch, 912 dfa->has_plural_match && dfa->nbackref > 0); 913 if (BE (err != REG_NOERROR, 0)) 914 goto free_return; 915 } 916 917 /* At last, add the offset to each register, since we slid 918 the buffers so that we could assume that the matching starts 919 from 0. */ 920 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 921 if (pmatch[reg_idx].rm_so != -1) 922 { 923#ifdef RE_ENABLE_I18N 924 if (BE (mctx.input.offsets_needed != 0, 0)) 925 { 926 pmatch[reg_idx].rm_so = 927 (pmatch[reg_idx].rm_so == mctx.input.valid_len 928 ? mctx.input.valid_raw_len 929 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 930 pmatch[reg_idx].rm_eo = 931 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 932 ? mctx.input.valid_raw_len 933 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 934 } 935#else 936 assert (mctx.input.offsets_needed == 0); 937#endif 938 pmatch[reg_idx].rm_so += match_first; 939 pmatch[reg_idx].rm_eo += match_first; 940 } 941 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 942 { 943 pmatch[nmatch + reg_idx].rm_so = -1; 944 pmatch[nmatch + reg_idx].rm_eo = -1; 945 } 946 947 if (dfa->subexp_map) 948 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 949 if (dfa->subexp_map[reg_idx] != reg_idx) 950 { 951 pmatch[reg_idx + 1].rm_so 952 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 953 pmatch[reg_idx + 1].rm_eo 954 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 955 } 956 } 957 958 free_return: 959 re_free (mctx.state_log); 960 if (dfa->nbackref) 961 match_ctx_free (&mctx); 962 re_string_destruct (&mctx.input); 963 return err; 964} 965 966static reg_errcode_t 967__attribute_warn_unused_result__ 968prune_impossible_nodes (re_match_context_t *mctx) 969{ 970 const re_dfa_t *const dfa = mctx->dfa; 971 Idx halt_node, match_last; 972 reg_errcode_t ret; 973 re_dfastate_t **sifted_states; 974 re_dfastate_t **lim_states = NULL; 975 re_sift_context_t sctx; 976#ifdef DEBUG 977 assert (mctx->state_log != NULL); 978#endif 979 match_last = mctx->match_last; 980 halt_node = mctx->last_node; 981 982 /* Avoid overflow. */ 983 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0)) 984 return REG_ESPACE; 985 986 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 987 if (BE (sifted_states == NULL, 0)) 988 { 989 ret = REG_ESPACE; 990 goto free_return; 991 } 992 if (dfa->nbackref) 993 { 994 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 995 if (BE (lim_states == NULL, 0)) 996 { 997 ret = REG_ESPACE; 998 goto free_return; 999 } 1000 while (1) 1001 { 1002 memset (lim_states, '\0', 1003 sizeof (re_dfastate_t *) * (match_last + 1)); 1004 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 1005 match_last); 1006 ret = sift_states_backward (mctx, &sctx); 1007 re_node_set_free (&sctx.limits); 1008 if (BE (ret != REG_NOERROR, 0)) 1009 goto free_return; 1010 if (sifted_states[0] != NULL || lim_states[0] != NULL) 1011 break; 1012 do 1013 { 1014 --match_last; 1015 if (! REG_VALID_INDEX (match_last)) 1016 { 1017 ret = REG_NOMATCH; 1018 goto free_return; 1019 } 1020 } while (mctx->state_log[match_last] == NULL 1021 || !mctx->state_log[match_last]->halt); 1022 halt_node = check_halt_state_context (mctx, 1023 mctx->state_log[match_last], 1024 match_last); 1025 } 1026 ret = merge_state_array (dfa, sifted_states, lim_states, 1027 match_last + 1); 1028 re_free (lim_states); 1029 lim_states = NULL; 1030 if (BE (ret != REG_NOERROR, 0)) 1031 goto free_return; 1032 } 1033 else 1034 { 1035 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1036 ret = sift_states_backward (mctx, &sctx); 1037 re_node_set_free (&sctx.limits); 1038 if (BE (ret != REG_NOERROR, 0)) 1039 goto free_return; 1040 if (sifted_states[0] == NULL) 1041 { 1042 ret = REG_NOMATCH; 1043 goto free_return; 1044 } 1045 } 1046 re_free (mctx->state_log); 1047 mctx->state_log = sifted_states; 1048 sifted_states = NULL; 1049 mctx->last_node = halt_node; 1050 mctx->match_last = match_last; 1051 ret = REG_NOERROR; 1052 free_return: 1053 re_free (sifted_states); 1054 re_free (lim_states); 1055 return ret; 1056} 1057 1058/* Acquire an initial state and return it. 1059 We must select appropriate initial state depending on the context, 1060 since initial states may have constraints like "\<", "^", etc.. */ 1061 1062static inline re_dfastate_t * 1063__attribute__ ((always_inline)) internal_function 1064acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1065 Idx idx) 1066{ 1067 const re_dfa_t *const dfa = mctx->dfa; 1068 if (dfa->init_state->has_constraint) 1069 { 1070 unsigned int context; 1071 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1072 if (IS_WORD_CONTEXT (context)) 1073 return dfa->init_state_word; 1074 else if (IS_ORDINARY_CONTEXT (context)) 1075 return dfa->init_state; 1076 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1077 return dfa->init_state_begbuf; 1078 else if (IS_NEWLINE_CONTEXT (context)) 1079 return dfa->init_state_nl; 1080 else if (IS_BEGBUF_CONTEXT (context)) 1081 { 1082 /* It is relatively rare case, then calculate on demand. */ 1083 return re_acquire_state_context (err, dfa, 1084 dfa->init_state->entrance_nodes, 1085 context); 1086 } 1087 else 1088 /* Must not happen? */ 1089 return dfa->init_state; 1090 } 1091 else 1092 return dfa->init_state; 1093} 1094 1095/* Check whether the regular expression match input string INPUT or not, 1096 and return the index where the matching end. Return REG_MISSING if 1097 there is no match, and return REG_ERROR in case of an error. 1098 FL_LONGEST_MATCH means we want the POSIX longest matching. 1099 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1100 next place where we may want to try matching. 1101 Note that the matcher assumes that the matching starts from the current 1102 index of the buffer. */ 1103 1104static Idx 1105internal_function __attribute_warn_unused_result__ 1106check_matching (re_match_context_t *mctx, bool fl_longest_match, 1107 Idx *p_match_first) 1108{ 1109 const re_dfa_t *const dfa = mctx->dfa; 1110 reg_errcode_t err; 1111 Idx match = 0; 1112 Idx match_last = REG_MISSING; 1113 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 1114 re_dfastate_t *cur_state; 1115 bool at_init_state = p_match_first != NULL; 1116 Idx next_start_idx = cur_str_idx; 1117 1118 err = REG_NOERROR; 1119 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1120 /* An initial state must not be NULL (invalid). */ 1121 if (BE (cur_state == NULL, 0)) 1122 { 1123 assert (err == REG_ESPACE); 1124 return REG_ERROR; 1125 } 1126 1127 if (mctx->state_log != NULL) 1128 { 1129 mctx->state_log[cur_str_idx] = cur_state; 1130 1131 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1132 later. E.g. Processing back references. */ 1133 if (BE (dfa->nbackref, 0)) 1134 { 1135 at_init_state = false; 1136 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1137 if (BE (err != REG_NOERROR, 0)) 1138 return err; 1139 1140 if (cur_state->has_backref) 1141 { 1142 err = transit_state_bkref (mctx, &cur_state->nodes); 1143 if (BE (err != REG_NOERROR, 0)) 1144 return err; 1145 } 1146 } 1147 } 1148 1149 /* If the RE accepts NULL string. */ 1150 if (BE (cur_state->halt, 0)) 1151 { 1152 if (!cur_state->has_constraint 1153 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1154 { 1155 if (!fl_longest_match) 1156 return cur_str_idx; 1157 else 1158 { 1159 match_last = cur_str_idx; 1160 match = 1; 1161 } 1162 } 1163 } 1164 1165 while (!re_string_eoi (&mctx->input)) 1166 { 1167 re_dfastate_t *old_state = cur_state; 1168 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1169 1170 if ((BE (next_char_idx >= mctx->input.bufs_len, 0) 1171 && mctx->input.bufs_len < mctx->input.len) 1172 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1173 && mctx->input.valid_len < mctx->input.len)) 1174 { 1175 err = extend_buffers (mctx, next_char_idx + 1); 1176 if (BE (err != REG_NOERROR, 0)) 1177 { 1178 assert (err == REG_ESPACE); 1179 return REG_ERROR; 1180 } 1181 } 1182 1183 cur_state = transit_state (&err, mctx, cur_state); 1184 if (mctx->state_log != NULL) 1185 cur_state = merge_state_with_log (&err, mctx, cur_state); 1186 1187 if (cur_state == NULL) 1188 { 1189 /* Reached the invalid state or an error. Try to recover a valid 1190 state using the state log, if available and if we have not 1191 already found a valid (even if not the longest) match. */ 1192 if (BE (err != REG_NOERROR, 0)) 1193 return REG_ERROR; 1194 1195 if (mctx->state_log == NULL 1196 || (match && !fl_longest_match) 1197 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1198 break; 1199 } 1200 1201 if (BE (at_init_state, 0)) 1202 { 1203 if (old_state == cur_state) 1204 next_start_idx = next_char_idx; 1205 else 1206 at_init_state = false; 1207 } 1208 1209 if (cur_state->halt) 1210 { 1211 /* Reached a halt state. 1212 Check the halt state can satisfy the current context. */ 1213 if (!cur_state->has_constraint 1214 || check_halt_state_context (mctx, cur_state, 1215 re_string_cur_idx (&mctx->input))) 1216 { 1217 /* We found an appropriate halt state. */ 1218 match_last = re_string_cur_idx (&mctx->input); 1219 match = 1; 1220 1221 /* We found a match, do not modify match_first below. */ 1222 p_match_first = NULL; 1223 if (!fl_longest_match) 1224 break; 1225 } 1226 } 1227 } 1228 1229 if (p_match_first) 1230 *p_match_first += next_start_idx; 1231 1232 return match_last; 1233} 1234 1235/* Check NODE match the current context. */ 1236 1237static bool 1238internal_function 1239check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context) 1240{ 1241 re_token_type_t type = dfa->nodes[node].type; 1242 unsigned int constraint = dfa->nodes[node].constraint; 1243 if (type != END_OF_RE) 1244 return false; 1245 if (!constraint) 1246 return true; 1247 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1248 return false; 1249 return true; 1250} 1251 1252/* Check the halt state STATE match the current context. 1253 Return 0 if not match, if the node, STATE has, is a halt node and 1254 match the context, return the node. */ 1255 1256static Idx 1257internal_function 1258check_halt_state_context (const re_match_context_t *mctx, 1259 const re_dfastate_t *state, Idx idx) 1260{ 1261 Idx i; 1262 unsigned int context; 1263#ifdef DEBUG 1264 assert (state->halt); 1265#endif 1266 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1267 for (i = 0; i < state->nodes.nelem; ++i) 1268 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1269 return state->nodes.elems[i]; 1270 return 0; 1271} 1272 1273/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1274 corresponding to the DFA). 1275 Return the destination node, and update EPS_VIA_NODES; 1276 return REG_MISSING in case of errors. */ 1277 1278static Idx 1279internal_function 1280proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, 1281 Idx *pidx, Idx node, re_node_set *eps_via_nodes, 1282 struct re_fail_stack_t *fs) 1283{ 1284 const re_dfa_t *const dfa = mctx->dfa; 1285 Idx i; 1286 bool ok; 1287 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1288 { 1289 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1290 re_node_set *edests = &dfa->edests[node]; 1291 Idx dest_node; 1292 ok = re_node_set_insert (eps_via_nodes, node); 1293 if (BE (! ok, 0)) 1294 return REG_ERROR; 1295 /* Pick up a valid destination, or return REG_MISSING if none 1296 is found. */ 1297 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i) 1298 { 1299 Idx candidate = edests->elems[i]; 1300 if (!re_node_set_contains (cur_nodes, candidate)) 1301 continue; 1302 if (dest_node == REG_MISSING) 1303 dest_node = candidate; 1304 1305 else 1306 { 1307 /* In order to avoid infinite loop like "(a*)*", return the second 1308 epsilon-transition if the first was already considered. */ 1309 if (re_node_set_contains (eps_via_nodes, dest_node)) 1310 return candidate; 1311 1312 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1313 else if (fs != NULL 1314 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1315 eps_via_nodes)) 1316 return REG_ERROR; 1317 1318 /* We know we are going to exit. */ 1319 break; 1320 } 1321 } 1322 return dest_node; 1323 } 1324 else 1325 { 1326 Idx naccepted = 0; 1327 re_token_type_t type = dfa->nodes[node].type; 1328 1329#ifdef RE_ENABLE_I18N 1330 if (dfa->nodes[node].accept_mb) 1331 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1332 else 1333#endif /* RE_ENABLE_I18N */ 1334 if (type == OP_BACK_REF) 1335 { 1336 Idx subexp_idx = dfa->nodes[node].opr.idx + 1; 1337 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1338 if (fs != NULL) 1339 { 1340 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1341 return REG_MISSING; 1342 else if (naccepted) 1343 { 1344 char *buf = (char *) re_string_get_buffer (&mctx->input); 1345 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1346 naccepted) != 0) 1347 return REG_MISSING; 1348 } 1349 } 1350 1351 if (naccepted == 0) 1352 { 1353 Idx dest_node; 1354 ok = re_node_set_insert (eps_via_nodes, node); 1355 if (BE (! ok, 0)) 1356 return REG_ERROR; 1357 dest_node = dfa->edests[node].elems[0]; 1358 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1359 dest_node)) 1360 return dest_node; 1361 } 1362 } 1363 1364 if (naccepted != 0 1365 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1366 { 1367 Idx dest_node = dfa->nexts[node]; 1368 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1369 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1370 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1371 dest_node))) 1372 return REG_MISSING; 1373 re_node_set_empty (eps_via_nodes); 1374 return dest_node; 1375 } 1376 } 1377 return REG_MISSING; 1378} 1379 1380static reg_errcode_t 1381internal_function __attribute_warn_unused_result__ 1382push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, 1383 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1384{ 1385 reg_errcode_t err; 1386 Idx num = fs->num++; 1387 if (fs->num == fs->alloc) 1388 { 1389 struct re_fail_stack_ent_t *new_array; 1390 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1391 * fs->alloc * 2)); 1392 if (new_array == NULL) 1393 return REG_ESPACE; 1394 fs->alloc *= 2; 1395 fs->stack = new_array; 1396 } 1397 fs->stack[num].idx = str_idx; 1398 fs->stack[num].node = dest_node; 1399 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1400 if (fs->stack[num].regs == NULL) 1401 return REG_ESPACE; 1402 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1403 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1404 return err; 1405} 1406 1407static Idx 1408internal_function 1409pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs, 1410 regmatch_t *regs, re_node_set *eps_via_nodes) 1411{ 1412 Idx num = --fs->num; 1413 assert (REG_VALID_INDEX (num)); 1414 *pidx = fs->stack[num].idx; 1415 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1416 re_node_set_free (eps_via_nodes); 1417 re_free (fs->stack[num].regs); 1418 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1419 return fs->stack[num].node; 1420} 1421 1422/* Set the positions where the subexpressions are starts/ends to registers 1423 PMATCH. 1424 Note: We assume that pmatch[0] is already set, and 1425 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1426 1427static reg_errcode_t 1428internal_function __attribute_warn_unused_result__ 1429set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1430 regmatch_t *pmatch, bool fl_backtrack) 1431{ 1432 const re_dfa_t *dfa = preg->buffer; 1433 Idx idx, cur_node; 1434 re_node_set eps_via_nodes; 1435 struct re_fail_stack_t *fs; 1436 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1437 regmatch_t *prev_idx_match; 1438 bool prev_idx_match_malloced = false; 1439 1440#ifdef DEBUG 1441 assert (nmatch > 1); 1442 assert (mctx->state_log != NULL); 1443#endif 1444 if (fl_backtrack) 1445 { 1446 fs = &fs_body; 1447 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1448 if (fs->stack == NULL) 1449 return REG_ESPACE; 1450 } 1451 else 1452 fs = NULL; 1453 1454 cur_node = dfa->init_node; 1455 re_node_set_init_empty (&eps_via_nodes); 1456 1457 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1458 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1459 else 1460 { 1461 prev_idx_match = re_malloc (regmatch_t, nmatch); 1462 if (prev_idx_match == NULL) 1463 { 1464 free_fail_stack_return (fs); 1465 return REG_ESPACE; 1466 } 1467 prev_idx_match_malloced = true; 1468 } 1469 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1470 1471 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1472 { 1473 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1474 1475 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1476 { 1477 Idx reg_idx; 1478 if (fs) 1479 { 1480 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1481 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1482 break; 1483 if (reg_idx == nmatch) 1484 { 1485 re_node_set_free (&eps_via_nodes); 1486 if (prev_idx_match_malloced) 1487 re_free (prev_idx_match); 1488 return free_fail_stack_return (fs); 1489 } 1490 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1491 &eps_via_nodes); 1492 } 1493 else 1494 { 1495 re_node_set_free (&eps_via_nodes); 1496 if (prev_idx_match_malloced) 1497 re_free (prev_idx_match); 1498 return REG_NOERROR; 1499 } 1500 } 1501 1502 /* Proceed to next node. */ 1503 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1504 &eps_via_nodes, fs); 1505 1506 if (BE (! REG_VALID_INDEX (cur_node), 0)) 1507 { 1508 if (BE (cur_node == REG_ERROR, 0)) 1509 { 1510 re_node_set_free (&eps_via_nodes); 1511 if (prev_idx_match_malloced) 1512 re_free (prev_idx_match); 1513 free_fail_stack_return (fs); 1514 return REG_ESPACE; 1515 } 1516 if (fs) 1517 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1518 &eps_via_nodes); 1519 else 1520 { 1521 re_node_set_free (&eps_via_nodes); 1522 if (prev_idx_match_malloced) 1523 re_free (prev_idx_match); 1524 return REG_NOMATCH; 1525 } 1526 } 1527 } 1528 re_node_set_free (&eps_via_nodes); 1529 if (prev_idx_match_malloced) 1530 re_free (prev_idx_match); 1531 return free_fail_stack_return (fs); 1532} 1533 1534static reg_errcode_t 1535internal_function 1536free_fail_stack_return (struct re_fail_stack_t *fs) 1537{ 1538 if (fs) 1539 { 1540 Idx fs_idx; 1541 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1542 { 1543 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1544 re_free (fs->stack[fs_idx].regs); 1545 } 1546 re_free (fs->stack); 1547 } 1548 return REG_NOERROR; 1549} 1550 1551static void 1552internal_function 1553update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1554 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch) 1555{ 1556 int type = dfa->nodes[cur_node].type; 1557 if (type == OP_OPEN_SUBEXP) 1558 { 1559 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1560 1561 /* We are at the first node of this sub expression. */ 1562 if (reg_num < nmatch) 1563 { 1564 pmatch[reg_num].rm_so = cur_idx; 1565 pmatch[reg_num].rm_eo = -1; 1566 } 1567 } 1568 else if (type == OP_CLOSE_SUBEXP) 1569 { 1570 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1571 if (reg_num < nmatch) 1572 { 1573 /* We are at the last node of this sub expression. */ 1574 if (pmatch[reg_num].rm_so < cur_idx) 1575 { 1576 pmatch[reg_num].rm_eo = cur_idx; 1577 /* This is a non-empty match or we are not inside an optional 1578 subexpression. Accept this right away. */ 1579 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1580 } 1581 else 1582 { 1583 if (dfa->nodes[cur_node].opt_subexp 1584 && prev_idx_match[reg_num].rm_so != -1) 1585 /* We transited through an empty match for an optional 1586 subexpression, like (a?)*, and this is not the subexp's 1587 first match. Copy back the old content of the registers 1588 so that matches of an inner subexpression are undone as 1589 well, like in ((a?))*. */ 1590 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1591 else 1592 /* We completed a subexpression, but it may be part of 1593 an optional one, so do not update PREV_IDX_MATCH. */ 1594 pmatch[reg_num].rm_eo = cur_idx; 1595 } 1596 } 1597 } 1598} 1599 1600/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1601 and sift the nodes in each states according to the following rules. 1602 Updated state_log will be wrote to STATE_LOG. 1603 1604 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if... 1605 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1606 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to 1607 the LAST_NODE, we throw away the node 'a'. 1608 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts 1609 string 's' and transit to 'b': 1610 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1611 away the node 'a'. 1612 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1613 thrown away, we throw away the node 'a'. 1614 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1615 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1616 node 'a'. 1617 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1618 we throw away the node 'a'. */ 1619 1620#define STATE_NODE_CONTAINS(state,node) \ 1621 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1622 1623static reg_errcode_t 1624internal_function 1625sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1626{ 1627 reg_errcode_t err; 1628 int null_cnt = 0; 1629 Idx str_idx = sctx->last_str_idx; 1630 re_node_set cur_dest; 1631 1632#ifdef DEBUG 1633 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1634#endif 1635 1636 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1637 transit to the last_node and the last_node itself. */ 1638 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1639 if (BE (err != REG_NOERROR, 0)) 1640 return err; 1641 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1642 if (BE (err != REG_NOERROR, 0)) 1643 goto free_return; 1644 1645 /* Then check each states in the state_log. */ 1646 while (str_idx > 0) 1647 { 1648 /* Update counters. */ 1649 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1650 if (null_cnt > mctx->max_mb_elem_len) 1651 { 1652 memset (sctx->sifted_states, '\0', 1653 sizeof (re_dfastate_t *) * str_idx); 1654 re_node_set_free (&cur_dest); 1655 return REG_NOERROR; 1656 } 1657 re_node_set_empty (&cur_dest); 1658 --str_idx; 1659 1660 if (mctx->state_log[str_idx]) 1661 { 1662 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1663 if (BE (err != REG_NOERROR, 0)) 1664 goto free_return; 1665 } 1666 1667 /* Add all the nodes which satisfy the following conditions: 1668 - It can epsilon transit to a node in CUR_DEST. 1669 - It is in CUR_SRC. 1670 And update state_log. */ 1671 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1672 if (BE (err != REG_NOERROR, 0)) 1673 goto free_return; 1674 } 1675 err = REG_NOERROR; 1676 free_return: 1677 re_node_set_free (&cur_dest); 1678 return err; 1679} 1680 1681static reg_errcode_t 1682internal_function __attribute_warn_unused_result__ 1683build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1684 Idx str_idx, re_node_set *cur_dest) 1685{ 1686 const re_dfa_t *const dfa = mctx->dfa; 1687 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1688 Idx i; 1689 1690 /* Then build the next sifted state. 1691 We build the next sifted state on 'cur_dest', and update 1692 'sifted_states[str_idx]' with 'cur_dest'. 1693 Note: 1694 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'. 1695 'cur_src' points the node_set of the old 'state_log[str_idx]' 1696 (with the epsilon nodes pre-filtered out). */ 1697 for (i = 0; i < cur_src->nelem; i++) 1698 { 1699 Idx prev_node = cur_src->elems[i]; 1700 int naccepted = 0; 1701 bool ok; 1702 1703#ifdef DEBUG 1704 re_token_type_t type = dfa->nodes[prev_node].type; 1705 assert (!IS_EPSILON_NODE (type)); 1706#endif 1707#ifdef RE_ENABLE_I18N 1708 /* If the node may accept "multi byte". */ 1709 if (dfa->nodes[prev_node].accept_mb) 1710 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1711 str_idx, sctx->last_str_idx); 1712#endif /* RE_ENABLE_I18N */ 1713 1714 /* We don't check backreferences here. 1715 See update_cur_sifted_state(). */ 1716 if (!naccepted 1717 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1718 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1719 dfa->nexts[prev_node])) 1720 naccepted = 1; 1721 1722 if (naccepted == 0) 1723 continue; 1724 1725 if (sctx->limits.nelem) 1726 { 1727 Idx to_idx = str_idx + naccepted; 1728 if (check_dst_limits (mctx, &sctx->limits, 1729 dfa->nexts[prev_node], to_idx, 1730 prev_node, str_idx)) 1731 continue; 1732 } 1733 ok = re_node_set_insert (cur_dest, prev_node); 1734 if (BE (! ok, 0)) 1735 return REG_ESPACE; 1736 } 1737 1738 return REG_NOERROR; 1739} 1740 1741/* Helper functions. */ 1742 1743static reg_errcode_t 1744internal_function 1745clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx) 1746{ 1747 Idx top = mctx->state_log_top; 1748 1749 if ((next_state_log_idx >= mctx->input.bufs_len 1750 && mctx->input.bufs_len < mctx->input.len) 1751 || (next_state_log_idx >= mctx->input.valid_len 1752 && mctx->input.valid_len < mctx->input.len)) 1753 { 1754 reg_errcode_t err; 1755 err = extend_buffers (mctx, next_state_log_idx + 1); 1756 if (BE (err != REG_NOERROR, 0)) 1757 return err; 1758 } 1759 1760 if (top < next_state_log_idx) 1761 { 1762 memset (mctx->state_log + top + 1, '\0', 1763 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1764 mctx->state_log_top = next_state_log_idx; 1765 } 1766 return REG_NOERROR; 1767} 1768 1769static reg_errcode_t 1770internal_function 1771merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1772 re_dfastate_t **src, Idx num) 1773{ 1774 Idx st_idx; 1775 reg_errcode_t err; 1776 for (st_idx = 0; st_idx < num; ++st_idx) 1777 { 1778 if (dst[st_idx] == NULL) 1779 dst[st_idx] = src[st_idx]; 1780 else if (src[st_idx] != NULL) 1781 { 1782 re_node_set merged_set; 1783 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1784 &src[st_idx]->nodes); 1785 if (BE (err != REG_NOERROR, 0)) 1786 return err; 1787 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1788 re_node_set_free (&merged_set); 1789 if (BE (err != REG_NOERROR, 0)) 1790 return err; 1791 } 1792 } 1793 return REG_NOERROR; 1794} 1795 1796static reg_errcode_t 1797internal_function 1798update_cur_sifted_state (const re_match_context_t *mctx, 1799 re_sift_context_t *sctx, Idx str_idx, 1800 re_node_set *dest_nodes) 1801{ 1802 const re_dfa_t *const dfa = mctx->dfa; 1803 reg_errcode_t err = REG_NOERROR; 1804 const re_node_set *candidates; 1805 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1806 : &mctx->state_log[str_idx]->nodes); 1807 1808 if (dest_nodes->nelem == 0) 1809 sctx->sifted_states[str_idx] = NULL; 1810 else 1811 { 1812 if (candidates) 1813 { 1814 /* At first, add the nodes which can epsilon transit to a node in 1815 DEST_NODE. */ 1816 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1817 if (BE (err != REG_NOERROR, 0)) 1818 return err; 1819 1820 /* Then, check the limitations in the current sift_context. */ 1821 if (sctx->limits.nelem) 1822 { 1823 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1824 mctx->bkref_ents, str_idx); 1825 if (BE (err != REG_NOERROR, 0)) 1826 return err; 1827 } 1828 } 1829 1830 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1831 if (BE (err != REG_NOERROR, 0)) 1832 return err; 1833 } 1834 1835 if (candidates && mctx->state_log[str_idx]->has_backref) 1836 { 1837 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1838 if (BE (err != REG_NOERROR, 0)) 1839 return err; 1840 } 1841 return REG_NOERROR; 1842} 1843 1844static reg_errcode_t 1845internal_function __attribute_warn_unused_result__ 1846add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1847 const re_node_set *candidates) 1848{ 1849 reg_errcode_t err = REG_NOERROR; 1850 Idx i; 1851 1852 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1853 if (BE (err != REG_NOERROR, 0)) 1854 return err; 1855 1856 if (!state->inveclosure.alloc) 1857 { 1858 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1859 if (BE (err != REG_NOERROR, 0)) 1860 return REG_ESPACE; 1861 for (i = 0; i < dest_nodes->nelem; i++) 1862 { 1863 err = re_node_set_merge (&state->inveclosure, 1864 dfa->inveclosures + dest_nodes->elems[i]); 1865 if (BE (err != REG_NOERROR, 0)) 1866 return REG_ESPACE; 1867 } 1868 } 1869 return re_node_set_add_intersect (dest_nodes, candidates, 1870 &state->inveclosure); 1871} 1872 1873static reg_errcode_t 1874internal_function 1875sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes, 1876 const re_node_set *candidates) 1877{ 1878 Idx ecl_idx; 1879 reg_errcode_t err; 1880 re_node_set *inv_eclosure = dfa->inveclosures + node; 1881 re_node_set except_nodes; 1882 re_node_set_init_empty (&except_nodes); 1883 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1884 { 1885 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1886 if (cur_node == node) 1887 continue; 1888 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1889 { 1890 Idx edst1 = dfa->edests[cur_node].elems[0]; 1891 Idx edst2 = ((dfa->edests[cur_node].nelem > 1) 1892 ? dfa->edests[cur_node].elems[1] : REG_MISSING); 1893 if ((!re_node_set_contains (inv_eclosure, edst1) 1894 && re_node_set_contains (dest_nodes, edst1)) 1895 || (REG_VALID_NONZERO_INDEX (edst2) 1896 && !re_node_set_contains (inv_eclosure, edst2) 1897 && re_node_set_contains (dest_nodes, edst2))) 1898 { 1899 err = re_node_set_add_intersect (&except_nodes, candidates, 1900 dfa->inveclosures + cur_node); 1901 if (BE (err != REG_NOERROR, 0)) 1902 { 1903 re_node_set_free (&except_nodes); 1904 return err; 1905 } 1906 } 1907 } 1908 } 1909 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1910 { 1911 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1912 if (!re_node_set_contains (&except_nodes, cur_node)) 1913 { 1914 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1915 re_node_set_remove_at (dest_nodes, idx); 1916 } 1917 } 1918 re_node_set_free (&except_nodes); 1919 return REG_NOERROR; 1920} 1921 1922static bool 1923internal_function 1924check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits, 1925 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx) 1926{ 1927 const re_dfa_t *const dfa = mctx->dfa; 1928 Idx lim_idx, src_pos, dst_pos; 1929 1930 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1931 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1932 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1933 { 1934 Idx subexp_idx; 1935 struct re_backref_cache_entry *ent; 1936 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1937 subexp_idx = dfa->nodes[ent->node].opr.idx; 1938 1939 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1940 subexp_idx, dst_node, dst_idx, 1941 dst_bkref_idx); 1942 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1943 subexp_idx, src_node, src_idx, 1944 src_bkref_idx); 1945 1946 /* In case of: 1947 <src> <dst> ( <subexp> ) 1948 ( <subexp> ) <src> <dst> 1949 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1950 if (src_pos == dst_pos) 1951 continue; /* This is unrelated limitation. */ 1952 else 1953 return true; 1954 } 1955 return false; 1956} 1957 1958static int 1959internal_function 1960check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1961 Idx subexp_idx, Idx from_node, Idx bkref_idx) 1962{ 1963 const re_dfa_t *const dfa = mctx->dfa; 1964 const re_node_set *eclosures = dfa->eclosures + from_node; 1965 Idx node_idx; 1966 1967 /* Else, we are on the boundary: examine the nodes on the epsilon 1968 closure. */ 1969 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1970 { 1971 Idx node = eclosures->elems[node_idx]; 1972 switch (dfa->nodes[node].type) 1973 { 1974 case OP_BACK_REF: 1975 if (bkref_idx != REG_MISSING) 1976 { 1977 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1978 do 1979 { 1980 Idx dst; 1981 int cpos; 1982 1983 if (ent->node != node) 1984 continue; 1985 1986 if (subexp_idx < BITSET_WORD_BITS 1987 && !(ent->eps_reachable_subexps_map 1988 & ((bitset_word_t) 1 << subexp_idx))) 1989 continue; 1990 1991 /* Recurse trying to reach the OP_OPEN_SUBEXP and 1992 OP_CLOSE_SUBEXP cases below. But, if the 1993 destination node is the same node as the source 1994 node, don't recurse because it would cause an 1995 infinite loop: a regex that exhibits this behavior 1996 is ()\1*\1* */ 1997 dst = dfa->edests[node].elems[0]; 1998 if (dst == from_node) 1999 { 2000 if (boundaries & 1) 2001 return -1; 2002 else /* if (boundaries & 2) */ 2003 return 0; 2004 } 2005 2006 cpos = 2007 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2008 dst, bkref_idx); 2009 if (cpos == -1 /* && (boundaries & 1) */) 2010 return -1; 2011 if (cpos == 0 && (boundaries & 2)) 2012 return 0; 2013 2014 if (subexp_idx < BITSET_WORD_BITS) 2015 ent->eps_reachable_subexps_map 2016 &= ~((bitset_word_t) 1 << subexp_idx); 2017 } 2018 while (ent++->more); 2019 } 2020 break; 2021 2022 case OP_OPEN_SUBEXP: 2023 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 2024 return -1; 2025 break; 2026 2027 case OP_CLOSE_SUBEXP: 2028 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 2029 return 0; 2030 break; 2031 2032 default: 2033 break; 2034 } 2035 } 2036 2037 return (boundaries & 2) ? 1 : 0; 2038} 2039 2040static int 2041internal_function 2042check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit, 2043 Idx subexp_idx, Idx from_node, Idx str_idx, 2044 Idx bkref_idx) 2045{ 2046 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2047 int boundaries; 2048 2049 /* If we are outside the range of the subexpression, return -1 or 1. */ 2050 if (str_idx < lim->subexp_from) 2051 return -1; 2052 2053 if (lim->subexp_to < str_idx) 2054 return 1; 2055 2056 /* If we are within the subexpression, return 0. */ 2057 boundaries = (str_idx == lim->subexp_from); 2058 boundaries |= (str_idx == lim->subexp_to) << 1; 2059 if (boundaries == 0) 2060 return 0; 2061 2062 /* Else, examine epsilon closure. */ 2063 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2064 from_node, bkref_idx); 2065} 2066 2067/* Check the limitations of sub expressions LIMITS, and remove the nodes 2068 which are against limitations from DEST_NODES. */ 2069 2070static reg_errcode_t 2071internal_function 2072check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2073 const re_node_set *candidates, re_node_set *limits, 2074 struct re_backref_cache_entry *bkref_ents, Idx str_idx) 2075{ 2076 reg_errcode_t err; 2077 Idx node_idx, lim_idx; 2078 2079 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2080 { 2081 Idx subexp_idx; 2082 struct re_backref_cache_entry *ent; 2083 ent = bkref_ents + limits->elems[lim_idx]; 2084 2085 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2086 continue; /* This is unrelated limitation. */ 2087 2088 subexp_idx = dfa->nodes[ent->node].opr.idx; 2089 if (ent->subexp_to == str_idx) 2090 { 2091 Idx ops_node = REG_MISSING; 2092 Idx cls_node = REG_MISSING; 2093 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2094 { 2095 Idx node = dest_nodes->elems[node_idx]; 2096 re_token_type_t type = dfa->nodes[node].type; 2097 if (type == OP_OPEN_SUBEXP 2098 && subexp_idx == dfa->nodes[node].opr.idx) 2099 ops_node = node; 2100 else if (type == OP_CLOSE_SUBEXP 2101 && subexp_idx == dfa->nodes[node].opr.idx) 2102 cls_node = node; 2103 } 2104 2105 /* Check the limitation of the open subexpression. */ 2106 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2107 if (REG_VALID_INDEX (ops_node)) 2108 { 2109 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2110 candidates); 2111 if (BE (err != REG_NOERROR, 0)) 2112 return err; 2113 } 2114 2115 /* Check the limitation of the close subexpression. */ 2116 if (REG_VALID_INDEX (cls_node)) 2117 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2118 { 2119 Idx node = dest_nodes->elems[node_idx]; 2120 if (!re_node_set_contains (dfa->inveclosures + node, 2121 cls_node) 2122 && !re_node_set_contains (dfa->eclosures + node, 2123 cls_node)) 2124 { 2125 /* It is against this limitation. 2126 Remove it form the current sifted state. */ 2127 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2128 candidates); 2129 if (BE (err != REG_NOERROR, 0)) 2130 return err; 2131 --node_idx; 2132 } 2133 } 2134 } 2135 else /* (ent->subexp_to != str_idx) */ 2136 { 2137 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2138 { 2139 Idx node = dest_nodes->elems[node_idx]; 2140 re_token_type_t type = dfa->nodes[node].type; 2141 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2142 { 2143 if (subexp_idx != dfa->nodes[node].opr.idx) 2144 continue; 2145 /* It is against this limitation. 2146 Remove it form the current sifted state. */ 2147 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2148 candidates); 2149 if (BE (err != REG_NOERROR, 0)) 2150 return err; 2151 } 2152 } 2153 } 2154 } 2155 return REG_NOERROR; 2156} 2157 2158static reg_errcode_t 2159internal_function __attribute_warn_unused_result__ 2160sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2161 Idx str_idx, const re_node_set *candidates) 2162{ 2163 const re_dfa_t *const dfa = mctx->dfa; 2164 reg_errcode_t err; 2165 Idx node_idx, node; 2166 re_sift_context_t local_sctx; 2167 Idx first_idx = search_cur_bkref_entry (mctx, str_idx); 2168 2169 if (first_idx == REG_MISSING) 2170 return REG_NOERROR; 2171 2172 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2173 2174 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2175 { 2176 Idx enabled_idx; 2177 re_token_type_t type; 2178 struct re_backref_cache_entry *entry; 2179 node = candidates->elems[node_idx]; 2180 type = dfa->nodes[node].type; 2181 /* Avoid infinite loop for the REs like "()\1+". */ 2182 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2183 continue; 2184 if (type != OP_BACK_REF) 2185 continue; 2186 2187 entry = mctx->bkref_ents + first_idx; 2188 enabled_idx = first_idx; 2189 do 2190 { 2191 Idx subexp_len; 2192 Idx to_idx; 2193 Idx dst_node; 2194 bool ok; 2195 re_dfastate_t *cur_state; 2196 2197 if (entry->node != node) 2198 continue; 2199 subexp_len = entry->subexp_to - entry->subexp_from; 2200 to_idx = str_idx + subexp_len; 2201 dst_node = (subexp_len ? dfa->nexts[node] 2202 : dfa->edests[node].elems[0]); 2203 2204 if (to_idx > sctx->last_str_idx 2205 || sctx->sifted_states[to_idx] == NULL 2206 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2207 || check_dst_limits (mctx, &sctx->limits, node, 2208 str_idx, dst_node, to_idx)) 2209 continue; 2210 2211 if (local_sctx.sifted_states == NULL) 2212 { 2213 local_sctx = *sctx; 2214 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2215 if (BE (err != REG_NOERROR, 0)) 2216 goto free_return; 2217 } 2218 local_sctx.last_node = node; 2219 local_sctx.last_str_idx = str_idx; 2220 ok = re_node_set_insert (&local_sctx.limits, enabled_idx); 2221 if (BE (! ok, 0)) 2222 { 2223 err = REG_ESPACE; 2224 goto free_return; 2225 } 2226 cur_state = local_sctx.sifted_states[str_idx]; 2227 err = sift_states_backward (mctx, &local_sctx); 2228 if (BE (err != REG_NOERROR, 0)) 2229 goto free_return; 2230 if (sctx->limited_states != NULL) 2231 { 2232 err = merge_state_array (dfa, sctx->limited_states, 2233 local_sctx.sifted_states, 2234 str_idx + 1); 2235 if (BE (err != REG_NOERROR, 0)) 2236 goto free_return; 2237 } 2238 local_sctx.sifted_states[str_idx] = cur_state; 2239 re_node_set_remove (&local_sctx.limits, enabled_idx); 2240 2241 /* mctx->bkref_ents may have changed, reload the pointer. */ 2242 entry = mctx->bkref_ents + enabled_idx; 2243 } 2244 while (enabled_idx++, entry++->more); 2245 } 2246 err = REG_NOERROR; 2247 free_return: 2248 if (local_sctx.sifted_states != NULL) 2249 { 2250 re_node_set_free (&local_sctx.limits); 2251 } 2252 2253 return err; 2254} 2255 2256 2257#ifdef RE_ENABLE_I18N 2258static int 2259internal_function 2260sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2261 Idx node_idx, Idx str_idx, Idx max_str_idx) 2262{ 2263 const re_dfa_t *const dfa = mctx->dfa; 2264 int naccepted; 2265 /* Check the node can accept "multi byte". */ 2266 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2267 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2268 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2269 dfa->nexts[node_idx])) 2270 /* The node can't accept the "multi byte", or the 2271 destination was already thrown away, then the node 2272 could't accept the current input "multi byte". */ 2273 naccepted = 0; 2274 /* Otherwise, it is sure that the node could accept 2275 'naccepted' bytes input. */ 2276 return naccepted; 2277} 2278#endif /* RE_ENABLE_I18N */ 2279 2280 2281/* Functions for state transition. */ 2282 2283/* Return the next state to which the current state STATE will transit by 2284 accepting the current input byte, and update STATE_LOG if necessary. 2285 If STATE can accept a multibyte char/collating element/back reference 2286 update the destination of STATE_LOG. */ 2287 2288static re_dfastate_t * 2289internal_function __attribute_warn_unused_result__ 2290transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2291 re_dfastate_t *state) 2292{ 2293 re_dfastate_t **trtable; 2294 unsigned char ch; 2295 2296#ifdef RE_ENABLE_I18N 2297 /* If the current state can accept multibyte. */ 2298 if (BE (state->accept_mb, 0)) 2299 { 2300 *err = transit_state_mb (mctx, state); 2301 if (BE (*err != REG_NOERROR, 0)) 2302 return NULL; 2303 } 2304#endif /* RE_ENABLE_I18N */ 2305 2306 /* Then decide the next state with the single byte. */ 2307#if 0 2308 if (0) 2309 /* don't use transition table */ 2310 return transit_state_sb (err, mctx, state); 2311#endif 2312 2313 /* Use transition table */ 2314 ch = re_string_fetch_byte (&mctx->input); 2315 for (;;) 2316 { 2317 trtable = state->trtable; 2318 if (BE (trtable != NULL, 1)) 2319 return trtable[ch]; 2320 2321 trtable = state->word_trtable; 2322 if (BE (trtable != NULL, 1)) 2323 { 2324 unsigned int context; 2325 context 2326 = re_string_context_at (&mctx->input, 2327 re_string_cur_idx (&mctx->input) - 1, 2328 mctx->eflags); 2329 if (IS_WORD_CONTEXT (context)) 2330 return trtable[ch + SBC_MAX]; 2331 else 2332 return trtable[ch]; 2333 } 2334 2335 if (!build_trtable (mctx->dfa, state)) 2336 { 2337 *err = REG_ESPACE; 2338 return NULL; 2339 } 2340 2341 /* Retry, we now have a transition table. */ 2342 } 2343} 2344 2345/* Update the state_log if we need */ 2346static re_dfastate_t * 2347internal_function 2348merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2349 re_dfastate_t *next_state) 2350{ 2351 const re_dfa_t *const dfa = mctx->dfa; 2352 Idx cur_idx = re_string_cur_idx (&mctx->input); 2353 2354 if (cur_idx > mctx->state_log_top) 2355 { 2356 mctx->state_log[cur_idx] = next_state; 2357 mctx->state_log_top = cur_idx; 2358 } 2359 else if (mctx->state_log[cur_idx] == 0) 2360 { 2361 mctx->state_log[cur_idx] = next_state; 2362 } 2363 else 2364 { 2365 re_dfastate_t *pstate; 2366 unsigned int context; 2367 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2368 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2369 the destination of a multibyte char/collating element/ 2370 back reference. Then the next state is the union set of 2371 these destinations and the results of the transition table. */ 2372 pstate = mctx->state_log[cur_idx]; 2373 log_nodes = pstate->entrance_nodes; 2374 if (next_state != NULL) 2375 { 2376 table_nodes = next_state->entrance_nodes; 2377 *err = re_node_set_init_union (&next_nodes, table_nodes, 2378 log_nodes); 2379 if (BE (*err != REG_NOERROR, 0)) 2380 return NULL; 2381 } 2382 else 2383 next_nodes = *log_nodes; 2384 /* Note: We already add the nodes of the initial state, 2385 then we don't need to add them here. */ 2386 2387 context = re_string_context_at (&mctx->input, 2388 re_string_cur_idx (&mctx->input) - 1, 2389 mctx->eflags); 2390 next_state = mctx->state_log[cur_idx] 2391 = re_acquire_state_context (err, dfa, &next_nodes, context); 2392 /* We don't need to check errors here, since the return value of 2393 this function is next_state and ERR is already set. */ 2394 2395 if (table_nodes != NULL) 2396 re_node_set_free (&next_nodes); 2397 } 2398 2399 if (BE (dfa->nbackref, 0) && next_state != NULL) 2400 { 2401 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2402 later. We must check them here, since the back references in the 2403 next state might use them. */ 2404 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2405 cur_idx); 2406 if (BE (*err != REG_NOERROR, 0)) 2407 return NULL; 2408 2409 /* If the next state has back references. */ 2410 if (next_state->has_backref) 2411 { 2412 *err = transit_state_bkref (mctx, &next_state->nodes); 2413 if (BE (*err != REG_NOERROR, 0)) 2414 return NULL; 2415 next_state = mctx->state_log[cur_idx]; 2416 } 2417 } 2418 2419 return next_state; 2420} 2421 2422/* Skip bytes in the input that correspond to part of a 2423 multi-byte match, then look in the log for a state 2424 from which to restart matching. */ 2425static re_dfastate_t * 2426internal_function 2427find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2428{ 2429 re_dfastate_t *cur_state; 2430 do 2431 { 2432 Idx max = mctx->state_log_top; 2433 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2434 2435 do 2436 { 2437 if (++cur_str_idx > max) 2438 return NULL; 2439 re_string_skip_bytes (&mctx->input, 1); 2440 } 2441 while (mctx->state_log[cur_str_idx] == NULL); 2442 2443 cur_state = merge_state_with_log (err, mctx, NULL); 2444 } 2445 while (*err == REG_NOERROR && cur_state == NULL); 2446 return cur_state; 2447} 2448 2449/* Helper functions for transit_state. */ 2450 2451/* From the node set CUR_NODES, pick up the nodes whose types are 2452 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2453 expression. And register them to use them later for evaluating the 2454 corresponding back references. */ 2455 2456static reg_errcode_t 2457internal_function 2458check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2459 Idx str_idx) 2460{ 2461 const re_dfa_t *const dfa = mctx->dfa; 2462 Idx node_idx; 2463 reg_errcode_t err; 2464 2465 /* TODO: This isn't efficient. 2466 Because there might be more than one nodes whose types are 2467 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2468 nodes. 2469 E.g. RE: (a){2} */ 2470 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2471 { 2472 Idx node = cur_nodes->elems[node_idx]; 2473 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2474 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2475 && (dfa->used_bkref_map 2476 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2477 { 2478 err = match_ctx_add_subtop (mctx, node, str_idx); 2479 if (BE (err != REG_NOERROR, 0)) 2480 return err; 2481 } 2482 } 2483 return REG_NOERROR; 2484} 2485 2486#if 0 2487/* Return the next state to which the current state STATE will transit by 2488 accepting the current input byte. */ 2489 2490static re_dfastate_t * 2491transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2492 re_dfastate_t *state) 2493{ 2494 const re_dfa_t *const dfa = mctx->dfa; 2495 re_node_set next_nodes; 2496 re_dfastate_t *next_state; 2497 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2498 unsigned int context; 2499 2500 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2501 if (BE (*err != REG_NOERROR, 0)) 2502 return NULL; 2503 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2504 { 2505 Idx cur_node = state->nodes.elems[node_cnt]; 2506 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2507 { 2508 *err = re_node_set_merge (&next_nodes, 2509 dfa->eclosures + dfa->nexts[cur_node]); 2510 if (BE (*err != REG_NOERROR, 0)) 2511 { 2512 re_node_set_free (&next_nodes); 2513 return NULL; 2514 } 2515 } 2516 } 2517 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2518 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2519 /* We don't need to check errors here, since the return value of 2520 this function is next_state and ERR is already set. */ 2521 2522 re_node_set_free (&next_nodes); 2523 re_string_skip_bytes (&mctx->input, 1); 2524 return next_state; 2525} 2526#endif 2527 2528#ifdef RE_ENABLE_I18N 2529static reg_errcode_t 2530internal_function 2531transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2532{ 2533 const re_dfa_t *const dfa = mctx->dfa; 2534 reg_errcode_t err; 2535 Idx i; 2536 2537 for (i = 0; i < pstate->nodes.nelem; ++i) 2538 { 2539 re_node_set dest_nodes, *new_nodes; 2540 Idx cur_node_idx = pstate->nodes.elems[i]; 2541 int naccepted; 2542 Idx dest_idx; 2543 unsigned int context; 2544 re_dfastate_t *dest_state; 2545 2546 if (!dfa->nodes[cur_node_idx].accept_mb) 2547 continue; 2548 2549 if (dfa->nodes[cur_node_idx].constraint) 2550 { 2551 context = re_string_context_at (&mctx->input, 2552 re_string_cur_idx (&mctx->input), 2553 mctx->eflags); 2554 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2555 context)) 2556 continue; 2557 } 2558 2559 /* How many bytes the node can accept? */ 2560 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2561 re_string_cur_idx (&mctx->input)); 2562 if (naccepted == 0) 2563 continue; 2564 2565 /* The node can accepts 'naccepted' bytes. */ 2566 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2567 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2568 : mctx->max_mb_elem_len); 2569 err = clean_state_log_if_needed (mctx, dest_idx); 2570 if (BE (err != REG_NOERROR, 0)) 2571 return err; 2572#ifdef DEBUG 2573 assert (dfa->nexts[cur_node_idx] != REG_MISSING); 2574#endif 2575 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2576 2577 dest_state = mctx->state_log[dest_idx]; 2578 if (dest_state == NULL) 2579 dest_nodes = *new_nodes; 2580 else 2581 { 2582 err = re_node_set_init_union (&dest_nodes, 2583 dest_state->entrance_nodes, new_nodes); 2584 if (BE (err != REG_NOERROR, 0)) 2585 return err; 2586 } 2587 context = re_string_context_at (&mctx->input, dest_idx - 1, 2588 mctx->eflags); 2589 mctx->state_log[dest_idx] 2590 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2591 if (dest_state != NULL) 2592 re_node_set_free (&dest_nodes); 2593 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2594 return err; 2595 } 2596 return REG_NOERROR; 2597} 2598#endif /* RE_ENABLE_I18N */ 2599 2600static reg_errcode_t 2601internal_function 2602transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2603{ 2604 const re_dfa_t *const dfa = mctx->dfa; 2605 reg_errcode_t err; 2606 Idx i; 2607 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2608 2609 for (i = 0; i < nodes->nelem; ++i) 2610 { 2611 Idx dest_str_idx, prev_nelem, bkc_idx; 2612 Idx node_idx = nodes->elems[i]; 2613 unsigned int context; 2614 const re_token_t *node = dfa->nodes + node_idx; 2615 re_node_set *new_dest_nodes; 2616 2617 /* Check whether 'node' is a backreference or not. */ 2618 if (node->type != OP_BACK_REF) 2619 continue; 2620 2621 if (node->constraint) 2622 { 2623 context = re_string_context_at (&mctx->input, cur_str_idx, 2624 mctx->eflags); 2625 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2626 continue; 2627 } 2628 2629 /* 'node' is a backreference. 2630 Check the substring which the substring matched. */ 2631 bkc_idx = mctx->nbkref_ents; 2632 err = get_subexp (mctx, node_idx, cur_str_idx); 2633 if (BE (err != REG_NOERROR, 0)) 2634 goto free_return; 2635 2636 /* And add the epsilon closures (which is 'new_dest_nodes') of 2637 the backreference to appropriate state_log. */ 2638#ifdef DEBUG 2639 assert (dfa->nexts[node_idx] != REG_MISSING); 2640#endif 2641 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2642 { 2643 Idx subexp_len; 2644 re_dfastate_t *dest_state; 2645 struct re_backref_cache_entry *bkref_ent; 2646 bkref_ent = mctx->bkref_ents + bkc_idx; 2647 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2648 continue; 2649 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2650 new_dest_nodes = (subexp_len == 0 2651 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2652 : dfa->eclosures + dfa->nexts[node_idx]); 2653 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2654 - bkref_ent->subexp_from); 2655 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2656 mctx->eflags); 2657 dest_state = mctx->state_log[dest_str_idx]; 2658 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2659 : mctx->state_log[cur_str_idx]->nodes.nelem); 2660 /* Add 'new_dest_node' to state_log. */ 2661 if (dest_state == NULL) 2662 { 2663 mctx->state_log[dest_str_idx] 2664 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2665 context); 2666 if (BE (mctx->state_log[dest_str_idx] == NULL 2667 && err != REG_NOERROR, 0)) 2668 goto free_return; 2669 } 2670 else 2671 { 2672 re_node_set dest_nodes; 2673 err = re_node_set_init_union (&dest_nodes, 2674 dest_state->entrance_nodes, 2675 new_dest_nodes); 2676 if (BE (err != REG_NOERROR, 0)) 2677 { 2678 re_node_set_free (&dest_nodes); 2679 goto free_return; 2680 } 2681 mctx->state_log[dest_str_idx] 2682 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2683 re_node_set_free (&dest_nodes); 2684 if (BE (mctx->state_log[dest_str_idx] == NULL 2685 && err != REG_NOERROR, 0)) 2686 goto free_return; 2687 } 2688 /* We need to check recursively if the backreference can epsilon 2689 transit. */ 2690 if (subexp_len == 0 2691 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2692 { 2693 err = check_subexp_matching_top (mctx, new_dest_nodes, 2694 cur_str_idx); 2695 if (BE (err != REG_NOERROR, 0)) 2696 goto free_return; 2697 err = transit_state_bkref (mctx, new_dest_nodes); 2698 if (BE (err != REG_NOERROR, 0)) 2699 goto free_return; 2700 } 2701 } 2702 } 2703 err = REG_NOERROR; 2704 free_return: 2705 return err; 2706} 2707 2708/* Enumerate all the candidates which the backreference BKREF_NODE can match 2709 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2710 Note that we might collect inappropriate candidates here. 2711 However, the cost of checking them strictly here is too high, then we 2712 delay these checking for prune_impossible_nodes(). */ 2713 2714static reg_errcode_t 2715internal_function __attribute_warn_unused_result__ 2716get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx) 2717{ 2718 const re_dfa_t *const dfa = mctx->dfa; 2719 Idx subexp_num, sub_top_idx; 2720 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2721 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2722 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2723 if (cache_idx != REG_MISSING) 2724 { 2725 const struct re_backref_cache_entry *entry 2726 = mctx->bkref_ents + cache_idx; 2727 do 2728 if (entry->node == bkref_node) 2729 return REG_NOERROR; /* We already checked it. */ 2730 while (entry++->more); 2731 } 2732 2733 subexp_num = dfa->nodes[bkref_node].opr.idx; 2734 2735 /* For each sub expression */ 2736 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2737 { 2738 reg_errcode_t err; 2739 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2740 re_sub_match_last_t *sub_last; 2741 Idx sub_last_idx, sl_str, bkref_str_off; 2742 2743 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2744 continue; /* It isn't related. */ 2745 2746 sl_str = sub_top->str_idx; 2747 bkref_str_off = bkref_str_idx; 2748 /* At first, check the last node of sub expressions we already 2749 evaluated. */ 2750 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2751 { 2752 regoff_t sl_str_diff; 2753 sub_last = sub_top->lasts[sub_last_idx]; 2754 sl_str_diff = sub_last->str_idx - sl_str; 2755 /* The matched string by the sub expression match with the substring 2756 at the back reference? */ 2757 if (sl_str_diff > 0) 2758 { 2759 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2760 { 2761 /* Not enough chars for a successful match. */ 2762 if (bkref_str_off + sl_str_diff > mctx->input.len) 2763 break; 2764 2765 err = clean_state_log_if_needed (mctx, 2766 bkref_str_off 2767 + sl_str_diff); 2768 if (BE (err != REG_NOERROR, 0)) 2769 return err; 2770 buf = (const char *) re_string_get_buffer (&mctx->input); 2771 } 2772 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2773 /* We don't need to search this sub expression any more. */ 2774 break; 2775 } 2776 bkref_str_off += sl_str_diff; 2777 sl_str += sl_str_diff; 2778 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2779 bkref_str_idx); 2780 2781 /* Reload buf, since the preceding call might have reallocated 2782 the buffer. */ 2783 buf = (const char *) re_string_get_buffer (&mctx->input); 2784 2785 if (err == REG_NOMATCH) 2786 continue; 2787 if (BE (err != REG_NOERROR, 0)) 2788 return err; 2789 } 2790 2791 if (sub_last_idx < sub_top->nlasts) 2792 continue; 2793 if (sub_last_idx > 0) 2794 ++sl_str; 2795 /* Then, search for the other last nodes of the sub expression. */ 2796 for (; sl_str <= bkref_str_idx; ++sl_str) 2797 { 2798 Idx cls_node; 2799 regoff_t sl_str_off; 2800 const re_node_set *nodes; 2801 sl_str_off = sl_str - sub_top->str_idx; 2802 /* The matched string by the sub expression match with the substring 2803 at the back reference? */ 2804 if (sl_str_off > 0) 2805 { 2806 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2807 { 2808 /* If we are at the end of the input, we cannot match. */ 2809 if (bkref_str_off >= mctx->input.len) 2810 break; 2811 2812 err = extend_buffers (mctx, bkref_str_off + 1); 2813 if (BE (err != REG_NOERROR, 0)) 2814 return err; 2815 2816 buf = (const char *) re_string_get_buffer (&mctx->input); 2817 } 2818 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2819 break; /* We don't need to search this sub expression 2820 any more. */ 2821 } 2822 if (mctx->state_log[sl_str] == NULL) 2823 continue; 2824 /* Does this state have a ')' of the sub expression? */ 2825 nodes = &mctx->state_log[sl_str]->nodes; 2826 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2827 OP_CLOSE_SUBEXP); 2828 if (cls_node == REG_MISSING) 2829 continue; /* No. */ 2830 if (sub_top->path == NULL) 2831 { 2832 sub_top->path = calloc (sizeof (state_array_t), 2833 sl_str - sub_top->str_idx + 1); 2834 if (sub_top->path == NULL) 2835 return REG_ESPACE; 2836 } 2837 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2838 in the current context? */ 2839 err = check_arrival (mctx, sub_top->path, sub_top->node, 2840 sub_top->str_idx, cls_node, sl_str, 2841 OP_CLOSE_SUBEXP); 2842 if (err == REG_NOMATCH) 2843 continue; 2844 if (BE (err != REG_NOERROR, 0)) 2845 return err; 2846 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2847 if (BE (sub_last == NULL, 0)) 2848 return REG_ESPACE; 2849 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2850 bkref_str_idx); 2851 if (err == REG_NOMATCH) 2852 continue; 2853 } 2854 } 2855 return REG_NOERROR; 2856} 2857 2858/* Helper functions for get_subexp(). */ 2859 2860/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2861 If it can arrive, register the sub expression expressed with SUB_TOP 2862 and SUB_LAST. */ 2863 2864static reg_errcode_t 2865internal_function 2866get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2867 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str) 2868{ 2869 reg_errcode_t err; 2870 Idx to_idx; 2871 /* Can the subexpression arrive the back reference? */ 2872 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2873 sub_last->str_idx, bkref_node, bkref_str, 2874 OP_OPEN_SUBEXP); 2875 if (err != REG_NOERROR) 2876 return err; 2877 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2878 sub_last->str_idx); 2879 if (BE (err != REG_NOERROR, 0)) 2880 return err; 2881 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2882 return clean_state_log_if_needed (mctx, to_idx); 2883} 2884 2885/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2886 Search '(' if FL_OPEN, or search ')' otherwise. 2887 TODO: This function isn't efficient... 2888 Because there might be more than one nodes whose types are 2889 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2890 nodes. 2891 E.g. RE: (a){2} */ 2892 2893static Idx 2894internal_function 2895find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2896 Idx subexp_idx, int type) 2897{ 2898 Idx cls_idx; 2899 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2900 { 2901 Idx cls_node = nodes->elems[cls_idx]; 2902 const re_token_t *node = dfa->nodes + cls_node; 2903 if (node->type == type 2904 && node->opr.idx == subexp_idx) 2905 return cls_node; 2906 } 2907 return REG_MISSING; 2908} 2909 2910/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2911 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2912 heavily reused. 2913 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2914 2915static reg_errcode_t 2916internal_function __attribute_warn_unused_result__ 2917check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node, 2918 Idx top_str, Idx last_node, Idx last_str, int type) 2919{ 2920 const re_dfa_t *const dfa = mctx->dfa; 2921 reg_errcode_t err = REG_NOERROR; 2922 Idx subexp_num, backup_cur_idx, str_idx, null_cnt; 2923 re_dfastate_t *cur_state = NULL; 2924 re_node_set *cur_nodes, next_nodes; 2925 re_dfastate_t **backup_state_log; 2926 unsigned int context; 2927 2928 subexp_num = dfa->nodes[top_node].opr.idx; 2929 /* Extend the buffer if we need. */ 2930 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2931 { 2932 re_dfastate_t **new_array; 2933 Idx old_alloc = path->alloc; 2934 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1; 2935 Idx new_alloc; 2936 if (BE (IDX_MAX - old_alloc < incr_alloc, 0)) 2937 return REG_ESPACE; 2938 new_alloc = old_alloc + incr_alloc; 2939 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0)) 2940 return REG_ESPACE; 2941 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc); 2942 if (BE (new_array == NULL, 0)) 2943 return REG_ESPACE; 2944 path->array = new_array; 2945 path->alloc = new_alloc; 2946 memset (new_array + old_alloc, '\0', 2947 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2948 } 2949 2950 str_idx = path->next_idx ? path->next_idx : top_str; 2951 2952 /* Temporary modify MCTX. */ 2953 backup_state_log = mctx->state_log; 2954 backup_cur_idx = mctx->input.cur_idx; 2955 mctx->state_log = path->array; 2956 mctx->input.cur_idx = str_idx; 2957 2958 /* Setup initial node set. */ 2959 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2960 if (str_idx == top_str) 2961 { 2962 err = re_node_set_init_1 (&next_nodes, top_node); 2963 if (BE (err != REG_NOERROR, 0)) 2964 return err; 2965 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2966 if (BE (err != REG_NOERROR, 0)) 2967 { 2968 re_node_set_free (&next_nodes); 2969 return err; 2970 } 2971 } 2972 else 2973 { 2974 cur_state = mctx->state_log[str_idx]; 2975 if (cur_state && cur_state->has_backref) 2976 { 2977 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2978 if (BE (err != REG_NOERROR, 0)) 2979 return err; 2980 } 2981 else 2982 re_node_set_init_empty (&next_nodes); 2983 } 2984 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2985 { 2986 if (next_nodes.nelem) 2987 { 2988 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2989 subexp_num, type); 2990 if (BE (err != REG_NOERROR, 0)) 2991 { 2992 re_node_set_free (&next_nodes); 2993 return err; 2994 } 2995 } 2996 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2997 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2998 { 2999 re_node_set_free (&next_nodes); 3000 return err; 3001 } 3002 mctx->state_log[str_idx] = cur_state; 3003 } 3004 3005 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 3006 { 3007 re_node_set_empty (&next_nodes); 3008 if (mctx->state_log[str_idx + 1]) 3009 { 3010 err = re_node_set_merge (&next_nodes, 3011 &mctx->state_log[str_idx + 1]->nodes); 3012 if (BE (err != REG_NOERROR, 0)) 3013 { 3014 re_node_set_free (&next_nodes); 3015 return err; 3016 } 3017 } 3018 if (cur_state) 3019 { 3020 err = check_arrival_add_next_nodes (mctx, str_idx, 3021 &cur_state->non_eps_nodes, 3022 &next_nodes); 3023 if (BE (err != REG_NOERROR, 0)) 3024 { 3025 re_node_set_free (&next_nodes); 3026 return err; 3027 } 3028 } 3029 ++str_idx; 3030 if (next_nodes.nelem) 3031 { 3032 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 3033 if (BE (err != REG_NOERROR, 0)) 3034 { 3035 re_node_set_free (&next_nodes); 3036 return err; 3037 } 3038 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3039 subexp_num, type); 3040 if (BE (err != REG_NOERROR, 0)) 3041 { 3042 re_node_set_free (&next_nodes); 3043 return err; 3044 } 3045 } 3046 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 3047 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3048 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3049 { 3050 re_node_set_free (&next_nodes); 3051 return err; 3052 } 3053 mctx->state_log[str_idx] = cur_state; 3054 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3055 } 3056 re_node_set_free (&next_nodes); 3057 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3058 : &mctx->state_log[last_str]->nodes); 3059 path->next_idx = str_idx; 3060 3061 /* Fix MCTX. */ 3062 mctx->state_log = backup_state_log; 3063 mctx->input.cur_idx = backup_cur_idx; 3064 3065 /* Then check the current node set has the node LAST_NODE. */ 3066 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3067 return REG_NOERROR; 3068 3069 return REG_NOMATCH; 3070} 3071 3072/* Helper functions for check_arrival. */ 3073 3074/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3075 to NEXT_NODES. 3076 TODO: This function is similar to the functions transit_state*(), 3077 however this function has many additional works. 3078 Can't we unify them? */ 3079 3080static reg_errcode_t 3081internal_function __attribute_warn_unused_result__ 3082check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, 3083 re_node_set *cur_nodes, re_node_set *next_nodes) 3084{ 3085 const re_dfa_t *const dfa = mctx->dfa; 3086 bool ok; 3087 Idx cur_idx; 3088#ifdef RE_ENABLE_I18N 3089 reg_errcode_t err = REG_NOERROR; 3090#endif 3091 re_node_set union_set; 3092 re_node_set_init_empty (&union_set); 3093 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3094 { 3095 int naccepted = 0; 3096 Idx cur_node = cur_nodes->elems[cur_idx]; 3097#ifdef DEBUG 3098 re_token_type_t type = dfa->nodes[cur_node].type; 3099 assert (!IS_EPSILON_NODE (type)); 3100#endif 3101#ifdef RE_ENABLE_I18N 3102 /* If the node may accept "multi byte". */ 3103 if (dfa->nodes[cur_node].accept_mb) 3104 { 3105 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3106 str_idx); 3107 if (naccepted > 1) 3108 { 3109 re_dfastate_t *dest_state; 3110 Idx next_node = dfa->nexts[cur_node]; 3111 Idx next_idx = str_idx + naccepted; 3112 dest_state = mctx->state_log[next_idx]; 3113 re_node_set_empty (&union_set); 3114 if (dest_state) 3115 { 3116 err = re_node_set_merge (&union_set, &dest_state->nodes); 3117 if (BE (err != REG_NOERROR, 0)) 3118 { 3119 re_node_set_free (&union_set); 3120 return err; 3121 } 3122 } 3123 ok = re_node_set_insert (&union_set, next_node); 3124 if (BE (! ok, 0)) 3125 { 3126 re_node_set_free (&union_set); 3127 return REG_ESPACE; 3128 } 3129 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3130 &union_set); 3131 if (BE (mctx->state_log[next_idx] == NULL 3132 && err != REG_NOERROR, 0)) 3133 { 3134 re_node_set_free (&union_set); 3135 return err; 3136 } 3137 } 3138 } 3139#endif /* RE_ENABLE_I18N */ 3140 if (naccepted 3141 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3142 { 3143 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3144 if (BE (! ok, 0)) 3145 { 3146 re_node_set_free (&union_set); 3147 return REG_ESPACE; 3148 } 3149 } 3150 } 3151 re_node_set_free (&union_set); 3152 return REG_NOERROR; 3153} 3154 3155/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3156 CUR_NODES, however exclude the nodes which are: 3157 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3158 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3159*/ 3160 3161static reg_errcode_t 3162internal_function 3163check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3164 Idx ex_subexp, int type) 3165{ 3166 reg_errcode_t err; 3167 Idx idx, outside_node; 3168 re_node_set new_nodes; 3169#ifdef DEBUG 3170 assert (cur_nodes->nelem); 3171#endif 3172 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3173 if (BE (err != REG_NOERROR, 0)) 3174 return err; 3175 /* Create a new node set NEW_NODES with the nodes which are epsilon 3176 closures of the node in CUR_NODES. */ 3177 3178 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3179 { 3180 Idx cur_node = cur_nodes->elems[idx]; 3181 const re_node_set *eclosure = dfa->eclosures + cur_node; 3182 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3183 if (outside_node == REG_MISSING) 3184 { 3185 /* There are no problematic nodes, just merge them. */ 3186 err = re_node_set_merge (&new_nodes, eclosure); 3187 if (BE (err != REG_NOERROR, 0)) 3188 { 3189 re_node_set_free (&new_nodes); 3190 return err; 3191 } 3192 } 3193 else 3194 { 3195 /* There are problematic nodes, re-calculate incrementally. */ 3196 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3197 ex_subexp, type); 3198 if (BE (err != REG_NOERROR, 0)) 3199 { 3200 re_node_set_free (&new_nodes); 3201 return err; 3202 } 3203 } 3204 } 3205 re_node_set_free (cur_nodes); 3206 *cur_nodes = new_nodes; 3207 return REG_NOERROR; 3208} 3209 3210/* Helper function for check_arrival_expand_ecl. 3211 Check incrementally the epsilon closure of TARGET, and if it isn't 3212 problematic append it to DST_NODES. */ 3213 3214static reg_errcode_t 3215internal_function __attribute_warn_unused_result__ 3216check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3217 Idx target, Idx ex_subexp, int type) 3218{ 3219 Idx cur_node; 3220 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3221 { 3222 bool ok; 3223 3224 if (dfa->nodes[cur_node].type == type 3225 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3226 { 3227 if (type == OP_CLOSE_SUBEXP) 3228 { 3229 ok = re_node_set_insert (dst_nodes, cur_node); 3230 if (BE (! ok, 0)) 3231 return REG_ESPACE; 3232 } 3233 break; 3234 } 3235 ok = re_node_set_insert (dst_nodes, cur_node); 3236 if (BE (! ok, 0)) 3237 return REG_ESPACE; 3238 if (dfa->edests[cur_node].nelem == 0) 3239 break; 3240 if (dfa->edests[cur_node].nelem == 2) 3241 { 3242 reg_errcode_t err; 3243 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3244 dfa->edests[cur_node].elems[1], 3245 ex_subexp, type); 3246 if (BE (err != REG_NOERROR, 0)) 3247 return err; 3248 } 3249 cur_node = dfa->edests[cur_node].elems[0]; 3250 } 3251 return REG_NOERROR; 3252} 3253 3254 3255/* For all the back references in the current state, calculate the 3256 destination of the back references by the appropriate entry 3257 in MCTX->BKREF_ENTS. */ 3258 3259static reg_errcode_t 3260internal_function __attribute_warn_unused_result__ 3261expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3262 Idx cur_str, Idx subexp_num, int type) 3263{ 3264 const re_dfa_t *const dfa = mctx->dfa; 3265 reg_errcode_t err; 3266 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3267 struct re_backref_cache_entry *ent; 3268 3269 if (cache_idx_start == REG_MISSING) 3270 return REG_NOERROR; 3271 3272 restart: 3273 ent = mctx->bkref_ents + cache_idx_start; 3274 do 3275 { 3276 Idx to_idx, next_node; 3277 3278 /* Is this entry ENT is appropriate? */ 3279 if (!re_node_set_contains (cur_nodes, ent->node)) 3280 continue; /* No. */ 3281 3282 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3283 /* Calculate the destination of the back reference, and append it 3284 to MCTX->STATE_LOG. */ 3285 if (to_idx == cur_str) 3286 { 3287 /* The backreference did epsilon transit, we must re-check all the 3288 node in the current state. */ 3289 re_node_set new_dests; 3290 reg_errcode_t err2, err3; 3291 next_node = dfa->edests[ent->node].elems[0]; 3292 if (re_node_set_contains (cur_nodes, next_node)) 3293 continue; 3294 err = re_node_set_init_1 (&new_dests, next_node); 3295 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3296 err3 = re_node_set_merge (cur_nodes, &new_dests); 3297 re_node_set_free (&new_dests); 3298 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3299 || err3 != REG_NOERROR, 0)) 3300 { 3301 err = (err != REG_NOERROR ? err 3302 : (err2 != REG_NOERROR ? err2 : err3)); 3303 return err; 3304 } 3305 /* TODO: It is still inefficient... */ 3306 goto restart; 3307 } 3308 else 3309 { 3310 re_node_set union_set; 3311 next_node = dfa->nexts[ent->node]; 3312 if (mctx->state_log[to_idx]) 3313 { 3314 bool ok; 3315 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3316 next_node)) 3317 continue; 3318 err = re_node_set_init_copy (&union_set, 3319 &mctx->state_log[to_idx]->nodes); 3320 ok = re_node_set_insert (&union_set, next_node); 3321 if (BE (err != REG_NOERROR || ! ok, 0)) 3322 { 3323 re_node_set_free (&union_set); 3324 err = err != REG_NOERROR ? err : REG_ESPACE; 3325 return err; 3326 } 3327 } 3328 else 3329 { 3330 err = re_node_set_init_1 (&union_set, next_node); 3331 if (BE (err != REG_NOERROR, 0)) 3332 return err; 3333 } 3334 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3335 re_node_set_free (&union_set); 3336 if (BE (mctx->state_log[to_idx] == NULL 3337 && err != REG_NOERROR, 0)) 3338 return err; 3339 } 3340 } 3341 while (ent++->more); 3342 return REG_NOERROR; 3343} 3344 3345/* Build transition table for the state. 3346 Return true if successful. */ 3347 3348static bool 3349internal_function 3350build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3351{ 3352 reg_errcode_t err; 3353 Idx i, j; 3354 int ch; 3355 bool need_word_trtable = false; 3356 bitset_word_t elem, mask; 3357 bool dests_node_malloced = false; 3358 bool dest_states_malloced = false; 3359 Idx ndests; /* Number of the destination states from 'state'. */ 3360 re_dfastate_t **trtable; 3361 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3362 re_node_set follows, *dests_node; 3363 bitset_t *dests_ch; 3364 bitset_t acceptable; 3365 3366 struct dests_alloc 3367 { 3368 re_node_set dests_node[SBC_MAX]; 3369 bitset_t dests_ch[SBC_MAX]; 3370 } *dests_alloc; 3371 3372 /* We build DFA states which corresponds to the destination nodes 3373 from 'state'. 'dests_node[i]' represents the nodes which i-th 3374 destination state contains, and 'dests_ch[i]' represents the 3375 characters which i-th destination state accepts. */ 3376 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3377 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3378 else 3379 { 3380 dests_alloc = re_malloc (struct dests_alloc, 1); 3381 if (BE (dests_alloc == NULL, 0)) 3382 return false; 3383 dests_node_malloced = true; 3384 } 3385 dests_node = dests_alloc->dests_node; 3386 dests_ch = dests_alloc->dests_ch; 3387 3388 /* Initialize transition table. */ 3389 state->word_trtable = state->trtable = NULL; 3390 3391 /* At first, group all nodes belonging to 'state' into several 3392 destinations. */ 3393 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3394 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0)) 3395 { 3396 if (dests_node_malloced) 3397 free (dests_alloc); 3398 /* Return false in case of an error, true otherwise. */ 3399 if (ndests == 0) 3400 { 3401 state->trtable = (re_dfastate_t **) 3402 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3403 if (BE (state->trtable == NULL, 0)) 3404 return false; 3405 return true; 3406 } 3407 return false; 3408 } 3409 3410 err = re_node_set_alloc (&follows, ndests + 1); 3411 if (BE (err != REG_NOERROR, 0)) 3412 goto out_free; 3413 3414 /* Avoid arithmetic overflow in size calculation. */ 3415 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) 3416 / (3 * sizeof (re_dfastate_t *))) 3417 < ndests), 3418 0)) 3419 goto out_free; 3420 3421 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3422 + ndests * 3 * sizeof (re_dfastate_t *))) 3423 dest_states = (re_dfastate_t **) 3424 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3425 else 3426 { 3427 dest_states = (re_dfastate_t **) 3428 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3429 if (BE (dest_states == NULL, 0)) 3430 { 3431out_free: 3432 if (dest_states_malloced) 3433 free (dest_states); 3434 re_node_set_free (&follows); 3435 for (i = 0; i < ndests; ++i) 3436 re_node_set_free (dests_node + i); 3437 if (dests_node_malloced) 3438 free (dests_alloc); 3439 return false; 3440 } 3441 dest_states_malloced = true; 3442 } 3443 dest_states_word = dest_states + ndests; 3444 dest_states_nl = dest_states_word + ndests; 3445 bitset_empty (acceptable); 3446 3447 /* Then build the states for all destinations. */ 3448 for (i = 0; i < ndests; ++i) 3449 { 3450 Idx next_node; 3451 re_node_set_empty (&follows); 3452 /* Merge the follows of this destination states. */ 3453 for (j = 0; j < dests_node[i].nelem; ++j) 3454 { 3455 next_node = dfa->nexts[dests_node[i].elems[j]]; 3456 if (next_node != REG_MISSING) 3457 { 3458 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3459 if (BE (err != REG_NOERROR, 0)) 3460 goto out_free; 3461 } 3462 } 3463 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3464 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3465 goto out_free; 3466 /* If the new state has context constraint, 3467 build appropriate states for these contexts. */ 3468 if (dest_states[i]->has_constraint) 3469 { 3470 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3471 CONTEXT_WORD); 3472 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3473 goto out_free; 3474 3475 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3476 need_word_trtable = true; 3477 3478 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3479 CONTEXT_NEWLINE); 3480 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3481 goto out_free; 3482 } 3483 else 3484 { 3485 dest_states_word[i] = dest_states[i]; 3486 dest_states_nl[i] = dest_states[i]; 3487 } 3488 bitset_merge (acceptable, dests_ch[i]); 3489 } 3490 3491 if (!BE (need_word_trtable, 0)) 3492 { 3493 /* We don't care about whether the following character is a word 3494 character, or we are in a single-byte character set so we can 3495 discern by looking at the character code: allocate a 3496 256-entry transition table. */ 3497 trtable = state->trtable = 3498 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3499 if (BE (trtable == NULL, 0)) 3500 goto out_free; 3501 3502 /* For all characters ch...: */ 3503 for (i = 0; i < BITSET_WORDS; ++i) 3504 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3505 elem; 3506 mask <<= 1, elem >>= 1, ++ch) 3507 if (BE (elem & 1, 0)) 3508 { 3509 /* There must be exactly one destination which accepts 3510 character ch. See group_nodes_into_DFAstates. */ 3511 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3512 ; 3513 3514 /* j-th destination accepts the word character ch. */ 3515 if (dfa->word_char[i] & mask) 3516 trtable[ch] = dest_states_word[j]; 3517 else 3518 trtable[ch] = dest_states[j]; 3519 } 3520 } 3521 else 3522 { 3523 /* We care about whether the following character is a word 3524 character, and we are in a multi-byte character set: discern 3525 by looking at the character code: build two 256-entry 3526 transition tables, one starting at trtable[0] and one 3527 starting at trtable[SBC_MAX]. */ 3528 trtable = state->word_trtable = 3529 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3530 if (BE (trtable == NULL, 0)) 3531 goto out_free; 3532 3533 /* For all characters ch...: */ 3534 for (i = 0; i < BITSET_WORDS; ++i) 3535 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3536 elem; 3537 mask <<= 1, elem >>= 1, ++ch) 3538 if (BE (elem & 1, 0)) 3539 { 3540 /* There must be exactly one destination which accepts 3541 character ch. See group_nodes_into_DFAstates. */ 3542 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3543 ; 3544 3545 /* j-th destination accepts the word character ch. */ 3546 trtable[ch] = dest_states[j]; 3547 trtable[ch + SBC_MAX] = dest_states_word[j]; 3548 } 3549 } 3550 3551 /* new line */ 3552 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3553 { 3554 /* The current state accepts newline character. */ 3555 for (j = 0; j < ndests; ++j) 3556 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3557 { 3558 /* k-th destination accepts newline character. */ 3559 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3560 if (need_word_trtable) 3561 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3562 /* There must be only one destination which accepts 3563 newline. See group_nodes_into_DFAstates. */ 3564 break; 3565 } 3566 } 3567 3568 if (dest_states_malloced) 3569 free (dest_states); 3570 3571 re_node_set_free (&follows); 3572 for (i = 0; i < ndests; ++i) 3573 re_node_set_free (dests_node + i); 3574 3575 if (dests_node_malloced) 3576 free (dests_alloc); 3577 3578 return true; 3579} 3580 3581/* Group all nodes belonging to STATE into several destinations. 3582 Then for all destinations, set the nodes belonging to the destination 3583 to DESTS_NODE[i] and set the characters accepted by the destination 3584 to DEST_CH[i]. This function return the number of destinations. */ 3585 3586static Idx 3587internal_function 3588group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3589 re_node_set *dests_node, bitset_t *dests_ch) 3590{ 3591 reg_errcode_t err; 3592 bool ok; 3593 Idx i, j, k; 3594 Idx ndests; /* Number of the destinations from 'state'. */ 3595 bitset_t accepts; /* Characters a node can accept. */ 3596 const re_node_set *cur_nodes = &state->nodes; 3597 bitset_empty (accepts); 3598 ndests = 0; 3599 3600 /* For all the nodes belonging to 'state', */ 3601 for (i = 0; i < cur_nodes->nelem; ++i) 3602 { 3603 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3604 re_token_type_t type = node->type; 3605 unsigned int constraint = node->constraint; 3606 3607 /* Enumerate all single byte character this node can accept. */ 3608 if (type == CHARACTER) 3609 bitset_set (accepts, node->opr.c); 3610 else if (type == SIMPLE_BRACKET) 3611 { 3612 bitset_merge (accepts, node->opr.sbcset); 3613 } 3614 else if (type == OP_PERIOD) 3615 { 3616#ifdef RE_ENABLE_I18N 3617 if (dfa->mb_cur_max > 1) 3618 bitset_merge (accepts, dfa->sb_char); 3619 else 3620#endif 3621 bitset_set_all (accepts); 3622 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3623 bitset_clear (accepts, '\n'); 3624 if (dfa->syntax & RE_DOT_NOT_NULL) 3625 bitset_clear (accepts, '\0'); 3626 } 3627#ifdef RE_ENABLE_I18N 3628 else if (type == OP_UTF8_PERIOD) 3629 { 3630 if (ASCII_CHARS % BITSET_WORD_BITS == 0) 3631 memset (accepts, -1, ASCII_CHARS / CHAR_BIT); 3632 else 3633 bitset_merge (accepts, utf8_sb_map); 3634 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3635 bitset_clear (accepts, '\n'); 3636 if (dfa->syntax & RE_DOT_NOT_NULL) 3637 bitset_clear (accepts, '\0'); 3638 } 3639#endif 3640 else 3641 continue; 3642 3643 /* Check the 'accepts' and sift the characters which are not 3644 match it the context. */ 3645 if (constraint) 3646 { 3647 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3648 { 3649 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3650 bitset_empty (accepts); 3651 if (accepts_newline) 3652 bitset_set (accepts, NEWLINE_CHAR); 3653 else 3654 continue; 3655 } 3656 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3657 { 3658 bitset_empty (accepts); 3659 continue; 3660 } 3661 3662 if (constraint & NEXT_WORD_CONSTRAINT) 3663 { 3664 bitset_word_t any_set = 0; 3665 if (type == CHARACTER && !node->word_char) 3666 { 3667 bitset_empty (accepts); 3668 continue; 3669 } 3670#ifdef RE_ENABLE_I18N 3671 if (dfa->mb_cur_max > 1) 3672 for (j = 0; j < BITSET_WORDS; ++j) 3673 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3674 else 3675#endif 3676 for (j = 0; j < BITSET_WORDS; ++j) 3677 any_set |= (accepts[j] &= dfa->word_char[j]); 3678 if (!any_set) 3679 continue; 3680 } 3681 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3682 { 3683 bitset_word_t any_set = 0; 3684 if (type == CHARACTER && node->word_char) 3685 { 3686 bitset_empty (accepts); 3687 continue; 3688 } 3689#ifdef RE_ENABLE_I18N 3690 if (dfa->mb_cur_max > 1) 3691 for (j = 0; j < BITSET_WORDS; ++j) 3692 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3693 else 3694#endif 3695 for (j = 0; j < BITSET_WORDS; ++j) 3696 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3697 if (!any_set) 3698 continue; 3699 } 3700 } 3701 3702 /* Then divide 'accepts' into DFA states, or create a new 3703 state. Above, we make sure that accepts is not empty. */ 3704 for (j = 0; j < ndests; ++j) 3705 { 3706 bitset_t intersec; /* Intersection sets, see below. */ 3707 bitset_t remains; 3708 /* Flags, see below. */ 3709 bitset_word_t has_intersec, not_subset, not_consumed; 3710 3711 /* Optimization, skip if this state doesn't accept the character. */ 3712 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3713 continue; 3714 3715 /* Enumerate the intersection set of this state and 'accepts'. */ 3716 has_intersec = 0; 3717 for (k = 0; k < BITSET_WORDS; ++k) 3718 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3719 /* And skip if the intersection set is empty. */ 3720 if (!has_intersec) 3721 continue; 3722 3723 /* Then check if this state is a subset of 'accepts'. */ 3724 not_subset = not_consumed = 0; 3725 for (k = 0; k < BITSET_WORDS; ++k) 3726 { 3727 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3728 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3729 } 3730 3731 /* If this state isn't a subset of 'accepts', create a 3732 new group state, which has the 'remains'. */ 3733 if (not_subset) 3734 { 3735 bitset_copy (dests_ch[ndests], remains); 3736 bitset_copy (dests_ch[j], intersec); 3737 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3738 if (BE (err != REG_NOERROR, 0)) 3739 goto error_return; 3740 ++ndests; 3741 } 3742 3743 /* Put the position in the current group. */ 3744 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3745 if (BE (! ok, 0)) 3746 goto error_return; 3747 3748 /* If all characters are consumed, go to next node. */ 3749 if (!not_consumed) 3750 break; 3751 } 3752 /* Some characters remain, create a new group. */ 3753 if (j == ndests) 3754 { 3755 bitset_copy (dests_ch[ndests], accepts); 3756 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3757 if (BE (err != REG_NOERROR, 0)) 3758 goto error_return; 3759 ++ndests; 3760 bitset_empty (accepts); 3761 } 3762 } 3763 return ndests; 3764 error_return: 3765 for (j = 0; j < ndests; ++j) 3766 re_node_set_free (dests_node + j); 3767 return REG_MISSING; 3768} 3769 3770#ifdef RE_ENABLE_I18N 3771/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts. 3772 Return the number of the bytes the node accepts. 3773 STR_IDX is the current index of the input string. 3774 3775 This function handles the nodes which can accept one character, or 3776 one collating element like '.', '[a-z]', opposite to the other nodes 3777 can only accept one byte. */ 3778 3779static int 3780internal_function 3781check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 3782 const re_string_t *input, Idx str_idx) 3783{ 3784 const re_token_t *node = dfa->nodes + node_idx; 3785 int char_len, elem_len; 3786 Idx i; 3787 3788 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3789 { 3790 unsigned char c = re_string_byte_at (input, str_idx), d; 3791 if (BE (c < 0xc2, 1)) 3792 return 0; 3793 3794 if (str_idx + 2 > input->len) 3795 return 0; 3796 3797 d = re_string_byte_at (input, str_idx + 1); 3798 if (c < 0xe0) 3799 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3800 else if (c < 0xf0) 3801 { 3802 char_len = 3; 3803 if (c == 0xe0 && d < 0xa0) 3804 return 0; 3805 } 3806 else if (c < 0xf8) 3807 { 3808 char_len = 4; 3809 if (c == 0xf0 && d < 0x90) 3810 return 0; 3811 } 3812 else if (c < 0xfc) 3813 { 3814 char_len = 5; 3815 if (c == 0xf8 && d < 0x88) 3816 return 0; 3817 } 3818 else if (c < 0xfe) 3819 { 3820 char_len = 6; 3821 if (c == 0xfc && d < 0x84) 3822 return 0; 3823 } 3824 else 3825 return 0; 3826 3827 if (str_idx + char_len > input->len) 3828 return 0; 3829 3830 for (i = 1; i < char_len; ++i) 3831 { 3832 d = re_string_byte_at (input, str_idx + i); 3833 if (d < 0x80 || d > 0xbf) 3834 return 0; 3835 } 3836 return char_len; 3837 } 3838 3839 char_len = re_string_char_size_at (input, str_idx); 3840 if (node->type == OP_PERIOD) 3841 { 3842 if (char_len <= 1) 3843 return 0; 3844 /* FIXME: I don't think this if is needed, as both '\n' 3845 and '\0' are char_len == 1. */ 3846 /* '.' accepts any one character except the following two cases. */ 3847 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3848 re_string_byte_at (input, str_idx) == '\n') || 3849 ((dfa->syntax & RE_DOT_NOT_NULL) && 3850 re_string_byte_at (input, str_idx) == '\0')) 3851 return 0; 3852 return char_len; 3853 } 3854 3855 elem_len = re_string_elem_size_at (input, str_idx); 3856 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3857 return 0; 3858 3859 if (node->type == COMPLEX_BRACKET) 3860 { 3861 const re_charset_t *cset = node->opr.mbcset; 3862# ifdef _LIBC 3863 const unsigned char *pin 3864 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3865 Idx j; 3866 uint32_t nrules; 3867# endif /* _LIBC */ 3868 int match_len = 0; 3869 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3870 ? re_string_wchar_at (input, str_idx) : 0); 3871 3872 /* match with multibyte character? */ 3873 for (i = 0; i < cset->nmbchars; ++i) 3874 if (wc == cset->mbchars[i]) 3875 { 3876 match_len = char_len; 3877 goto check_node_accept_bytes_match; 3878 } 3879 /* match with character_class? */ 3880 for (i = 0; i < cset->nchar_classes; ++i) 3881 { 3882 wctype_t wt = cset->char_classes[i]; 3883 if (__iswctype (wc, wt)) 3884 { 3885 match_len = char_len; 3886 goto check_node_accept_bytes_match; 3887 } 3888 } 3889 3890# ifdef _LIBC 3891 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3892 if (nrules != 0) 3893 { 3894 unsigned int in_collseq = 0; 3895 const int32_t *table, *indirect; 3896 const unsigned char *weights, *extra; 3897 const char *collseqwc; 3898 /* This #include defines a local function! */ 3899# include <locale/weight.h> 3900 3901 /* match with collating_symbol? */ 3902 if (cset->ncoll_syms) 3903 extra = (const unsigned char *) 3904 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3905 for (i = 0; i < cset->ncoll_syms; ++i) 3906 { 3907 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3908 /* Compare the length of input collating element and 3909 the length of current collating element. */ 3910 if (*coll_sym != elem_len) 3911 continue; 3912 /* Compare each bytes. */ 3913 for (j = 0; j < *coll_sym; j++) 3914 if (pin[j] != coll_sym[1 + j]) 3915 break; 3916 if (j == *coll_sym) 3917 { 3918 /* Match if every bytes is equal. */ 3919 match_len = j; 3920 goto check_node_accept_bytes_match; 3921 } 3922 } 3923 3924 if (cset->nranges) 3925 { 3926 if (elem_len <= char_len) 3927 { 3928 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3929 in_collseq = __collseq_table_lookup (collseqwc, wc); 3930 } 3931 else 3932 in_collseq = find_collation_sequence_value (pin, elem_len); 3933 } 3934 /* match with range expression? */ 3935 /* FIXME: Implement rational ranges here, too. */ 3936 for (i = 0; i < cset->nranges; ++i) 3937 if (cset->range_starts[i] <= in_collseq 3938 && in_collseq <= cset->range_ends[i]) 3939 { 3940 match_len = elem_len; 3941 goto check_node_accept_bytes_match; 3942 } 3943 3944 /* match with equivalence_class? */ 3945 if (cset->nequiv_classes) 3946 { 3947 const unsigned char *cp = pin; 3948 table = (const int32_t *) 3949 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3950 weights = (const unsigned char *) 3951 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3952 extra = (const unsigned char *) 3953 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3954 indirect = (const int32_t *) 3955 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3956 int32_t idx = findidx (&cp, elem_len); 3957 if (idx > 0) 3958 for (i = 0; i < cset->nequiv_classes; ++i) 3959 { 3960 int32_t equiv_class_idx = cset->equiv_classes[i]; 3961 size_t weight_len = weights[idx & 0xffffff]; 3962 if (weight_len == weights[equiv_class_idx & 0xffffff] 3963 && (idx >> 24) == (equiv_class_idx >> 24)) 3964 { 3965 Idx cnt = 0; 3966 3967 idx &= 0xffffff; 3968 equiv_class_idx &= 0xffffff; 3969 3970 while (cnt <= weight_len 3971 && (weights[equiv_class_idx + 1 + cnt] 3972 == weights[idx + 1 + cnt])) 3973 ++cnt; 3974 if (cnt > weight_len) 3975 { 3976 match_len = elem_len; 3977 goto check_node_accept_bytes_match; 3978 } 3979 } 3980 } 3981 } 3982 } 3983 else 3984# endif /* _LIBC */ 3985 { 3986 /* match with range expression? */ 3987 for (i = 0; i < cset->nranges; ++i) 3988 { 3989 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i]) 3990 { 3991 match_len = char_len; 3992 goto check_node_accept_bytes_match; 3993 } 3994 } 3995 } 3996 check_node_accept_bytes_match: 3997 if (!cset->non_match) 3998 return match_len; 3999 else 4000 { 4001 if (match_len > 0) 4002 return 0; 4003 else 4004 return (elem_len > char_len) ? elem_len : char_len; 4005 } 4006 } 4007 return 0; 4008} 4009 4010# ifdef _LIBC 4011static unsigned int 4012internal_function 4013find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 4014{ 4015 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4016 if (nrules == 0) 4017 { 4018 if (mbs_len == 1) 4019 { 4020 /* No valid character. Match it as a single byte character. */ 4021 const unsigned char *collseq = (const unsigned char *) 4022 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 4023 return collseq[mbs[0]]; 4024 } 4025 return UINT_MAX; 4026 } 4027 else 4028 { 4029 int32_t idx; 4030 const unsigned char *extra = (const unsigned char *) 4031 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4032 int32_t extrasize = (const unsigned char *) 4033 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 4034 4035 for (idx = 0; idx < extrasize;) 4036 { 4037 int mbs_cnt; 4038 bool found = false; 4039 int32_t elem_mbs_len; 4040 /* Skip the name of collating element name. */ 4041 idx = idx + extra[idx] + 1; 4042 elem_mbs_len = extra[idx++]; 4043 if (mbs_len == elem_mbs_len) 4044 { 4045 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 4046 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 4047 break; 4048 if (mbs_cnt == elem_mbs_len) 4049 /* Found the entry. */ 4050 found = true; 4051 } 4052 /* Skip the byte sequence of the collating element. */ 4053 idx += elem_mbs_len; 4054 /* Adjust for the alignment. */ 4055 idx = (idx + 3) & ~3; 4056 /* Skip the collation sequence value. */ 4057 idx += sizeof (uint32_t); 4058 /* Skip the wide char sequence of the collating element. */ 4059 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1); 4060 /* If we found the entry, return the sequence value. */ 4061 if (found) 4062 return *(uint32_t *) (extra + idx); 4063 /* Skip the collation sequence value. */ 4064 idx += sizeof (uint32_t); 4065 } 4066 return UINT_MAX; 4067 } 4068} 4069# endif /* _LIBC */ 4070#endif /* RE_ENABLE_I18N */ 4071 4072/* Check whether the node accepts the byte which is IDX-th 4073 byte of the INPUT. */ 4074 4075static bool 4076internal_function 4077check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4078 Idx idx) 4079{ 4080 unsigned char ch; 4081 ch = re_string_byte_at (&mctx->input, idx); 4082 switch (node->type) 4083 { 4084 case CHARACTER: 4085 if (node->opr.c != ch) 4086 return false; 4087 break; 4088 4089 case SIMPLE_BRACKET: 4090 if (!bitset_contain (node->opr.sbcset, ch)) 4091 return false; 4092 break; 4093 4094#ifdef RE_ENABLE_I18N 4095 case OP_UTF8_PERIOD: 4096 if (ch >= ASCII_CHARS) 4097 return false; 4098 /* FALLTHROUGH */ 4099#endif 4100 case OP_PERIOD: 4101 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4102 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4103 return false; 4104 break; 4105 4106 default: 4107 return false; 4108 } 4109 4110 if (node->constraint) 4111 { 4112 /* The node has constraints. Check whether the current context 4113 satisfies the constraints. */ 4114 unsigned int context = re_string_context_at (&mctx->input, idx, 4115 mctx->eflags); 4116 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4117 return false; 4118 } 4119 4120 return true; 4121} 4122 4123/* Extend the buffers, if the buffers have run out. */ 4124 4125static reg_errcode_t 4126internal_function __attribute_warn_unused_result__ 4127extend_buffers (re_match_context_t *mctx, int min_len) 4128{ 4129 reg_errcode_t ret; 4130 re_string_t *pstr = &mctx->input; 4131 4132 /* Avoid overflow. */ 4133 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2 4134 <= pstr->bufs_len, 0)) 4135 return REG_ESPACE; 4136 4137 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */ 4138 ret = re_string_realloc_buffers (pstr, 4139 MAX (min_len, 4140 MIN (pstr->len, pstr->bufs_len * 2))); 4141 if (BE (ret != REG_NOERROR, 0)) 4142 return ret; 4143 4144 if (mctx->state_log != NULL) 4145 { 4146 /* And double the length of state_log. */ 4147 /* XXX We have no indication of the size of this buffer. If this 4148 allocation fail we have no indication that the state_log array 4149 does not have the right size. */ 4150 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4151 pstr->bufs_len + 1); 4152 if (BE (new_array == NULL, 0)) 4153 return REG_ESPACE; 4154 mctx->state_log = new_array; 4155 } 4156 4157 /* Then reconstruct the buffers. */ 4158 if (pstr->icase) 4159 { 4160#ifdef RE_ENABLE_I18N 4161 if (pstr->mb_cur_max > 1) 4162 { 4163 ret = build_wcs_upper_buffer (pstr); 4164 if (BE (ret != REG_NOERROR, 0)) 4165 return ret; 4166 } 4167 else 4168#endif /* RE_ENABLE_I18N */ 4169 build_upper_buffer (pstr); 4170 } 4171 else 4172 { 4173#ifdef RE_ENABLE_I18N 4174 if (pstr->mb_cur_max > 1) 4175 build_wcs_buffer (pstr); 4176 else 4177#endif /* RE_ENABLE_I18N */ 4178 { 4179 if (pstr->trans != NULL) 4180 re_string_translate_buffer (pstr); 4181 } 4182 } 4183 return REG_NOERROR; 4184} 4185 4186 4187/* Functions for matching context. */ 4188 4189/* Initialize MCTX. */ 4190 4191static reg_errcode_t 4192internal_function __attribute_warn_unused_result__ 4193match_ctx_init (re_match_context_t *mctx, int eflags, Idx n) 4194{ 4195 mctx->eflags = eflags; 4196 mctx->match_last = REG_MISSING; 4197 if (n > 0) 4198 { 4199 /* Avoid overflow. */ 4200 size_t max_object_size = 4201 MAX (sizeof (struct re_backref_cache_entry), 4202 sizeof (re_sub_match_top_t *)); 4203 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0)) 4204 return REG_ESPACE; 4205 4206 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4207 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4208 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4209 return REG_ESPACE; 4210 } 4211 /* Already zero-ed by the caller. 4212 else 4213 mctx->bkref_ents = NULL; 4214 mctx->nbkref_ents = 0; 4215 mctx->nsub_tops = 0; */ 4216 mctx->abkref_ents = n; 4217 mctx->max_mb_elem_len = 1; 4218 mctx->asub_tops = n; 4219 return REG_NOERROR; 4220} 4221 4222/* Clean the entries which depend on the current input in MCTX. 4223 This function must be invoked when the matcher changes the start index 4224 of the input, or changes the input string. */ 4225 4226static void 4227internal_function 4228match_ctx_clean (re_match_context_t *mctx) 4229{ 4230 Idx st_idx; 4231 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4232 { 4233 Idx sl_idx; 4234 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4235 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4236 { 4237 re_sub_match_last_t *last = top->lasts[sl_idx]; 4238 re_free (last->path.array); 4239 re_free (last); 4240 } 4241 re_free (top->lasts); 4242 if (top->path) 4243 { 4244 re_free (top->path->array); 4245 re_free (top->path); 4246 } 4247 free (top); 4248 } 4249 4250 mctx->nsub_tops = 0; 4251 mctx->nbkref_ents = 0; 4252} 4253 4254/* Free all the memory associated with MCTX. */ 4255 4256static void 4257internal_function 4258match_ctx_free (re_match_context_t *mctx) 4259{ 4260 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4261 match_ctx_clean (mctx); 4262 re_free (mctx->sub_tops); 4263 re_free (mctx->bkref_ents); 4264} 4265 4266/* Add a new backreference entry to MCTX. 4267 Note that we assume that caller never call this function with duplicate 4268 entry, and call with STR_IDX which isn't smaller than any existing entry. 4269*/ 4270 4271static reg_errcode_t 4272internal_function __attribute_warn_unused_result__ 4273match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from, 4274 Idx to) 4275{ 4276 if (mctx->nbkref_ents >= mctx->abkref_ents) 4277 { 4278 struct re_backref_cache_entry* new_entry; 4279 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4280 mctx->abkref_ents * 2); 4281 if (BE (new_entry == NULL, 0)) 4282 { 4283 re_free (mctx->bkref_ents); 4284 return REG_ESPACE; 4285 } 4286 mctx->bkref_ents = new_entry; 4287 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4288 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4289 mctx->abkref_ents *= 2; 4290 } 4291 if (mctx->nbkref_ents > 0 4292 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4293 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4294 4295 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4296 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4297 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4298 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4299 4300 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4301 If bit N is clear, means that this entry won't epsilon-transition to 4302 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4303 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4304 such node. 4305 4306 A backreference does not epsilon-transition unless it is empty, so set 4307 to all zeros if FROM != TO. */ 4308 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4309 = (from == to ? -1 : 0); 4310 4311 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4312 if (mctx->max_mb_elem_len < to - from) 4313 mctx->max_mb_elem_len = to - from; 4314 return REG_NOERROR; 4315} 4316 4317/* Return the first entry with the same str_idx, or REG_MISSING if none is 4318 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4319 4320static Idx 4321internal_function 4322search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 4323{ 4324 Idx left, right, mid, last; 4325 last = right = mctx->nbkref_ents; 4326 for (left = 0; left < right;) 4327 { 4328 mid = (left + right) / 2; 4329 if (mctx->bkref_ents[mid].str_idx < str_idx) 4330 left = mid + 1; 4331 else 4332 right = mid; 4333 } 4334 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4335 return left; 4336 else 4337 return REG_MISSING; 4338} 4339 4340/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4341 at STR_IDX. */ 4342 4343static reg_errcode_t 4344internal_function __attribute_warn_unused_result__ 4345match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx) 4346{ 4347#ifdef DEBUG 4348 assert (mctx->sub_tops != NULL); 4349 assert (mctx->asub_tops > 0); 4350#endif 4351 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4352 { 4353 Idx new_asub_tops = mctx->asub_tops * 2; 4354 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4355 re_sub_match_top_t *, 4356 new_asub_tops); 4357 if (BE (new_array == NULL, 0)) 4358 return REG_ESPACE; 4359 mctx->sub_tops = new_array; 4360 mctx->asub_tops = new_asub_tops; 4361 } 4362 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4363 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4364 return REG_ESPACE; 4365 mctx->sub_tops[mctx->nsub_tops]->node = node; 4366 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4367 return REG_NOERROR; 4368} 4369 4370/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4371 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4372 4373static re_sub_match_last_t * 4374internal_function 4375match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx) 4376{ 4377 re_sub_match_last_t *new_entry; 4378 if (BE (subtop->nlasts == subtop->alasts, 0)) 4379 { 4380 Idx new_alasts = 2 * subtop->alasts + 1; 4381 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4382 re_sub_match_last_t *, 4383 new_alasts); 4384 if (BE (new_array == NULL, 0)) 4385 return NULL; 4386 subtop->lasts = new_array; 4387 subtop->alasts = new_alasts; 4388 } 4389 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4390 if (BE (new_entry != NULL, 1)) 4391 { 4392 subtop->lasts[subtop->nlasts] = new_entry; 4393 new_entry->node = node; 4394 new_entry->str_idx = str_idx; 4395 ++subtop->nlasts; 4396 } 4397 return new_entry; 4398} 4399 4400static void 4401internal_function 4402sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4403 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx) 4404{ 4405 sctx->sifted_states = sifted_sts; 4406 sctx->limited_states = limited_sts; 4407 sctx->last_node = last_node; 4408 sctx->last_str_idx = last_str_idx; 4409 re_node_set_init_empty (&sctx->limits); 4410} 4411