regexec.c revision 225736
1/* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 22 int n) internal_function; 23static void match_ctx_clean (re_match_context_t *mctx) internal_function; 24static void match_ctx_free (re_match_context_t *cache) internal_function; 25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 26 int str_idx, int from, int to) 27 internal_function; 28static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx) 29 internal_function; 30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 31 int str_idx) internal_function; 32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 33 int node, int str_idx) 34 internal_function; 35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 36 re_dfastate_t **limited_sts, int last_node, 37 int last_str_idx) 38 internal_function; 39static reg_errcode_t re_search_internal (const regex_t *preg, 40 const char *string, int length, 41 int start, int range, int stop, 42 size_t nmatch, regmatch_t pmatch[], 43 int eflags) internal_function; 44static int re_search_2_stub (struct re_pattern_buffer *bufp, 45 const char *string1, int length1, 46 const char *string2, int length2, 47 int start, int range, struct re_registers *regs, 48 int stop, int ret_len) internal_function; 49static int re_search_stub (struct re_pattern_buffer *bufp, 50 const char *string, int length, int start, 51 int range, int stop, struct re_registers *regs, 52 int ret_len) internal_function; 53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 54 int nregs, int regs_allocated) internal_function; 55static inline re_dfastate_t *acquire_init_state_context 56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx) 57 __attribute ((always_inline)) internal_function; 58static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 59 internal_function; 60static int check_matching (re_match_context_t *mctx, int fl_longest_match, 61 int *p_match_first) 62 internal_function; 63static int check_halt_node_context (const re_dfa_t *dfa, int node, 64 unsigned int context) internal_function; 65static int check_halt_state_context (const re_match_context_t *mctx, 66 const re_dfastate_t *state, int idx) 67 internal_function; 68static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, 69 regmatch_t *prev_idx_match, int cur_node, 70 int cur_idx, int nmatch) internal_function; 71static int proceed_next_node (const re_match_context_t *mctx, 72 int nregs, regmatch_t *regs, 73 int *pidx, int node, re_node_set *eps_via_nodes, 74 struct re_fail_stack_t *fs) internal_function; 75static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 76 int str_idx, int dest_node, int nregs, 77 regmatch_t *regs, 78 re_node_set *eps_via_nodes) internal_function; 79static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function; 81static reg_errcode_t set_regs (const regex_t *preg, 82 const re_match_context_t *mctx, 83 size_t nmatch, regmatch_t *pmatch, 84 int fl_backtrack) internal_function; 85static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function; 86 87#ifdef RE_ENABLE_I18N 88static int sift_states_iter_mb (const re_match_context_t *mctx, 89 re_sift_context_t *sctx, 90 int node_idx, int str_idx, int max_str_idx) internal_function; 91#endif /* RE_ENABLE_I18N */ 92static reg_errcode_t sift_states_backward (re_match_context_t *mctx, 93 re_sift_context_t *sctx) internal_function; 94static reg_errcode_t build_sifted_states (re_match_context_t *mctx, 95 re_sift_context_t *sctx, int str_idx, 96 re_node_set *cur_dest) internal_function; 97static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx, 98 re_sift_context_t *sctx, 99 int str_idx, 100 re_node_set *dest_nodes) internal_function; 101static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, 102 re_node_set *dest_nodes, 103 const re_node_set *candidates) internal_function; 104static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, 105 re_node_set *dest_nodes, 106 const re_node_set *and_nodes) internal_function; 107static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits, 108 int dst_node, int dst_idx, int src_node, 109 int src_idx) internal_function; 110static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx, 111 int boundaries, int subexp_idx, 112 int from_node, int bkref_idx) internal_function; 113static int check_dst_limits_calc_pos (re_match_context_t *mctx, 114 int limit, int subexp_idx, 115 int node, int str_idx, 116 int bkref_idx) internal_function; 117static reg_errcode_t check_subexp_limits (re_dfa_t *dfa, 118 re_node_set *dest_nodes, 119 const re_node_set *candidates, 120 re_node_set *limits, 121 struct re_backref_cache_entry *bkref_ents, 122 int str_idx) internal_function; 123static reg_errcode_t sift_states_bkref (re_match_context_t *mctx, 124 re_sift_context_t *sctx, 125 int str_idx, const re_node_set *candidates) internal_function; 126static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx, 127 int next_state_log_idx) internal_function; 128static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, 129 re_dfastate_t **src, int num) internal_function; 130static re_dfastate_t *find_recover_state (reg_errcode_t *err, 131 re_match_context_t *mctx) internal_function; 132static re_dfastate_t *transit_state (reg_errcode_t *err, 133 re_match_context_t *mctx, 134 re_dfastate_t *state) internal_function; 135static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 136 re_match_context_t *mctx, 137 re_dfastate_t *next_state) internal_function; 138static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 139 re_node_set *cur_nodes, 140 int str_idx) internal_function; 141#if 0 142static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 143 re_match_context_t *mctx, 144 re_dfastate_t *pstate) internal_function; 145#endif 146#ifdef RE_ENABLE_I18N 147static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 148 re_dfastate_t *pstate) internal_function; 149#endif /* RE_ENABLE_I18N */ 150static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 151 const re_node_set *nodes) internal_function; 152static reg_errcode_t get_subexp (re_match_context_t *mctx, 153 int bkref_node, int bkref_str_idx) internal_function; 154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 155 const re_sub_match_top_t *sub_top, 156 re_sub_match_last_t *sub_last, 157 int bkref_node, int bkref_str) internal_function; 158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 159 int subexp_idx, int type) internal_function; 160static reg_errcode_t check_arrival (re_match_context_t *mctx, 161 state_array_t *path, int top_node, 162 int top_str, int last_node, int last_str, 163 int type) internal_function; 164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 165 int str_idx, 166 re_node_set *cur_nodes, 167 re_node_set *next_nodes) internal_function; 168static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa, 169 re_node_set *cur_nodes, 170 int ex_subexp, int type) internal_function; 171static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, 172 re_node_set *dst_nodes, 173 int target, int ex_subexp, 174 int type) internal_function; 175static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 176 re_node_set *cur_nodes, int cur_str, 177 int subexp_num, int type) internal_function; 178static int build_trtable (re_dfa_t *dfa, 179 re_dfastate_t *state) internal_function; 180#ifdef RE_ENABLE_I18N 181static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx, 182 const re_string_t *input, int idx) internal_function; 183# ifdef _LIBC 184static unsigned int find_collation_sequence_value (const unsigned char *mbs, 185 size_t name_len) internal_function; 186# endif /* _LIBC */ 187#endif /* RE_ENABLE_I18N */ 188static int group_nodes_into_DFAstates (re_dfa_t *dfa, 189 const re_dfastate_t *state, 190 re_node_set *states_node, 191 bitset *states_ch) internal_function; 192static int check_node_accept (const re_match_context_t *mctx, 193 const re_token_t *node, int idx) internal_function; 194static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function; 195 196/* Entry point for POSIX code. */ 197 198/* regexec searches for a given pattern, specified by PREG, in the 199 string STRING. 200 201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 203 least NMATCH elements, and we set them to the offsets of the 204 corresponding matched substrings. 205 206 EFLAGS specifies `execution flags' which affect matching: if 207 REG_NOTBOL is set, then ^ does not match at the beginning of the 208 string; if REG_NOTEOL is set, then $ does not match at the end. 209 210 We return 0 if we find a match and REG_NOMATCH if not. */ 211 212int 213regexec (preg, string, nmatch, pmatch, eflags) 214 const regex_t *__restrict preg; 215 const char *__restrict string; 216 size_t nmatch; 217 regmatch_t pmatch[]; 218 int eflags; 219{ 220 reg_errcode_t err; 221 int start, length; 222 223 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 224 return REG_BADPAT; 225 226 if (eflags & REG_STARTEND) 227 { 228 start = pmatch[0].rm_so; 229 length = pmatch[0].rm_eo; 230 } 231 else 232 { 233 start = 0; 234 length = strlen (string); 235 } 236 if (preg->no_sub) 237 err = re_search_internal (preg, string, length, start, length - start, 238 length, 0, NULL, eflags); 239 else 240 err = re_search_internal (preg, string, length, start, length - start, 241 length, nmatch, pmatch, eflags); 242 return err != REG_NOERROR; 243} 244 245#ifdef _LIBC 246# include <shlib-compat.h> 247versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 248 249# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 250__typeof__ (__regexec) __compat_regexec; 251 252int 253attribute_compat_text_section 254__compat_regexec (const regex_t *__restrict preg, 255 const char *__restrict string, size_t nmatch, 256 regmatch_t pmatch[], int eflags) 257{ 258 return regexec (preg, string, nmatch, pmatch, 259 eflags & (REG_NOTBOL | REG_NOTEOL)); 260} 261compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 262# endif 263#endif 264 265/* Entry points for GNU code. */ 266 267/* re_match, re_search, re_match_2, re_search_2 268 269 The former two functions operate on STRING with length LENGTH, 270 while the later two operate on concatenation of STRING1 and STRING2 271 with lengths LENGTH1 and LENGTH2, respectively. 272 273 re_match() matches the compiled pattern in BUFP against the string, 274 starting at index START. 275 276 re_search() first tries matching at index START, then it tries to match 277 starting from index START + 1, and so on. The last start position tried 278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 279 way as re_match().) 280 281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 282 the first STOP characters of the concatenation of the strings should be 283 concerned. 284 285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are 287 computed relative to the concatenation, not relative to the individual 288 strings.) 289 290 On success, re_match* functions return the length of the match, re_search* 291 return the position of the start of the match. Return value -1 means no 292 match was found and -2 indicates an internal error. */ 293 294int 295re_match (bufp, string, length, start, regs) 296 struct re_pattern_buffer *bufp; 297 const char *string; 298 int length, start; 299 struct re_registers *regs; 300{ 301 return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 302} 303#ifdef _LIBC 304weak_alias (__re_match, re_match) 305#endif 306 307int 308re_search (bufp, string, length, start, range, regs) 309 struct re_pattern_buffer *bufp; 310 const char *string; 311 int length, start, range; 312 struct re_registers *regs; 313{ 314 return re_search_stub (bufp, string, length, start, range, length, regs, 0); 315} 316#ifdef _LIBC 317weak_alias (__re_search, re_search) 318#endif 319 320int 321re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 322 struct re_pattern_buffer *bufp; 323 const char *string1, *string2; 324 int length1, length2, start, stop; 325 struct re_registers *regs; 326{ 327 return re_search_2_stub (bufp, string1, length1, string2, length2, 328 start, 0, regs, stop, 1); 329} 330#ifdef _LIBC 331weak_alias (__re_match_2, re_match_2) 332#endif 333 334int 335re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 336 struct re_pattern_buffer *bufp; 337 const char *string1, *string2; 338 int length1, length2, start, range, stop; 339 struct re_registers *regs; 340{ 341 return re_search_2_stub (bufp, string1, length1, string2, length2, 342 start, range, regs, stop, 0); 343} 344#ifdef _LIBC 345weak_alias (__re_search_2, re_search_2) 346#endif 347 348static int 349re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 350 stop, ret_len) 351 struct re_pattern_buffer *bufp; 352 const char *string1, *string2; 353 int length1, length2, start, range, stop, ret_len; 354 struct re_registers *regs; 355{ 356 const char *str; 357 int rval; 358 int len = length1 + length2; 359 int free_str = 0; 360 361 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 362 return -2; 363 364 /* Concatenate the strings. */ 365 if (length2 > 0) 366 if (length1 > 0) 367 { 368 char *s = re_malloc (char, len); 369 370 if (BE (s == NULL, 0)) 371 return -2; 372 memcpy (s, string1, length1); 373 memcpy (s + length1, string2, length2); 374 str = s; 375 free_str = 1; 376 } 377 else 378 str = string2; 379 else 380 str = string1; 381 382 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 383 ret_len); 384 if (free_str) 385 re_free ((char *) str); 386 return rval; 387} 388 389/* The parameters have the same meaning as those of re_search. 390 Additional parameters: 391 If RET_LEN is nonzero the length of the match is returned (re_match style); 392 otherwise the position of the match is returned. */ 393 394static int 395re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 396 struct re_pattern_buffer *bufp; 397 const char *string; 398 int length, start, range, stop, ret_len; 399 struct re_registers *regs; 400{ 401 reg_errcode_t result; 402 regmatch_t *pmatch; 403 int nregs, rval; 404 int eflags = 0; 405 406 /* Check for out-of-range. */ 407 if (BE (start < 0 || start > length, 0)) 408 return -1; 409 if (BE (start + range > length, 0)) 410 range = length - start; 411 else if (BE (start + range < 0, 0)) 412 range = -start; 413 414 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 415 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 416 417 /* Compile fastmap if we haven't yet. */ 418 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 419 re_compile_fastmap (bufp); 420 421 if (BE (bufp->no_sub, 0)) 422 regs = NULL; 423 424 /* We need at least 1 register. */ 425 if (regs == NULL) 426 nregs = 1; 427 else if (BE (bufp->regs_allocated == REGS_FIXED && 428 regs->num_regs < bufp->re_nsub + 1, 0)) 429 { 430 nregs = regs->num_regs; 431 if (BE (nregs < 1, 0)) 432 { 433 /* Nothing can be copied to regs. */ 434 regs = NULL; 435 nregs = 1; 436 } 437 } 438 else 439 nregs = bufp->re_nsub + 1; 440 pmatch = re_malloc (regmatch_t, nregs); 441 if (BE (pmatch == NULL, 0)) 442 return -2; 443 444 result = re_search_internal (bufp, string, length, start, range, stop, 445 nregs, pmatch, eflags); 446 447 rval = 0; 448 449 /* I hope we needn't fill ther regs with -1's when no match was found. */ 450 if (result != REG_NOERROR) 451 rval = -1; 452 else if (regs != NULL) 453 { 454 /* If caller wants register contents data back, copy them. */ 455 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 456 bufp->regs_allocated); 457 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 458 rval = -2; 459 } 460 461 if (BE (rval == 0, 1)) 462 { 463 if (ret_len) 464 { 465 assert (pmatch[0].rm_so == start); 466 rval = pmatch[0].rm_eo - start; 467 } 468 else 469 rval = pmatch[0].rm_so; 470 } 471 re_free (pmatch); 472 return rval; 473} 474 475static unsigned 476re_copy_regs (regs, pmatch, nregs, regs_allocated) 477 struct re_registers *regs; 478 regmatch_t *pmatch; 479 int nregs, regs_allocated; 480{ 481 int rval = REGS_REALLOCATE; 482 int i; 483 int need_regs = nregs + 1; 484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 485 uses. */ 486 487 /* Have the register data arrays been allocated? */ 488 if (regs_allocated == REGS_UNALLOCATED) 489 { /* No. So allocate them with malloc. */ 490 regs->start = re_malloc (regoff_t, need_regs); 491 regs->end = re_malloc (regoff_t, need_regs); 492 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0)) 493 return REGS_UNALLOCATED; 494 regs->num_regs = need_regs; 495 } 496 else if (regs_allocated == REGS_REALLOCATE) 497 { /* Yes. If we need more elements than were already 498 allocated, reallocate them. If we need fewer, just 499 leave it alone. */ 500 if (BE (need_regs > regs->num_regs, 0)) 501 { 502 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 503 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs); 504 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0)) 505 return REGS_UNALLOCATED; 506 regs->start = new_start; 507 regs->end = new_end; 508 regs->num_regs = need_regs; 509 } 510 } 511 else 512 { 513 assert (regs_allocated == REGS_FIXED); 514 /* This function may not be called with REGS_FIXED and nregs too big. */ 515 assert (regs->num_regs >= nregs); 516 rval = REGS_FIXED; 517 } 518 519 /* Copy the regs. */ 520 for (i = 0; i < nregs; ++i) 521 { 522 regs->start[i] = pmatch[i].rm_so; 523 regs->end[i] = pmatch[i].rm_eo; 524 } 525 for ( ; i < regs->num_regs; ++i) 526 regs->start[i] = regs->end[i] = -1; 527 528 return rval; 529} 530 531/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 533 this memory for recording register information. STARTS and ENDS 534 must be allocated using the malloc library routine, and must each 535 be at least NUM_REGS * sizeof (regoff_t) bytes long. 536 537 If NUM_REGS == 0, then subsequent matches should allocate their own 538 register data. 539 540 Unless this function is called, the first search or match using 541 PATTERN_BUFFER will allocate its own register data, without 542 freeing the old data. */ 543 544void 545re_set_registers (bufp, regs, num_regs, starts, ends) 546 struct re_pattern_buffer *bufp; 547 struct re_registers *regs; 548 unsigned num_regs; 549 regoff_t *starts, *ends; 550{ 551 if (num_regs) 552 { 553 bufp->regs_allocated = REGS_REALLOCATE; 554 regs->num_regs = num_regs; 555 regs->start = starts; 556 regs->end = ends; 557 } 558 else 559 { 560 bufp->regs_allocated = REGS_UNALLOCATED; 561 regs->num_regs = 0; 562 regs->start = regs->end = (regoff_t *) 0; 563 } 564} 565#ifdef _LIBC 566weak_alias (__re_set_registers, re_set_registers) 567#endif 568 569/* Entry points compatible with 4.2 BSD regex library. We don't define 570 them unless specifically requested. */ 571 572#if defined _REGEX_RE_COMP || defined _LIBC 573int 574# ifdef _LIBC 575weak_function 576# endif 577re_exec (s) 578 const char *s; 579{ 580 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 581} 582#endif /* _REGEX_RE_COMP */ 583 584/* Internal entry point. */ 585 586/* Searches for a compiled pattern PREG in the string STRING, whose 587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 588 mingings with regexec. START, and RANGE have the same meanings 589 with re_search. 590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 591 otherwise return the error code. 592 Note: We assume front end functions already check ranges. 593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 594 595static reg_errcode_t 596re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 597 eflags) 598 const regex_t *preg; 599 const char *string; 600 int length, start, range, stop, eflags; 601 size_t nmatch; 602 regmatch_t pmatch[]; 603{ 604 reg_errcode_t err; 605 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 606 int left_lim, right_lim, incr; 607 int fl_longest_match, match_first, match_kind, match_last = -1; 608 int extra_nmatch; 609 int sb, ch; 610#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 611 re_match_context_t mctx = { .dfa = dfa }; 612#else 613 re_match_context_t mctx; 614#endif 615 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate 616 && range && !preg->can_be_null) ? preg->fastmap : NULL; 617 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate; 618 619#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 620 memset (&mctx, '\0', sizeof (re_match_context_t)); 621 mctx.dfa = dfa; 622#endif 623 624 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 625 nmatch -= extra_nmatch; 626 627 /* Check if the DFA haven't been compiled. */ 628 if (BE (preg->used == 0 || dfa->init_state == NULL 629 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 630 || dfa->init_state_begbuf == NULL, 0)) 631 return REG_NOMATCH; 632 633#ifdef DEBUG 634 /* We assume front-end functions already check them. */ 635 assert (start + range >= 0 && start + range <= length); 636#endif 637 638 /* If initial states with non-begbuf contexts have no elements, 639 the regex must be anchored. If preg->newline_anchor is set, 640 we'll never use init_state_nl, so do not check it. */ 641 if (dfa->init_state->nodes.nelem == 0 642 && dfa->init_state_word->nodes.nelem == 0 643 && (dfa->init_state_nl->nodes.nelem == 0 644 || !preg->newline_anchor)) 645 { 646 if (start != 0 && start + range != 0) 647 return REG_NOMATCH; 648 start = range = 0; 649 } 650 651 /* We must check the longest matching, if nmatch > 0. */ 652 fl_longest_match = (nmatch != 0 || dfa->nbackref); 653 654 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 655 preg->translate, preg->syntax & RE_ICASE, dfa); 656 if (BE (err != REG_NOERROR, 0)) 657 goto free_return; 658 mctx.input.stop = stop; 659 mctx.input.raw_stop = stop; 660 mctx.input.newline_anchor = preg->newline_anchor; 661 662 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 663 if (BE (err != REG_NOERROR, 0)) 664 goto free_return; 665 666 /* We will log all the DFA states through which the dfa pass, 667 if nmatch > 1, or this dfa has "multibyte node", which is a 668 back-reference or a node which can accept multibyte character or 669 multi character collating element. */ 670 if (nmatch > 1 || dfa->has_mb_node) 671 { 672 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 673 if (BE (mctx.state_log == NULL, 0)) 674 { 675 err = REG_ESPACE; 676 goto free_return; 677 } 678 } 679 else 680 mctx.state_log = NULL; 681 682 match_first = start; 683 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 684 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 685 686 /* Check incrementally whether of not the input string match. */ 687 incr = (range < 0) ? -1 : 1; 688 left_lim = (range < 0) ? start + range : start; 689 right_lim = (range < 0) ? start : start + range; 690 sb = dfa->mb_cur_max == 1; 691 match_kind = 692 (fastmap 693 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 694 | (range >= 0 ? 2 : 0) 695 | (t != NULL ? 1 : 0)) 696 : 8); 697 698 for (;; match_first += incr) 699 { 700 err = REG_NOMATCH; 701 if (match_first < left_lim || right_lim < match_first) 702 goto free_return; 703 704 /* Advance as rapidly as possible through the string, until we 705 find a plausible place to start matching. This may be done 706 with varying efficiency, so there are various possibilities: 707 only the most common of them are specialized, in order to 708 save on code size. We use a switch statement for speed. */ 709 switch (match_kind) 710 { 711 case 8: 712 /* No fastmap. */ 713 break; 714 715 case 7: 716 /* Fastmap with single-byte translation, match forward. */ 717 while (BE (match_first < right_lim, 1) 718 && !fastmap[t[(unsigned char) string[match_first]]]) 719 ++match_first; 720 goto forward_match_found_start_or_reached_end; 721 722 case 6: 723 /* Fastmap without translation, match forward. */ 724 while (BE (match_first < right_lim, 1) 725 && !fastmap[(unsigned char) string[match_first]]) 726 ++match_first; 727 728 forward_match_found_start_or_reached_end: 729 if (BE (match_first == right_lim, 0)) 730 { 731 ch = match_first >= length 732 ? 0 : (unsigned char) string[match_first]; 733 if (!fastmap[t ? t[ch] : ch]) 734 goto free_return; 735 } 736 break; 737 738 case 4: 739 case 5: 740 /* Fastmap without multi-byte translation, match backwards. */ 741 while (match_first >= left_lim) 742 { 743 ch = match_first >= length 744 ? 0 : (unsigned char) string[match_first]; 745 if (fastmap[t ? t[ch] : ch]) 746 break; 747 --match_first; 748 } 749 if (match_first < left_lim) 750 goto free_return; 751 break; 752 753 default: 754 /* In this case, we can't determine easily the current byte, 755 since it might be a component byte of a multibyte 756 character. Then we use the constructed buffer instead. */ 757 for (;;) 758 { 759 /* If MATCH_FIRST is out of the valid range, reconstruct the 760 buffers. */ 761 unsigned int offset = match_first - mctx.input.raw_mbs_idx; 762 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) 763 { 764 err = re_string_reconstruct (&mctx.input, match_first, 765 eflags); 766 if (BE (err != REG_NOERROR, 0)) 767 goto free_return; 768 769 offset = match_first - mctx.input.raw_mbs_idx; 770 } 771 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 772 Note that MATCH_FIRST must not be smaller than 0. */ 773 ch = (match_first >= length 774 ? 0 : re_string_byte_at (&mctx.input, offset)); 775 if (fastmap[ch]) 776 break; 777 match_first += incr; 778 if (match_first < left_lim || match_first > right_lim) 779 { 780 err = REG_NOMATCH; 781 goto free_return; 782 } 783 } 784 break; 785 } 786 787 /* Reconstruct the buffers so that the matcher can assume that 788 the matching starts from the beginning of the buffer. */ 789 err = re_string_reconstruct (&mctx.input, match_first, eflags); 790 if (BE (err != REG_NOERROR, 0)) 791 goto free_return; 792 793#ifdef RE_ENABLE_I18N 794 /* Don't consider this char as a possible match start if it part, 795 yet isn't the head, of a multibyte character. */ 796 if (!sb && !re_string_first_byte (&mctx.input, 0)) 797 continue; 798#endif 799 800 /* It seems to be appropriate one, then use the matcher. */ 801 /* We assume that the matching starts from 0. */ 802 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 803 match_last = check_matching (&mctx, fl_longest_match, 804 range >= 0 ? &match_first : NULL); 805 if (match_last != -1) 806 { 807 if (BE (match_last == -2, 0)) 808 { 809 err = REG_ESPACE; 810 goto free_return; 811 } 812 else 813 { 814 mctx.match_last = match_last; 815 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 816 { 817 re_dfastate_t *pstate = mctx.state_log[match_last]; 818 mctx.last_node = check_halt_state_context (&mctx, pstate, 819 match_last); 820 } 821 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 822 || dfa->nbackref) 823 { 824 err = prune_impossible_nodes (&mctx); 825 if (err == REG_NOERROR) 826 break; 827 if (BE (err != REG_NOMATCH, 0)) 828 goto free_return; 829 match_last = -1; 830 } 831 else 832 break; /* We found a match. */ 833 } 834 } 835 836 match_ctx_clean (&mctx); 837 } 838 839#ifdef DEBUG 840 assert (match_last != -1); 841 assert (err == REG_NOERROR); 842#endif 843 844 /* Set pmatch[] if we need. */ 845 if (nmatch > 0) 846 { 847 int reg_idx; 848 849 /* Initialize registers. */ 850 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 851 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 852 853 /* Set the points where matching start/end. */ 854 pmatch[0].rm_so = 0; 855 pmatch[0].rm_eo = mctx.match_last; 856 857 if (!preg->no_sub && nmatch > 1) 858 { 859 err = set_regs (preg, &mctx, nmatch, pmatch, 860 dfa->has_plural_match && dfa->nbackref > 0); 861 if (BE (err != REG_NOERROR, 0)) 862 goto free_return; 863 } 864 865 /* At last, add the offset to the each registers, since we slided 866 the buffers so that we could assume that the matching starts 867 from 0. */ 868 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 869 if (pmatch[reg_idx].rm_so != -1) 870 { 871#ifdef RE_ENABLE_I18N 872 if (BE (mctx.input.offsets_needed != 0, 0)) 873 { 874 if (pmatch[reg_idx].rm_so == mctx.input.valid_len) 875 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len; 876 else 877 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so]; 878 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len) 879 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len; 880 else 881 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo]; 882 } 883#else 884 assert (mctx.input.offsets_needed == 0); 885#endif 886 pmatch[reg_idx].rm_so += match_first; 887 pmatch[reg_idx].rm_eo += match_first; 888 } 889 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 890 { 891 pmatch[nmatch + reg_idx].rm_so = -1; 892 pmatch[nmatch + reg_idx].rm_eo = -1; 893 } 894 895 if (dfa->subexp_map) 896 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 897 if (dfa->subexp_map[reg_idx] != reg_idx) 898 { 899 pmatch[reg_idx + 1].rm_so 900 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 901 pmatch[reg_idx + 1].rm_eo 902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 903 } 904 } 905 906 free_return: 907 re_free (mctx.state_log); 908 if (dfa->nbackref) 909 match_ctx_free (&mctx); 910 re_string_destruct (&mctx.input); 911 return err; 912} 913 914static reg_errcode_t 915prune_impossible_nodes (mctx) 916 re_match_context_t *mctx; 917{ 918 re_dfa_t *const dfa = mctx->dfa; 919 int halt_node, match_last; 920 reg_errcode_t ret; 921 re_dfastate_t **sifted_states; 922 re_dfastate_t **lim_states = NULL; 923 re_sift_context_t sctx; 924#ifdef DEBUG 925 assert (mctx->state_log != NULL); 926#endif 927 match_last = mctx->match_last; 928 halt_node = mctx->last_node; 929 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 930 if (BE (sifted_states == NULL, 0)) 931 { 932 ret = REG_ESPACE; 933 goto free_return; 934 } 935 if (dfa->nbackref) 936 { 937 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 938 if (BE (lim_states == NULL, 0)) 939 { 940 ret = REG_ESPACE; 941 goto free_return; 942 } 943 while (1) 944 { 945 memset (lim_states, '\0', 946 sizeof (re_dfastate_t *) * (match_last + 1)); 947 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 948 match_last); 949 ret = sift_states_backward (mctx, &sctx); 950 re_node_set_free (&sctx.limits); 951 if (BE (ret != REG_NOERROR, 0)) 952 goto free_return; 953 if (sifted_states[0] != NULL || lim_states[0] != NULL) 954 break; 955 do 956 { 957 --match_last; 958 if (match_last < 0) 959 { 960 ret = REG_NOMATCH; 961 goto free_return; 962 } 963 } while (mctx->state_log[match_last] == NULL 964 || !mctx->state_log[match_last]->halt); 965 halt_node = check_halt_state_context (mctx, 966 mctx->state_log[match_last], 967 match_last); 968 } 969 ret = merge_state_array (dfa, sifted_states, lim_states, 970 match_last + 1); 971 re_free (lim_states); 972 lim_states = NULL; 973 if (BE (ret != REG_NOERROR, 0)) 974 goto free_return; 975 } 976 else 977 { 978 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 979 ret = sift_states_backward (mctx, &sctx); 980 re_node_set_free (&sctx.limits); 981 if (BE (ret != REG_NOERROR, 0)) 982 goto free_return; 983 } 984 re_free (mctx->state_log); 985 mctx->state_log = sifted_states; 986 sifted_states = NULL; 987 mctx->last_node = halt_node; 988 mctx->match_last = match_last; 989 ret = REG_NOERROR; 990 free_return: 991 re_free (sifted_states); 992 re_free (lim_states); 993 return ret; 994} 995 996/* Acquire an initial state and return it. 997 We must select appropriate initial state depending on the context, 998 since initial states may have constraints like "\<", "^", etc.. */ 999 1000static inline re_dfastate_t * 1001acquire_init_state_context (err, mctx, idx) 1002 reg_errcode_t *err; 1003 const re_match_context_t *mctx; 1004 int idx; 1005{ 1006 re_dfa_t *const dfa = mctx->dfa; 1007 if (dfa->init_state->has_constraint) 1008 { 1009 unsigned int context; 1010 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1011 if (IS_WORD_CONTEXT (context)) 1012 return dfa->init_state_word; 1013 else if (IS_ORDINARY_CONTEXT (context)) 1014 return dfa->init_state; 1015 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1016 return dfa->init_state_begbuf; 1017 else if (IS_NEWLINE_CONTEXT (context)) 1018 return dfa->init_state_nl; 1019 else if (IS_BEGBUF_CONTEXT (context)) 1020 { 1021 /* It is relatively rare case, then calculate on demand. */ 1022 return re_acquire_state_context (err, dfa, 1023 dfa->init_state->entrance_nodes, 1024 context); 1025 } 1026 else 1027 /* Must not happen? */ 1028 return dfa->init_state; 1029 } 1030 else 1031 return dfa->init_state; 1032} 1033 1034/* Check whether the regular expression match input string INPUT or not, 1035 and return the index where the matching end, return -1 if not match, 1036 or return -2 in case of an error. 1037 FL_LONGEST_MATCH means we want the POSIX longest matching. 1038 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1039 next place where we may want to try matching. 1040 Note that the matcher assume that the maching starts from the current 1041 index of the buffer. */ 1042 1043static int 1044check_matching (mctx, fl_longest_match, p_match_first) 1045 re_match_context_t *mctx; 1046 int fl_longest_match; 1047 int *p_match_first; 1048{ 1049 re_dfa_t *const dfa = mctx->dfa; 1050 reg_errcode_t err; 1051 int match = 0; 1052 int match_last = -1; 1053 int cur_str_idx = re_string_cur_idx (&mctx->input); 1054 re_dfastate_t *cur_state; 1055 int at_init_state = p_match_first != NULL; 1056 int next_start_idx = cur_str_idx; 1057 1058 err = REG_NOERROR; 1059 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1060 /* An initial state must not be NULL (invalid). */ 1061 if (BE (cur_state == NULL, 0)) 1062 { 1063 assert (err == REG_ESPACE); 1064 return -2; 1065 } 1066 1067 if (mctx->state_log != NULL) 1068 { 1069 mctx->state_log[cur_str_idx] = cur_state; 1070 1071 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1072 later. E.g. Processing back references. */ 1073 if (BE (dfa->nbackref, 0)) 1074 { 1075 at_init_state = 0; 1076 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1077 if (BE (err != REG_NOERROR, 0)) 1078 return err; 1079 1080 if (cur_state->has_backref) 1081 { 1082 err = transit_state_bkref (mctx, &cur_state->nodes); 1083 if (BE (err != REG_NOERROR, 0)) 1084 return err; 1085 } 1086 } 1087 } 1088 1089 /* If the RE accepts NULL string. */ 1090 if (BE (cur_state->halt, 0)) 1091 { 1092 if (!cur_state->has_constraint 1093 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1094 { 1095 if (!fl_longest_match) 1096 return cur_str_idx; 1097 else 1098 { 1099 match_last = cur_str_idx; 1100 match = 1; 1101 } 1102 } 1103 } 1104 1105 while (!re_string_eoi (&mctx->input)) 1106 { 1107 re_dfastate_t *old_state = cur_state; 1108 int next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1109 1110 if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1111 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1112 && mctx->input.valid_len < mctx->input.len)) 1113 { 1114 err = extend_buffers (mctx); 1115 if (BE (err != REG_NOERROR, 0)) 1116 { 1117 assert (err == REG_ESPACE); 1118 return -2; 1119 } 1120 } 1121 1122 cur_state = transit_state (&err, mctx, cur_state); 1123 if (mctx->state_log != NULL) 1124 cur_state = merge_state_with_log (&err, mctx, cur_state); 1125 1126 if (cur_state == NULL) 1127 { 1128 /* Reached the invalid state or an error. Try to recover a valid 1129 state using the state log, if available and if we have not 1130 already found a valid (even if not the longest) match. */ 1131 if (BE (err != REG_NOERROR, 0)) 1132 return -2; 1133 1134 if (mctx->state_log == NULL 1135 || (match && !fl_longest_match) 1136 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1137 break; 1138 } 1139 1140 if (BE (at_init_state, 0)) 1141 { 1142 if (old_state == cur_state) 1143 next_start_idx = next_char_idx; 1144 else 1145 at_init_state = 0; 1146 } 1147 1148 if (cur_state->halt) 1149 { 1150 /* Reached a halt state. 1151 Check the halt state can satisfy the current context. */ 1152 if (!cur_state->has_constraint 1153 || check_halt_state_context (mctx, cur_state, 1154 re_string_cur_idx (&mctx->input))) 1155 { 1156 /* We found an appropriate halt state. */ 1157 match_last = re_string_cur_idx (&mctx->input); 1158 match = 1; 1159 1160 /* We found a match, do not modify match_first below. */ 1161 p_match_first = NULL; 1162 if (!fl_longest_match) 1163 break; 1164 } 1165 } 1166 } 1167 1168 if (p_match_first) 1169 *p_match_first += next_start_idx; 1170 1171 return match_last; 1172} 1173 1174/* Check NODE match the current context. */ 1175 1176static int check_halt_node_context (dfa, node, context) 1177 const re_dfa_t *dfa; 1178 int node; 1179 unsigned int context; 1180{ 1181 re_token_type_t type = dfa->nodes[node].type; 1182 unsigned int constraint = dfa->nodes[node].constraint; 1183 if (type != END_OF_RE) 1184 return 0; 1185 if (!constraint) 1186 return 1; 1187 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1188 return 0; 1189 return 1; 1190} 1191 1192/* Check the halt state STATE match the current context. 1193 Return 0 if not match, if the node, STATE has, is a halt node and 1194 match the context, return the node. */ 1195 1196static int 1197check_halt_state_context (mctx, state, idx) 1198 const re_match_context_t *mctx; 1199 const re_dfastate_t *state; 1200 int idx; 1201{ 1202 int i; 1203 unsigned int context; 1204#ifdef DEBUG 1205 assert (state->halt); 1206#endif 1207 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1208 for (i = 0; i < state->nodes.nelem; ++i) 1209 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1210 return state->nodes.elems[i]; 1211 return 0; 1212} 1213 1214/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1215 corresponding to the DFA). 1216 Return the destination node, and update EPS_VIA_NODES, return -1 in case 1217 of errors. */ 1218 1219static int 1220proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs) 1221 const re_match_context_t *mctx; 1222 regmatch_t *regs; 1223 int nregs, *pidx, node; 1224 re_node_set *eps_via_nodes; 1225 struct re_fail_stack_t *fs; 1226{ 1227 re_dfa_t *const dfa = mctx->dfa; 1228 int i, err, dest_node; 1229 dest_node = -1; 1230 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1231 { 1232 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1233 re_node_set *edests = &dfa->edests[node]; 1234 int dest_node; 1235 err = re_node_set_insert (eps_via_nodes, node); 1236 if (BE (err < 0, 0)) 1237 return -2; 1238 /* Pick up a valid destination, or return -1 if none is found. */ 1239 for (dest_node = -1, i = 0; i < edests->nelem; ++i) 1240 { 1241 int candidate = edests->elems[i]; 1242 if (!re_node_set_contains (cur_nodes, candidate)) 1243 continue; 1244 if (dest_node == -1) 1245 dest_node = candidate; 1246 1247 else 1248 { 1249 /* In order to avoid infinite loop like "(a*)*", return the second 1250 epsilon-transition if the first was already considered. */ 1251 if (re_node_set_contains (eps_via_nodes, dest_node)) 1252 return candidate; 1253 1254 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1255 else if (fs != NULL 1256 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1257 eps_via_nodes)) 1258 return -2; 1259 1260 /* We know we are going to exit. */ 1261 break; 1262 } 1263 } 1264 return dest_node; 1265 } 1266 else 1267 { 1268 int naccepted = 0; 1269 re_token_type_t type = dfa->nodes[node].type; 1270 1271#ifdef RE_ENABLE_I18N 1272 if (dfa->nodes[node].accept_mb) 1273 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1274 else 1275#endif /* RE_ENABLE_I18N */ 1276 if (type == OP_BACK_REF) 1277 { 1278 int subexp_idx = dfa->nodes[node].opr.idx + 1; 1279 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1280 if (fs != NULL) 1281 { 1282 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1283 return -1; 1284 else if (naccepted) 1285 { 1286 char *buf = (char *) re_string_get_buffer (&mctx->input); 1287 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1288 naccepted) != 0) 1289 return -1; 1290 } 1291 } 1292 1293 if (naccepted == 0) 1294 { 1295 err = re_node_set_insert (eps_via_nodes, node); 1296 if (BE (err < 0, 0)) 1297 return -2; 1298 dest_node = dfa->edests[node].elems[0]; 1299 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1300 dest_node)) 1301 return dest_node; 1302 } 1303 } 1304 1305 if (naccepted != 0 1306 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1307 { 1308 dest_node = dfa->nexts[node]; 1309 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1310 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1311 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1312 dest_node))) 1313 return -1; 1314 re_node_set_empty (eps_via_nodes); 1315 return dest_node; 1316 } 1317 } 1318 return -1; 1319} 1320 1321static reg_errcode_t 1322push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes) 1323 struct re_fail_stack_t *fs; 1324 int str_idx, dest_node, nregs; 1325 regmatch_t *regs; 1326 re_node_set *eps_via_nodes; 1327{ 1328 reg_errcode_t err; 1329 int num = fs->num++; 1330 if (fs->num == fs->alloc) 1331 { 1332 struct re_fail_stack_ent_t *new_array; 1333 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1334 * fs->alloc * 2)); 1335 if (new_array == NULL) 1336 return REG_ESPACE; 1337 fs->alloc *= 2; 1338 fs->stack = new_array; 1339 } 1340 fs->stack[num].idx = str_idx; 1341 fs->stack[num].node = dest_node; 1342 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1343 if (fs->stack[num].regs == NULL) 1344 return REG_ESPACE; 1345 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1346 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1347 return err; 1348} 1349 1350static int 1351pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) 1352 struct re_fail_stack_t *fs; 1353 int *pidx, nregs; 1354 regmatch_t *regs; 1355 re_node_set *eps_via_nodes; 1356{ 1357 int num = --fs->num; 1358 assert (num >= 0); 1359 *pidx = fs->stack[num].idx; 1360 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1361 re_node_set_free (eps_via_nodes); 1362 re_free (fs->stack[num].regs); 1363 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1364 return fs->stack[num].node; 1365} 1366 1367/* Set the positions where the subexpressions are starts/ends to registers 1368 PMATCH. 1369 Note: We assume that pmatch[0] is already set, and 1370 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1371 1372static reg_errcode_t 1373set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) 1374 const regex_t *preg; 1375 const re_match_context_t *mctx; 1376 size_t nmatch; 1377 regmatch_t *pmatch; 1378 int fl_backtrack; 1379{ 1380 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1381 int idx, cur_node; 1382 re_node_set eps_via_nodes; 1383 struct re_fail_stack_t *fs; 1384 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1385 regmatch_t *prev_idx_match; 1386 1387#ifdef DEBUG 1388 assert (nmatch > 1); 1389 assert (mctx->state_log != NULL); 1390#endif 1391 if (fl_backtrack) 1392 { 1393 fs = &fs_body; 1394 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1395 if (fs->stack == NULL) 1396 return REG_ESPACE; 1397 } 1398 else 1399 fs = NULL; 1400 1401 cur_node = dfa->init_node; 1402 re_node_set_init_empty (&eps_via_nodes); 1403 1404 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch); 1405 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1406 1407 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1408 { 1409 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1410 1411 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1412 { 1413 int reg_idx; 1414 if (fs) 1415 { 1416 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1417 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1418 break; 1419 if (reg_idx == nmatch) 1420 { 1421 re_node_set_free (&eps_via_nodes); 1422 return free_fail_stack_return (fs); 1423 } 1424 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1425 &eps_via_nodes); 1426 } 1427 else 1428 { 1429 re_node_set_free (&eps_via_nodes); 1430 return REG_NOERROR; 1431 } 1432 } 1433 1434 /* Proceed to next node. */ 1435 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1436 &eps_via_nodes, fs); 1437 1438 if (BE (cur_node < 0, 0)) 1439 { 1440 if (BE (cur_node == -2, 0)) 1441 { 1442 re_node_set_free (&eps_via_nodes); 1443 free_fail_stack_return (fs); 1444 return REG_ESPACE; 1445 } 1446 if (fs) 1447 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1448 &eps_via_nodes); 1449 else 1450 { 1451 re_node_set_free (&eps_via_nodes); 1452 return REG_NOMATCH; 1453 } 1454 } 1455 } 1456 re_node_set_free (&eps_via_nodes); 1457 return free_fail_stack_return (fs); 1458} 1459 1460static reg_errcode_t 1461free_fail_stack_return (fs) 1462 struct re_fail_stack_t *fs; 1463{ 1464 if (fs) 1465 { 1466 int fs_idx; 1467 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1468 { 1469 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1470 re_free (fs->stack[fs_idx].regs); 1471 } 1472 re_free (fs->stack); 1473 } 1474 return REG_NOERROR; 1475} 1476 1477static void 1478update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch) 1479 re_dfa_t *dfa; 1480 regmatch_t *pmatch, *prev_idx_match; 1481 int cur_node, cur_idx, nmatch; 1482{ 1483 int type = dfa->nodes[cur_node].type; 1484 if (type == OP_OPEN_SUBEXP) 1485 { 1486 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1487 1488 /* We are at the first node of this sub expression. */ 1489 if (reg_num < nmatch) 1490 { 1491 pmatch[reg_num].rm_so = cur_idx; 1492 pmatch[reg_num].rm_eo = -1; 1493 } 1494 } 1495 else if (type == OP_CLOSE_SUBEXP) 1496 { 1497 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1498 if (reg_num < nmatch) 1499 { 1500 /* We are at the last node of this sub expression. */ 1501 if (pmatch[reg_num].rm_so < cur_idx) 1502 { 1503 pmatch[reg_num].rm_eo = cur_idx; 1504 /* This is a non-empty match or we are not inside an optional 1505 subexpression. Accept this right away. */ 1506 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1507 } 1508 else 1509 { 1510 if (dfa->nodes[cur_node].opt_subexp 1511 && prev_idx_match[reg_num].rm_so != -1) 1512 /* We transited through an empty match for an optional 1513 subexpression, like (a?)*, and this is not the subexp's 1514 first match. Copy back the old content of the registers 1515 so that matches of an inner subexpression are undone as 1516 well, like in ((a?))*. */ 1517 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1518 else 1519 /* We completed a subexpression, but it may be part of 1520 an optional one, so do not update PREV_IDX_MATCH. */ 1521 pmatch[reg_num].rm_eo = cur_idx; 1522 } 1523 } 1524 } 1525} 1526 1527/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1528 and sift the nodes in each states according to the following rules. 1529 Updated state_log will be wrote to STATE_LOG. 1530 1531 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1532 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1533 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1534 the LAST_NODE, we throw away the node `a'. 1535 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1536 string `s' and transit to `b': 1537 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1538 away the node `a'. 1539 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1540 thrown away, we throw away the node `a'. 1541 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1542 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1543 node `a'. 1544 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1545 we throw away the node `a'. */ 1546 1547#define STATE_NODE_CONTAINS(state,node) \ 1548 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1549 1550static reg_errcode_t 1551sift_states_backward (mctx, sctx) 1552 re_match_context_t *mctx; 1553 re_sift_context_t *sctx; 1554{ 1555 reg_errcode_t err; 1556 int null_cnt = 0; 1557 int str_idx = sctx->last_str_idx; 1558 re_node_set cur_dest; 1559 1560#ifdef DEBUG 1561 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1562#endif 1563 1564 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1565 transit to the last_node and the last_node itself. */ 1566 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1567 if (BE (err != REG_NOERROR, 0)) 1568 return err; 1569 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1570 if (BE (err != REG_NOERROR, 0)) 1571 goto free_return; 1572 1573 /* Then check each states in the state_log. */ 1574 while (str_idx > 0) 1575 { 1576 /* Update counters. */ 1577 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1578 if (null_cnt > mctx->max_mb_elem_len) 1579 { 1580 memset (sctx->sifted_states, '\0', 1581 sizeof (re_dfastate_t *) * str_idx); 1582 re_node_set_free (&cur_dest); 1583 return REG_NOERROR; 1584 } 1585 re_node_set_empty (&cur_dest); 1586 --str_idx; 1587 1588 if (mctx->state_log[str_idx]) 1589 { 1590 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1591 if (BE (err != REG_NOERROR, 0)) 1592 goto free_return; 1593 } 1594 1595 /* Add all the nodes which satisfy the following conditions: 1596 - It can epsilon transit to a node in CUR_DEST. 1597 - It is in CUR_SRC. 1598 And update state_log. */ 1599 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1600 if (BE (err != REG_NOERROR, 0)) 1601 goto free_return; 1602 } 1603 err = REG_NOERROR; 1604 free_return: 1605 re_node_set_free (&cur_dest); 1606 return err; 1607} 1608 1609static reg_errcode_t 1610build_sifted_states (mctx, sctx, str_idx, cur_dest) 1611 re_match_context_t *mctx; 1612 re_sift_context_t *sctx; 1613 int str_idx; 1614 re_node_set *cur_dest; 1615{ 1616 re_dfa_t *const dfa = mctx->dfa; 1617 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1618 int i; 1619 1620 /* Then build the next sifted state. 1621 We build the next sifted state on `cur_dest', and update 1622 `sifted_states[str_idx]' with `cur_dest'. 1623 Note: 1624 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1625 `cur_src' points the node_set of the old `state_log[str_idx]' 1626 (with the epsilon nodes pre-filtered out). */ 1627 for (i = 0; i < cur_src->nelem; i++) 1628 { 1629 int prev_node = cur_src->elems[i]; 1630 int naccepted = 0; 1631 int ret; 1632 1633#ifdef DEBUG 1634 re_token_type_t type = dfa->nodes[prev_node].type; 1635 assert (!IS_EPSILON_NODE (type)); 1636#endif 1637#ifdef RE_ENABLE_I18N 1638 /* If the node may accept `multi byte'. */ 1639 if (dfa->nodes[prev_node].accept_mb) 1640 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1641 str_idx, sctx->last_str_idx); 1642#endif /* RE_ENABLE_I18N */ 1643 1644 /* We don't check backreferences here. 1645 See update_cur_sifted_state(). */ 1646 if (!naccepted 1647 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1648 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1649 dfa->nexts[prev_node])) 1650 naccepted = 1; 1651 1652 if (naccepted == 0) 1653 continue; 1654 1655 if (sctx->limits.nelem) 1656 { 1657 int to_idx = str_idx + naccepted; 1658 if (check_dst_limits (mctx, &sctx->limits, 1659 dfa->nexts[prev_node], to_idx, 1660 prev_node, str_idx)) 1661 continue; 1662 } 1663 ret = re_node_set_insert (cur_dest, prev_node); 1664 if (BE (ret == -1, 0)) 1665 return REG_ESPACE; 1666 } 1667 1668 return REG_NOERROR; 1669} 1670 1671/* Helper functions. */ 1672 1673static reg_errcode_t 1674clean_state_log_if_needed (mctx, next_state_log_idx) 1675 re_match_context_t *mctx; 1676 int next_state_log_idx; 1677{ 1678 int top = mctx->state_log_top; 1679 1680 if (next_state_log_idx >= mctx->input.bufs_len 1681 || (next_state_log_idx >= mctx->input.valid_len 1682 && mctx->input.valid_len < mctx->input.len)) 1683 { 1684 reg_errcode_t err; 1685 err = extend_buffers (mctx); 1686 if (BE (err != REG_NOERROR, 0)) 1687 return err; 1688 } 1689 1690 if (top < next_state_log_idx) 1691 { 1692 memset (mctx->state_log + top + 1, '\0', 1693 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1694 mctx->state_log_top = next_state_log_idx; 1695 } 1696 return REG_NOERROR; 1697} 1698 1699static reg_errcode_t 1700merge_state_array (dfa, dst, src, num) 1701 re_dfa_t *dfa; 1702 re_dfastate_t **dst; 1703 re_dfastate_t **src; 1704 int num; 1705{ 1706 int st_idx; 1707 reg_errcode_t err; 1708 for (st_idx = 0; st_idx < num; ++st_idx) 1709 { 1710 if (dst[st_idx] == NULL) 1711 dst[st_idx] = src[st_idx]; 1712 else if (src[st_idx] != NULL) 1713 { 1714 re_node_set merged_set; 1715 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1716 &src[st_idx]->nodes); 1717 if (BE (err != REG_NOERROR, 0)) 1718 return err; 1719 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1720 re_node_set_free (&merged_set); 1721 if (BE (err != REG_NOERROR, 0)) 1722 return err; 1723 } 1724 } 1725 return REG_NOERROR; 1726} 1727 1728static reg_errcode_t 1729update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes) 1730 re_match_context_t *mctx; 1731 re_sift_context_t *sctx; 1732 int str_idx; 1733 re_node_set *dest_nodes; 1734{ 1735 re_dfa_t *const dfa = mctx->dfa; 1736 reg_errcode_t err; 1737 const re_node_set *candidates; 1738 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1739 : &mctx->state_log[str_idx]->nodes); 1740 1741 if (dest_nodes->nelem == 0) 1742 sctx->sifted_states[str_idx] = NULL; 1743 else 1744 { 1745 if (candidates) 1746 { 1747 /* At first, add the nodes which can epsilon transit to a node in 1748 DEST_NODE. */ 1749 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1750 if (BE (err != REG_NOERROR, 0)) 1751 return err; 1752 1753 /* Then, check the limitations in the current sift_context. */ 1754 if (sctx->limits.nelem) 1755 { 1756 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1757 mctx->bkref_ents, str_idx); 1758 if (BE (err != REG_NOERROR, 0)) 1759 return err; 1760 } 1761 } 1762 1763 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1764 if (BE (err != REG_NOERROR, 0)) 1765 return err; 1766 } 1767 1768 if (candidates && mctx->state_log[str_idx]->has_backref) 1769 { 1770 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1771 if (BE (err != REG_NOERROR, 0)) 1772 return err; 1773 } 1774 return REG_NOERROR; 1775} 1776 1777static reg_errcode_t 1778add_epsilon_src_nodes (dfa, dest_nodes, candidates) 1779 re_dfa_t *dfa; 1780 re_node_set *dest_nodes; 1781 const re_node_set *candidates; 1782{ 1783 reg_errcode_t err = REG_NOERROR; 1784 int i; 1785 1786 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1787 if (BE (err != REG_NOERROR, 0)) 1788 return err; 1789 1790 if (!state->inveclosure.alloc) 1791 { 1792 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1793 if (BE (err != REG_NOERROR, 0)) 1794 return REG_ESPACE; 1795 for (i = 0; i < dest_nodes->nelem; i++) 1796 re_node_set_merge (&state->inveclosure, 1797 dfa->inveclosures + dest_nodes->elems[i]); 1798 } 1799 return re_node_set_add_intersect (dest_nodes, candidates, 1800 &state->inveclosure); 1801} 1802 1803static reg_errcode_t 1804sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) 1805 re_dfa_t *dfa; 1806 int node; 1807 re_node_set *dest_nodes; 1808 const re_node_set *candidates; 1809{ 1810 int ecl_idx; 1811 reg_errcode_t err; 1812 re_node_set *inv_eclosure = dfa->inveclosures + node; 1813 re_node_set except_nodes; 1814 re_node_set_init_empty (&except_nodes); 1815 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1816 { 1817 int cur_node = inv_eclosure->elems[ecl_idx]; 1818 if (cur_node == node) 1819 continue; 1820 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1821 { 1822 int edst1 = dfa->edests[cur_node].elems[0]; 1823 int edst2 = ((dfa->edests[cur_node].nelem > 1) 1824 ? dfa->edests[cur_node].elems[1] : -1); 1825 if ((!re_node_set_contains (inv_eclosure, edst1) 1826 && re_node_set_contains (dest_nodes, edst1)) 1827 || (edst2 > 0 1828 && !re_node_set_contains (inv_eclosure, edst2) 1829 && re_node_set_contains (dest_nodes, edst2))) 1830 { 1831 err = re_node_set_add_intersect (&except_nodes, candidates, 1832 dfa->inveclosures + cur_node); 1833 if (BE (err != REG_NOERROR, 0)) 1834 { 1835 re_node_set_free (&except_nodes); 1836 return err; 1837 } 1838 } 1839 } 1840 } 1841 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1842 { 1843 int cur_node = inv_eclosure->elems[ecl_idx]; 1844 if (!re_node_set_contains (&except_nodes, cur_node)) 1845 { 1846 int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1847 re_node_set_remove_at (dest_nodes, idx); 1848 } 1849 } 1850 re_node_set_free (&except_nodes); 1851 return REG_NOERROR; 1852} 1853 1854static int 1855check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) 1856 re_match_context_t *mctx; 1857 re_node_set *limits; 1858 int dst_node, dst_idx, src_node, src_idx; 1859{ 1860 re_dfa_t *const dfa = mctx->dfa; 1861 int lim_idx, src_pos, dst_pos; 1862 1863 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1864 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1865 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1866 { 1867 int subexp_idx; 1868 struct re_backref_cache_entry *ent; 1869 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1870 subexp_idx = dfa->nodes[ent->node].opr.idx; 1871 1872 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1873 subexp_idx, dst_node, dst_idx, 1874 dst_bkref_idx); 1875 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1876 subexp_idx, src_node, src_idx, 1877 src_bkref_idx); 1878 1879 /* In case of: 1880 <src> <dst> ( <subexp> ) 1881 ( <subexp> ) <src> <dst> 1882 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1883 if (src_pos == dst_pos) 1884 continue; /* This is unrelated limitation. */ 1885 else 1886 return 1; 1887 } 1888 return 0; 1889} 1890 1891static int 1892check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) 1893 re_match_context_t *mctx; 1894 int boundaries, subexp_idx, from_node, bkref_idx; 1895{ 1896 re_dfa_t *const dfa = mctx->dfa; 1897 re_node_set *eclosures = dfa->eclosures + from_node; 1898 int node_idx; 1899 1900 /* Else, we are on the boundary: examine the nodes on the epsilon 1901 closure. */ 1902 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1903 { 1904 int node = eclosures->elems[node_idx]; 1905 switch (dfa->nodes[node].type) 1906 { 1907 case OP_BACK_REF: 1908 if (bkref_idx != -1) 1909 { 1910 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1911 do 1912 { 1913 int dst, cpos; 1914 1915 if (ent->node != node) 1916 continue; 1917 1918 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map) 1919 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx))) 1920 continue; 1921 1922 /* Recurse trying to reach the OP_OPEN_SUBEXP and 1923 OP_CLOSE_SUBEXP cases below. But, if the 1924 destination node is the same node as the source 1925 node, don't recurse because it would cause an 1926 infinite loop: a regex that exhibits this behavior 1927 is ()\1*\1* */ 1928 dst = dfa->edests[node].elems[0]; 1929 if (dst == from_node) 1930 { 1931 if (boundaries & 1) 1932 return -1; 1933 else /* if (boundaries & 2) */ 1934 return 0; 1935 } 1936 1937 cpos = 1938 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1939 dst, bkref_idx); 1940 if (cpos == -1 /* && (boundaries & 1) */) 1941 return -1; 1942 if (cpos == 0 && (boundaries & 2)) 1943 return 0; 1944 1945 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx); 1946 } 1947 while (ent++->more); 1948 } 1949 break; 1950 1951 case OP_OPEN_SUBEXP: 1952 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 1953 return -1; 1954 break; 1955 1956 case OP_CLOSE_SUBEXP: 1957 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 1958 return 0; 1959 break; 1960 1961 default: 1962 break; 1963 } 1964 } 1965 1966 return (boundaries & 2) ? 1 : 0; 1967} 1968 1969static int 1970check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx) 1971 re_match_context_t *mctx; 1972 int limit, subexp_idx, from_node, str_idx, bkref_idx; 1973{ 1974 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 1975 int boundaries; 1976 1977 /* If we are outside the range of the subexpression, return -1 or 1. */ 1978 if (str_idx < lim->subexp_from) 1979 return -1; 1980 1981 if (lim->subexp_to < str_idx) 1982 return 1; 1983 1984 /* If we are within the subexpression, return 0. */ 1985 boundaries = (str_idx == lim->subexp_from); 1986 boundaries |= (str_idx == lim->subexp_to) << 1; 1987 if (boundaries == 0) 1988 return 0; 1989 1990 /* Else, examine epsilon closure. */ 1991 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1992 from_node, bkref_idx); 1993} 1994 1995/* Check the limitations of sub expressions LIMITS, and remove the nodes 1996 which are against limitations from DEST_NODES. */ 1997 1998static reg_errcode_t 1999check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) 2000 re_dfa_t *dfa; 2001 re_node_set *dest_nodes; 2002 const re_node_set *candidates; 2003 re_node_set *limits; 2004 struct re_backref_cache_entry *bkref_ents; 2005 int str_idx; 2006{ 2007 reg_errcode_t err; 2008 int node_idx, lim_idx; 2009 2010 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2011 { 2012 int subexp_idx; 2013 struct re_backref_cache_entry *ent; 2014 ent = bkref_ents + limits->elems[lim_idx]; 2015 2016 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2017 continue; /* This is unrelated limitation. */ 2018 2019 subexp_idx = dfa->nodes[ent->node].opr.idx; 2020 if (ent->subexp_to == str_idx) 2021 { 2022 int ops_node = -1; 2023 int cls_node = -1; 2024 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2025 { 2026 int node = dest_nodes->elems[node_idx]; 2027 re_token_type_t type = dfa->nodes[node].type; 2028 if (type == OP_OPEN_SUBEXP 2029 && subexp_idx == dfa->nodes[node].opr.idx) 2030 ops_node = node; 2031 else if (type == OP_CLOSE_SUBEXP 2032 && subexp_idx == dfa->nodes[node].opr.idx) 2033 cls_node = node; 2034 } 2035 2036 /* Check the limitation of the open subexpression. */ 2037 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2038 if (ops_node >= 0) 2039 { 2040 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2041 candidates); 2042 if (BE (err != REG_NOERROR, 0)) 2043 return err; 2044 } 2045 2046 /* Check the limitation of the close subexpression. */ 2047 if (cls_node >= 0) 2048 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2049 { 2050 int node = dest_nodes->elems[node_idx]; 2051 if (!re_node_set_contains (dfa->inveclosures + node, 2052 cls_node) 2053 && !re_node_set_contains (dfa->eclosures + node, 2054 cls_node)) 2055 { 2056 /* It is against this limitation. 2057 Remove it form the current sifted state. */ 2058 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2059 candidates); 2060 if (BE (err != REG_NOERROR, 0)) 2061 return err; 2062 --node_idx; 2063 } 2064 } 2065 } 2066 else /* (ent->subexp_to != str_idx) */ 2067 { 2068 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2069 { 2070 int node = dest_nodes->elems[node_idx]; 2071 re_token_type_t type = dfa->nodes[node].type; 2072 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2073 { 2074 if (subexp_idx != dfa->nodes[node].opr.idx) 2075 continue; 2076 /* It is against this limitation. 2077 Remove it form the current sifted state. */ 2078 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2079 candidates); 2080 if (BE (err != REG_NOERROR, 0)) 2081 return err; 2082 } 2083 } 2084 } 2085 } 2086 return REG_NOERROR; 2087} 2088 2089static reg_errcode_t 2090sift_states_bkref (mctx, sctx, str_idx, candidates) 2091 re_match_context_t *mctx; 2092 re_sift_context_t *sctx; 2093 int str_idx; 2094 const re_node_set *candidates; 2095{ 2096 re_dfa_t *const dfa = mctx->dfa; 2097 reg_errcode_t err; 2098 int node_idx, node; 2099 re_sift_context_t local_sctx; 2100 int first_idx = search_cur_bkref_entry (mctx, str_idx); 2101 2102 if (first_idx == -1) 2103 return REG_NOERROR; 2104 2105 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2106 2107 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2108 { 2109 int enabled_idx; 2110 re_token_type_t type; 2111 struct re_backref_cache_entry *entry; 2112 node = candidates->elems[node_idx]; 2113 type = dfa->nodes[node].type; 2114 /* Avoid infinite loop for the REs like "()\1+". */ 2115 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2116 continue; 2117 if (type != OP_BACK_REF) 2118 continue; 2119 2120 entry = mctx->bkref_ents + first_idx; 2121 enabled_idx = first_idx; 2122 do 2123 { 2124 int subexp_len, to_idx, dst_node; 2125 re_dfastate_t *cur_state; 2126 2127 if (entry->node != node) 2128 continue; 2129 subexp_len = entry->subexp_to - entry->subexp_from; 2130 to_idx = str_idx + subexp_len; 2131 dst_node = (subexp_len ? dfa->nexts[node] 2132 : dfa->edests[node].elems[0]); 2133 2134 if (to_idx > sctx->last_str_idx 2135 || sctx->sifted_states[to_idx] == NULL 2136 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2137 || check_dst_limits (mctx, &sctx->limits, node, 2138 str_idx, dst_node, to_idx)) 2139 continue; 2140 2141 if (local_sctx.sifted_states == NULL) 2142 { 2143 local_sctx = *sctx; 2144 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2145 if (BE (err != REG_NOERROR, 0)) 2146 goto free_return; 2147 } 2148 local_sctx.last_node = node; 2149 local_sctx.last_str_idx = str_idx; 2150 err = re_node_set_insert (&local_sctx.limits, enabled_idx); 2151 if (BE (err < 0, 0)) 2152 { 2153 err = REG_ESPACE; 2154 goto free_return; 2155 } 2156 cur_state = local_sctx.sifted_states[str_idx]; 2157 err = sift_states_backward (mctx, &local_sctx); 2158 if (BE (err != REG_NOERROR, 0)) 2159 goto free_return; 2160 if (sctx->limited_states != NULL) 2161 { 2162 err = merge_state_array (dfa, sctx->limited_states, 2163 local_sctx.sifted_states, 2164 str_idx + 1); 2165 if (BE (err != REG_NOERROR, 0)) 2166 goto free_return; 2167 } 2168 local_sctx.sifted_states[str_idx] = cur_state; 2169 re_node_set_remove (&local_sctx.limits, enabled_idx); 2170 2171 /* mctx->bkref_ents may have changed, reload the pointer. */ 2172 entry = mctx->bkref_ents + enabled_idx; 2173 } 2174 while (enabled_idx++, entry++->more); 2175 } 2176 err = REG_NOERROR; 2177 free_return: 2178 if (local_sctx.sifted_states != NULL) 2179 { 2180 re_node_set_free (&local_sctx.limits); 2181 } 2182 2183 return err; 2184} 2185 2186 2187#ifdef RE_ENABLE_I18N 2188static int 2189sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx) 2190 const re_match_context_t *mctx; 2191 re_sift_context_t *sctx; 2192 int node_idx, str_idx, max_str_idx; 2193{ 2194 re_dfa_t *const dfa = mctx->dfa; 2195 int naccepted; 2196 /* Check the node can accept `multi byte'. */ 2197 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2198 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2199 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2200 dfa->nexts[node_idx])) 2201 /* The node can't accept the `multi byte', or the 2202 destination was already thrown away, then the node 2203 could't accept the current input `multi byte'. */ 2204 naccepted = 0; 2205 /* Otherwise, it is sure that the node could accept 2206 `naccepted' bytes input. */ 2207 return naccepted; 2208} 2209#endif /* RE_ENABLE_I18N */ 2210 2211 2212/* Functions for state transition. */ 2213 2214/* Return the next state to which the current state STATE will transit by 2215 accepting the current input byte, and update STATE_LOG if necessary. 2216 If STATE can accept a multibyte char/collating element/back reference 2217 update the destination of STATE_LOG. */ 2218 2219static re_dfastate_t * 2220transit_state (err, mctx, state) 2221 reg_errcode_t *err; 2222 re_match_context_t *mctx; 2223 re_dfastate_t *state; 2224{ 2225 re_dfastate_t **trtable; 2226 unsigned char ch; 2227 2228#ifdef RE_ENABLE_I18N 2229 /* If the current state can accept multibyte. */ 2230 if (BE (state->accept_mb, 0)) 2231 { 2232 *err = transit_state_mb (mctx, state); 2233 if (BE (*err != REG_NOERROR, 0)) 2234 return NULL; 2235 } 2236#endif /* RE_ENABLE_I18N */ 2237 2238 /* Then decide the next state with the single byte. */ 2239#if 0 2240 if (0) 2241 /* don't use transition table */ 2242 return transit_state_sb (err, mctx, state); 2243#endif 2244 2245 /* Use transition table */ 2246 ch = re_string_fetch_byte (&mctx->input); 2247 for (;;) 2248 { 2249 trtable = state->trtable; 2250 if (BE (trtable != NULL, 1)) 2251 return trtable[ch]; 2252 2253 trtable = state->word_trtable; 2254 if (BE (trtable != NULL, 1)) 2255 { 2256 unsigned int context; 2257 context 2258 = re_string_context_at (&mctx->input, 2259 re_string_cur_idx (&mctx->input) - 1, 2260 mctx->eflags); 2261 if (IS_WORD_CONTEXT (context)) 2262 return trtable[ch + SBC_MAX]; 2263 else 2264 return trtable[ch]; 2265 } 2266 2267 if (!build_trtable (mctx->dfa, state)) 2268 { 2269 *err = REG_ESPACE; 2270 return NULL; 2271 } 2272 2273 /* Retry, we now have a transition table. */ 2274 } 2275} 2276 2277/* Update the state_log if we need */ 2278re_dfastate_t * 2279merge_state_with_log (err, mctx, next_state) 2280 reg_errcode_t *err; 2281 re_match_context_t *mctx; 2282 re_dfastate_t *next_state; 2283{ 2284 re_dfa_t *const dfa = mctx->dfa; 2285 int cur_idx = re_string_cur_idx (&mctx->input); 2286 2287 if (cur_idx > mctx->state_log_top) 2288 { 2289 mctx->state_log[cur_idx] = next_state; 2290 mctx->state_log_top = cur_idx; 2291 } 2292 else if (mctx->state_log[cur_idx] == 0) 2293 { 2294 mctx->state_log[cur_idx] = next_state; 2295 } 2296 else 2297 { 2298 re_dfastate_t *pstate; 2299 unsigned int context; 2300 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2302 the destination of a multibyte char/collating element/ 2303 back reference. Then the next state is the union set of 2304 these destinations and the results of the transition table. */ 2305 pstate = mctx->state_log[cur_idx]; 2306 log_nodes = pstate->entrance_nodes; 2307 if (next_state != NULL) 2308 { 2309 table_nodes = next_state->entrance_nodes; 2310 *err = re_node_set_init_union (&next_nodes, table_nodes, 2311 log_nodes); 2312 if (BE (*err != REG_NOERROR, 0)) 2313 return NULL; 2314 } 2315 else 2316 next_nodes = *log_nodes; 2317 /* Note: We already add the nodes of the initial state, 2318 then we don't need to add them here. */ 2319 2320 context = re_string_context_at (&mctx->input, 2321 re_string_cur_idx (&mctx->input) - 1, 2322 mctx->eflags); 2323 next_state = mctx->state_log[cur_idx] 2324 = re_acquire_state_context (err, dfa, &next_nodes, context); 2325 /* We don't need to check errors here, since the return value of 2326 this function is next_state and ERR is already set. */ 2327 2328 if (table_nodes != NULL) 2329 re_node_set_free (&next_nodes); 2330 } 2331 2332 if (BE (dfa->nbackref, 0) && next_state != NULL) 2333 { 2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2335 later. We must check them here, since the back references in the 2336 next state might use them. */ 2337 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2338 cur_idx); 2339 if (BE (*err != REG_NOERROR, 0)) 2340 return NULL; 2341 2342 /* If the next state has back references. */ 2343 if (next_state->has_backref) 2344 { 2345 *err = transit_state_bkref (mctx, &next_state->nodes); 2346 if (BE (*err != REG_NOERROR, 0)) 2347 return NULL; 2348 next_state = mctx->state_log[cur_idx]; 2349 } 2350 } 2351 2352 return next_state; 2353} 2354 2355/* Skip bytes in the input that correspond to part of a 2356 multi-byte match, then look in the log for a state 2357 from which to restart matching. */ 2358re_dfastate_t * 2359find_recover_state (err, mctx) 2360 reg_errcode_t *err; 2361 re_match_context_t *mctx; 2362{ 2363 re_dfastate_t *cur_state = NULL; 2364 do 2365 { 2366 int max = mctx->state_log_top; 2367 int cur_str_idx = re_string_cur_idx (&mctx->input); 2368 2369 do 2370 { 2371 if (++cur_str_idx > max) 2372 return NULL; 2373 re_string_skip_bytes (&mctx->input, 1); 2374 } 2375 while (mctx->state_log[cur_str_idx] == NULL); 2376 2377 cur_state = merge_state_with_log (err, mctx, NULL); 2378 } 2379 while (err == REG_NOERROR && cur_state == NULL); 2380 return cur_state; 2381} 2382 2383/* Helper functions for transit_state. */ 2384 2385/* From the node set CUR_NODES, pick up the nodes whose types are 2386 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2387 expression. And register them to use them later for evaluating the 2388 correspoding back references. */ 2389 2390static reg_errcode_t 2391check_subexp_matching_top (mctx, cur_nodes, str_idx) 2392 re_match_context_t *mctx; 2393 re_node_set *cur_nodes; 2394 int str_idx; 2395{ 2396 re_dfa_t *const dfa = mctx->dfa; 2397 int node_idx; 2398 reg_errcode_t err; 2399 2400 /* TODO: This isn't efficient. 2401 Because there might be more than one nodes whose types are 2402 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2403 nodes. 2404 E.g. RE: (a){2} */ 2405 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2406 { 2407 int node = cur_nodes->elems[node_idx]; 2408 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2409 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map)) 2410 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) 2411 { 2412 err = match_ctx_add_subtop (mctx, node, str_idx); 2413 if (BE (err != REG_NOERROR, 0)) 2414 return err; 2415 } 2416 } 2417 return REG_NOERROR; 2418} 2419 2420#if 0 2421/* Return the next state to which the current state STATE will transit by 2422 accepting the current input byte. */ 2423 2424static re_dfastate_t * 2425transit_state_sb (err, mctx, state) 2426 reg_errcode_t *err; 2427 re_match_context_t *mctx; 2428 re_dfastate_t *state; 2429{ 2430 re_dfa_t *const dfa = mctx->dfa; 2431 re_node_set next_nodes; 2432 re_dfastate_t *next_state; 2433 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2434 unsigned int context; 2435 2436 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2437 if (BE (*err != REG_NOERROR, 0)) 2438 return NULL; 2439 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2440 { 2441 int cur_node = state->nodes.elems[node_cnt]; 2442 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2443 { 2444 *err = re_node_set_merge (&next_nodes, 2445 dfa->eclosures + dfa->nexts[cur_node]); 2446 if (BE (*err != REG_NOERROR, 0)) 2447 { 2448 re_node_set_free (&next_nodes); 2449 return NULL; 2450 } 2451 } 2452 } 2453 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2454 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2455 /* We don't need to check errors here, since the return value of 2456 this function is next_state and ERR is already set. */ 2457 2458 re_node_set_free (&next_nodes); 2459 re_string_skip_bytes (&mctx->input, 1); 2460 return next_state; 2461} 2462#endif 2463 2464#ifdef RE_ENABLE_I18N 2465static reg_errcode_t 2466transit_state_mb (mctx, pstate) 2467 re_match_context_t *mctx; 2468 re_dfastate_t *pstate; 2469{ 2470 re_dfa_t *const dfa = mctx->dfa; 2471 reg_errcode_t err; 2472 int i; 2473 2474 for (i = 0; i < pstate->nodes.nelem; ++i) 2475 { 2476 re_node_set dest_nodes, *new_nodes; 2477 int cur_node_idx = pstate->nodes.elems[i]; 2478 int naccepted, dest_idx; 2479 unsigned int context; 2480 re_dfastate_t *dest_state; 2481 2482 if (!dfa->nodes[cur_node_idx].accept_mb) 2483 continue; 2484 2485 if (dfa->nodes[cur_node_idx].constraint) 2486 { 2487 context = re_string_context_at (&mctx->input, 2488 re_string_cur_idx (&mctx->input), 2489 mctx->eflags); 2490 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2491 context)) 2492 continue; 2493 } 2494 2495 /* How many bytes the node can accept? */ 2496 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2497 re_string_cur_idx (&mctx->input)); 2498 if (naccepted == 0) 2499 continue; 2500 2501 /* The node can accepts `naccepted' bytes. */ 2502 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2503 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2504 : mctx->max_mb_elem_len); 2505 err = clean_state_log_if_needed (mctx, dest_idx); 2506 if (BE (err != REG_NOERROR, 0)) 2507 return err; 2508#ifdef DEBUG 2509 assert (dfa->nexts[cur_node_idx] != -1); 2510#endif 2511 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2512 2513 dest_state = mctx->state_log[dest_idx]; 2514 if (dest_state == NULL) 2515 dest_nodes = *new_nodes; 2516 else 2517 { 2518 err = re_node_set_init_union (&dest_nodes, 2519 dest_state->entrance_nodes, new_nodes); 2520 if (BE (err != REG_NOERROR, 0)) 2521 return err; 2522 } 2523 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags); 2524 mctx->state_log[dest_idx] 2525 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2526 if (dest_state != NULL) 2527 re_node_set_free (&dest_nodes); 2528 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2529 return err; 2530 } 2531 return REG_NOERROR; 2532} 2533#endif /* RE_ENABLE_I18N */ 2534 2535static reg_errcode_t 2536transit_state_bkref (mctx, nodes) 2537 re_match_context_t *mctx; 2538 const re_node_set *nodes; 2539{ 2540 re_dfa_t *const dfa = mctx->dfa; 2541 reg_errcode_t err; 2542 int i; 2543 int cur_str_idx = re_string_cur_idx (&mctx->input); 2544 2545 for (i = 0; i < nodes->nelem; ++i) 2546 { 2547 int dest_str_idx, prev_nelem, bkc_idx; 2548 int node_idx = nodes->elems[i]; 2549 unsigned int context; 2550 const re_token_t *node = dfa->nodes + node_idx; 2551 re_node_set *new_dest_nodes; 2552 2553 /* Check whether `node' is a backreference or not. */ 2554 if (node->type != OP_BACK_REF) 2555 continue; 2556 2557 if (node->constraint) 2558 { 2559 context = re_string_context_at (&mctx->input, cur_str_idx, 2560 mctx->eflags); 2561 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2562 continue; 2563 } 2564 2565 /* `node' is a backreference. 2566 Check the substring which the substring matched. */ 2567 bkc_idx = mctx->nbkref_ents; 2568 err = get_subexp (mctx, node_idx, cur_str_idx); 2569 if (BE (err != REG_NOERROR, 0)) 2570 goto free_return; 2571 2572 /* And add the epsilon closures (which is `new_dest_nodes') of 2573 the backreference to appropriate state_log. */ 2574#ifdef DEBUG 2575 assert (dfa->nexts[node_idx] != -1); 2576#endif 2577 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2578 { 2579 int subexp_len; 2580 re_dfastate_t *dest_state; 2581 struct re_backref_cache_entry *bkref_ent; 2582 bkref_ent = mctx->bkref_ents + bkc_idx; 2583 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2584 continue; 2585 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2586 new_dest_nodes = (subexp_len == 0 2587 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2588 : dfa->eclosures + dfa->nexts[node_idx]); 2589 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2590 - bkref_ent->subexp_from); 2591 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2592 mctx->eflags); 2593 dest_state = mctx->state_log[dest_str_idx]; 2594 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2595 : mctx->state_log[cur_str_idx]->nodes.nelem); 2596 /* Add `new_dest_node' to state_log. */ 2597 if (dest_state == NULL) 2598 { 2599 mctx->state_log[dest_str_idx] 2600 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2601 context); 2602 if (BE (mctx->state_log[dest_str_idx] == NULL 2603 && err != REG_NOERROR, 0)) 2604 goto free_return; 2605 } 2606 else 2607 { 2608 re_node_set dest_nodes; 2609 err = re_node_set_init_union (&dest_nodes, 2610 dest_state->entrance_nodes, 2611 new_dest_nodes); 2612 if (BE (err != REG_NOERROR, 0)) 2613 { 2614 re_node_set_free (&dest_nodes); 2615 goto free_return; 2616 } 2617 mctx->state_log[dest_str_idx] 2618 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2619 re_node_set_free (&dest_nodes); 2620 if (BE (mctx->state_log[dest_str_idx] == NULL 2621 && err != REG_NOERROR, 0)) 2622 goto free_return; 2623 } 2624 /* We need to check recursively if the backreference can epsilon 2625 transit. */ 2626 if (subexp_len == 0 2627 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2628 { 2629 err = check_subexp_matching_top (mctx, new_dest_nodes, 2630 cur_str_idx); 2631 if (BE (err != REG_NOERROR, 0)) 2632 goto free_return; 2633 err = transit_state_bkref (mctx, new_dest_nodes); 2634 if (BE (err != REG_NOERROR, 0)) 2635 goto free_return; 2636 } 2637 } 2638 } 2639 err = REG_NOERROR; 2640 free_return: 2641 return err; 2642} 2643 2644/* Enumerate all the candidates which the backreference BKREF_NODE can match 2645 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2646 Note that we might collect inappropriate candidates here. 2647 However, the cost of checking them strictly here is too high, then we 2648 delay these checking for prune_impossible_nodes(). */ 2649 2650static reg_errcode_t 2651get_subexp (mctx, bkref_node, bkref_str_idx) 2652 re_match_context_t *mctx; 2653 int bkref_node, bkref_str_idx; 2654{ 2655 re_dfa_t *const dfa = mctx->dfa; 2656 int subexp_num, sub_top_idx; 2657 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2658 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2659 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2660 if (cache_idx != -1) 2661 { 2662 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; 2663 do 2664 if (entry->node == bkref_node) 2665 return REG_NOERROR; /* We already checked it. */ 2666 while (entry++->more); 2667 } 2668 2669 subexp_num = dfa->nodes[bkref_node].opr.idx; 2670 2671 /* For each sub expression */ 2672 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2673 { 2674 reg_errcode_t err; 2675 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2676 re_sub_match_last_t *sub_last; 2677 int sub_last_idx, sl_str, bkref_str_off; 2678 2679 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2680 continue; /* It isn't related. */ 2681 2682 sl_str = sub_top->str_idx; 2683 bkref_str_off = bkref_str_idx; 2684 /* At first, check the last node of sub expressions we already 2685 evaluated. */ 2686 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2687 { 2688 int sl_str_diff; 2689 sub_last = sub_top->lasts[sub_last_idx]; 2690 sl_str_diff = sub_last->str_idx - sl_str; 2691 /* The matched string by the sub expression match with the substring 2692 at the back reference? */ 2693 if (sl_str_diff > 0) 2694 { 2695 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2696 { 2697 /* Not enough chars for a successful match. */ 2698 if (bkref_str_off + sl_str_diff > mctx->input.len) 2699 break; 2700 2701 err = clean_state_log_if_needed (mctx, 2702 bkref_str_off 2703 + sl_str_diff); 2704 if (BE (err != REG_NOERROR, 0)) 2705 return err; 2706 buf = (const char *) re_string_get_buffer (&mctx->input); 2707 } 2708 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2709 break; /* We don't need to search this sub expression any more. */ 2710 } 2711 bkref_str_off += sl_str_diff; 2712 sl_str += sl_str_diff; 2713 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2714 bkref_str_idx); 2715 2716 /* Reload buf, since the preceding call might have reallocated 2717 the buffer. */ 2718 buf = (const char *) re_string_get_buffer (&mctx->input); 2719 2720 if (err == REG_NOMATCH) 2721 continue; 2722 if (BE (err != REG_NOERROR, 0)) 2723 return err; 2724 } 2725 2726 if (sub_last_idx < sub_top->nlasts) 2727 continue; 2728 if (sub_last_idx > 0) 2729 ++sl_str; 2730 /* Then, search for the other last nodes of the sub expression. */ 2731 for (; sl_str <= bkref_str_idx; ++sl_str) 2732 { 2733 int cls_node, sl_str_off; 2734 const re_node_set *nodes; 2735 sl_str_off = sl_str - sub_top->str_idx; 2736 /* The matched string by the sub expression match with the substring 2737 at the back reference? */ 2738 if (sl_str_off > 0) 2739 { 2740 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2741 { 2742 /* If we are at the end of the input, we cannot match. */ 2743 if (bkref_str_off >= mctx->input.len) 2744 break; 2745 2746 err = extend_buffers (mctx); 2747 if (BE (err != REG_NOERROR, 0)) 2748 return err; 2749 2750 buf = (const char *) re_string_get_buffer (&mctx->input); 2751 } 2752 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2753 break; /* We don't need to search this sub expression 2754 any more. */ 2755 } 2756 if (mctx->state_log[sl_str] == NULL) 2757 continue; 2758 /* Does this state have a ')' of the sub expression? */ 2759 nodes = &mctx->state_log[sl_str]->nodes; 2760 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP); 2761 if (cls_node == -1) 2762 continue; /* No. */ 2763 if (sub_top->path == NULL) 2764 { 2765 sub_top->path = calloc (sizeof (state_array_t), 2766 sl_str - sub_top->str_idx + 1); 2767 if (sub_top->path == NULL) 2768 return REG_ESPACE; 2769 } 2770 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2771 in the current context? */ 2772 err = check_arrival (mctx, sub_top->path, sub_top->node, 2773 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP); 2774 if (err == REG_NOMATCH) 2775 continue; 2776 if (BE (err != REG_NOERROR, 0)) 2777 return err; 2778 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2779 if (BE (sub_last == NULL, 0)) 2780 return REG_ESPACE; 2781 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2782 bkref_str_idx); 2783 if (err == REG_NOMATCH) 2784 continue; 2785 } 2786 } 2787 return REG_NOERROR; 2788} 2789 2790/* Helper functions for get_subexp(). */ 2791 2792/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2793 If it can arrive, register the sub expression expressed with SUB_TOP 2794 and SUB_LAST. */ 2795 2796static reg_errcode_t 2797get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str) 2798 re_match_context_t *mctx; 2799 const re_sub_match_top_t *sub_top; 2800 re_sub_match_last_t *sub_last; 2801 int bkref_node, bkref_str; 2802{ 2803 reg_errcode_t err; 2804 int to_idx; 2805 /* Can the subexpression arrive the back reference? */ 2806 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2807 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP); 2808 if (err != REG_NOERROR) 2809 return err; 2810 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2811 sub_last->str_idx); 2812 if (BE (err != REG_NOERROR, 0)) 2813 return err; 2814 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2815 return clean_state_log_if_needed (mctx, to_idx); 2816} 2817 2818/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2819 Search '(' if FL_OPEN, or search ')' otherwise. 2820 TODO: This function isn't efficient... 2821 Because there might be more than one nodes whose types are 2822 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2823 nodes. 2824 E.g. RE: (a){2} */ 2825 2826static int 2827find_subexp_node (dfa, nodes, subexp_idx, type) 2828 const re_dfa_t *dfa; 2829 const re_node_set *nodes; 2830 int subexp_idx, type; 2831{ 2832 int cls_idx; 2833 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2834 { 2835 int cls_node = nodes->elems[cls_idx]; 2836 const re_token_t *node = dfa->nodes + cls_node; 2837 if (node->type == type 2838 && node->opr.idx == subexp_idx) 2839 return cls_node; 2840 } 2841 return -1; 2842} 2843 2844/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2845 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2846 heavily reused. 2847 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2848 2849static reg_errcode_t 2850check_arrival (mctx, path, top_node, top_str, last_node, last_str, 2851 type) 2852 re_match_context_t *mctx; 2853 state_array_t *path; 2854 int top_node, top_str, last_node, last_str, type; 2855{ 2856 re_dfa_t *const dfa = mctx->dfa; 2857 reg_errcode_t err; 2858 int subexp_num, backup_cur_idx, str_idx, null_cnt; 2859 re_dfastate_t *cur_state = NULL; 2860 re_node_set *cur_nodes, next_nodes; 2861 re_dfastate_t **backup_state_log; 2862 unsigned int context; 2863 2864 subexp_num = dfa->nodes[top_node].opr.idx; 2865 /* Extend the buffer if we need. */ 2866 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2867 { 2868 re_dfastate_t **new_array; 2869 int old_alloc = path->alloc; 2870 path->alloc += last_str + mctx->max_mb_elem_len + 1; 2871 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2872 if (new_array == NULL) 2873 { 2874 path->alloc = old_alloc; 2875 return REG_ESPACE; 2876 } 2877 path->array = new_array; 2878 memset (new_array + old_alloc, '\0', 2879 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2880 } 2881 2882 str_idx = path->next_idx == 0 ? top_str : path->next_idx; 2883 2884 /* Temporary modify MCTX. */ 2885 backup_state_log = mctx->state_log; 2886 backup_cur_idx = mctx->input.cur_idx; 2887 mctx->state_log = path->array; 2888 mctx->input.cur_idx = str_idx; 2889 2890 /* Setup initial node set. */ 2891 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2892 if (str_idx == top_str) 2893 { 2894 err = re_node_set_init_1 (&next_nodes, top_node); 2895 if (BE (err != REG_NOERROR, 0)) 2896 return err; 2897 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2898 if (BE (err != REG_NOERROR, 0)) 2899 { 2900 re_node_set_free (&next_nodes); 2901 return err; 2902 } 2903 } 2904 else 2905 { 2906 cur_state = mctx->state_log[str_idx]; 2907 if (cur_state && cur_state->has_backref) 2908 { 2909 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2910 if (BE ( err != REG_NOERROR, 0)) 2911 return err; 2912 } 2913 else 2914 re_node_set_init_empty (&next_nodes); 2915 } 2916 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2917 { 2918 if (next_nodes.nelem) 2919 { 2920 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2921 subexp_num, type); 2922 if (BE ( err != REG_NOERROR, 0)) 2923 { 2924 re_node_set_free (&next_nodes); 2925 return err; 2926 } 2927 } 2928 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2929 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2930 { 2931 re_node_set_free (&next_nodes); 2932 return err; 2933 } 2934 mctx->state_log[str_idx] = cur_state; 2935 } 2936 2937 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2938 { 2939 re_node_set_empty (&next_nodes); 2940 if (mctx->state_log[str_idx + 1]) 2941 { 2942 err = re_node_set_merge (&next_nodes, 2943 &mctx->state_log[str_idx + 1]->nodes); 2944 if (BE (err != REG_NOERROR, 0)) 2945 { 2946 re_node_set_free (&next_nodes); 2947 return err; 2948 } 2949 } 2950 if (cur_state) 2951 { 2952 err = check_arrival_add_next_nodes (mctx, str_idx, 2953 &cur_state->non_eps_nodes, &next_nodes); 2954 if (BE (err != REG_NOERROR, 0)) 2955 { 2956 re_node_set_free (&next_nodes); 2957 return err; 2958 } 2959 } 2960 ++str_idx; 2961 if (next_nodes.nelem) 2962 { 2963 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2964 if (BE (err != REG_NOERROR, 0)) 2965 { 2966 re_node_set_free (&next_nodes); 2967 return err; 2968 } 2969 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2970 subexp_num, type); 2971 if (BE ( err != REG_NOERROR, 0)) 2972 { 2973 re_node_set_free (&next_nodes); 2974 return err; 2975 } 2976 } 2977 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2978 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2979 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2980 { 2981 re_node_set_free (&next_nodes); 2982 return err; 2983 } 2984 mctx->state_log[str_idx] = cur_state; 2985 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 2986 } 2987 re_node_set_free (&next_nodes); 2988 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 2989 : &mctx->state_log[last_str]->nodes); 2990 path->next_idx = str_idx; 2991 2992 /* Fix MCTX. */ 2993 mctx->state_log = backup_state_log; 2994 mctx->input.cur_idx = backup_cur_idx; 2995 2996 /* Then check the current node set has the node LAST_NODE. */ 2997 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 2998 return REG_NOERROR; 2999 3000 return REG_NOMATCH; 3001} 3002 3003/* Helper functions for check_arrival. */ 3004 3005/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3006 to NEXT_NODES. 3007 TODO: This function is similar to the functions transit_state*(), 3008 however this function has many additional works. 3009 Can't we unify them? */ 3010 3011static reg_errcode_t 3012check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes) 3013 re_match_context_t *mctx; 3014 int str_idx; 3015 re_node_set *cur_nodes, *next_nodes; 3016{ 3017 re_dfa_t *const dfa = mctx->dfa; 3018 int result; 3019 int cur_idx; 3020 reg_errcode_t err; 3021 re_node_set union_set; 3022 re_node_set_init_empty (&union_set); 3023 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3024 { 3025 int naccepted = 0; 3026 int cur_node = cur_nodes->elems[cur_idx]; 3027#ifdef DEBUG 3028 re_token_type_t type = dfa->nodes[cur_node].type; 3029 assert (!IS_EPSILON_NODE (type)); 3030#endif 3031#ifdef RE_ENABLE_I18N 3032 /* If the node may accept `multi byte'. */ 3033 if (dfa->nodes[cur_node].accept_mb) 3034 { 3035 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3036 str_idx); 3037 if (naccepted > 1) 3038 { 3039 re_dfastate_t *dest_state; 3040 int next_node = dfa->nexts[cur_node]; 3041 int next_idx = str_idx + naccepted; 3042 dest_state = mctx->state_log[next_idx]; 3043 re_node_set_empty (&union_set); 3044 if (dest_state) 3045 { 3046 err = re_node_set_merge (&union_set, &dest_state->nodes); 3047 if (BE (err != REG_NOERROR, 0)) 3048 { 3049 re_node_set_free (&union_set); 3050 return err; 3051 } 3052 } 3053 result = re_node_set_insert (&union_set, next_node); 3054 if (BE (result < 0, 0)) 3055 { 3056 re_node_set_free (&union_set); 3057 return REG_ESPACE; 3058 } 3059 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3060 &union_set); 3061 if (BE (mctx->state_log[next_idx] == NULL 3062 && err != REG_NOERROR, 0)) 3063 { 3064 re_node_set_free (&union_set); 3065 return err; 3066 } 3067 } 3068 } 3069#endif /* RE_ENABLE_I18N */ 3070 if (naccepted 3071 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3072 { 3073 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3074 if (BE (result < 0, 0)) 3075 { 3076 re_node_set_free (&union_set); 3077 return REG_ESPACE; 3078 } 3079 } 3080 } 3081 re_node_set_free (&union_set); 3082 return REG_NOERROR; 3083} 3084 3085/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3086 CUR_NODES, however exclude the nodes which are: 3087 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3088 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3089*/ 3090 3091static reg_errcode_t 3092check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type) 3093 re_dfa_t *dfa; 3094 re_node_set *cur_nodes; 3095 int ex_subexp, type; 3096{ 3097 reg_errcode_t err; 3098 int idx, outside_node; 3099 re_node_set new_nodes; 3100#ifdef DEBUG 3101 assert (cur_nodes->nelem); 3102#endif 3103 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3104 if (BE (err != REG_NOERROR, 0)) 3105 return err; 3106 /* Create a new node set NEW_NODES with the nodes which are epsilon 3107 closures of the node in CUR_NODES. */ 3108 3109 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3110 { 3111 int cur_node = cur_nodes->elems[idx]; 3112 re_node_set *eclosure = dfa->eclosures + cur_node; 3113 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3114 if (outside_node == -1) 3115 { 3116 /* There are no problematic nodes, just merge them. */ 3117 err = re_node_set_merge (&new_nodes, eclosure); 3118 if (BE (err != REG_NOERROR, 0)) 3119 { 3120 re_node_set_free (&new_nodes); 3121 return err; 3122 } 3123 } 3124 else 3125 { 3126 /* There are problematic nodes, re-calculate incrementally. */ 3127 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3128 ex_subexp, type); 3129 if (BE (err != REG_NOERROR, 0)) 3130 { 3131 re_node_set_free (&new_nodes); 3132 return err; 3133 } 3134 } 3135 } 3136 re_node_set_free (cur_nodes); 3137 *cur_nodes = new_nodes; 3138 return REG_NOERROR; 3139} 3140 3141/* Helper function for check_arrival_expand_ecl. 3142 Check incrementally the epsilon closure of TARGET, and if it isn't 3143 problematic append it to DST_NODES. */ 3144 3145static reg_errcode_t 3146check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type) 3147 re_dfa_t *dfa; 3148 int target, ex_subexp, type; 3149 re_node_set *dst_nodes; 3150{ 3151 int cur_node; 3152 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3153 { 3154 int err; 3155 3156 if (dfa->nodes[cur_node].type == type 3157 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3158 { 3159 if (type == OP_CLOSE_SUBEXP) 3160 { 3161 err = re_node_set_insert (dst_nodes, cur_node); 3162 if (BE (err == -1, 0)) 3163 return REG_ESPACE; 3164 } 3165 break; 3166 } 3167 err = re_node_set_insert (dst_nodes, cur_node); 3168 if (BE (err == -1, 0)) 3169 return REG_ESPACE; 3170 if (dfa->edests[cur_node].nelem == 0) 3171 break; 3172 if (dfa->edests[cur_node].nelem == 2) 3173 { 3174 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3175 dfa->edests[cur_node].elems[1], 3176 ex_subexp, type); 3177 if (BE (err != REG_NOERROR, 0)) 3178 return err; 3179 } 3180 cur_node = dfa->edests[cur_node].elems[0]; 3181 } 3182 return REG_NOERROR; 3183} 3184 3185 3186/* For all the back references in the current state, calculate the 3187 destination of the back references by the appropriate entry 3188 in MCTX->BKREF_ENTS. */ 3189 3190static reg_errcode_t 3191expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num, 3192 type) 3193 re_match_context_t *mctx; 3194 int cur_str, subexp_num, type; 3195 re_node_set *cur_nodes; 3196{ 3197 re_dfa_t *const dfa = mctx->dfa; 3198 reg_errcode_t err; 3199 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3200 struct re_backref_cache_entry *ent; 3201 3202 if (cache_idx_start == -1) 3203 return REG_NOERROR; 3204 3205 restart: 3206 ent = mctx->bkref_ents + cache_idx_start; 3207 do 3208 { 3209 int to_idx, next_node; 3210 3211 /* Is this entry ENT is appropriate? */ 3212 if (!re_node_set_contains (cur_nodes, ent->node)) 3213 continue; /* No. */ 3214 3215 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3216 /* Calculate the destination of the back reference, and append it 3217 to MCTX->STATE_LOG. */ 3218 if (to_idx == cur_str) 3219 { 3220 /* The backreference did epsilon transit, we must re-check all the 3221 node in the current state. */ 3222 re_node_set new_dests; 3223 reg_errcode_t err2, err3; 3224 next_node = dfa->edests[ent->node].elems[0]; 3225 if (re_node_set_contains (cur_nodes, next_node)) 3226 continue; 3227 err = re_node_set_init_1 (&new_dests, next_node); 3228 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3229 err3 = re_node_set_merge (cur_nodes, &new_dests); 3230 re_node_set_free (&new_dests); 3231 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3232 || err3 != REG_NOERROR, 0)) 3233 { 3234 err = (err != REG_NOERROR ? err 3235 : (err2 != REG_NOERROR ? err2 : err3)); 3236 return err; 3237 } 3238 /* TODO: It is still inefficient... */ 3239 goto restart; 3240 } 3241 else 3242 { 3243 re_node_set union_set; 3244 next_node = dfa->nexts[ent->node]; 3245 if (mctx->state_log[to_idx]) 3246 { 3247 int ret; 3248 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3249 next_node)) 3250 continue; 3251 err = re_node_set_init_copy (&union_set, 3252 &mctx->state_log[to_idx]->nodes); 3253 ret = re_node_set_insert (&union_set, next_node); 3254 if (BE (err != REG_NOERROR || ret < 0, 0)) 3255 { 3256 re_node_set_free (&union_set); 3257 err = err != REG_NOERROR ? err : REG_ESPACE; 3258 return err; 3259 } 3260 } 3261 else 3262 { 3263 err = re_node_set_init_1 (&union_set, next_node); 3264 if (BE (err != REG_NOERROR, 0)) 3265 return err; 3266 } 3267 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3268 re_node_set_free (&union_set); 3269 if (BE (mctx->state_log[to_idx] == NULL 3270 && err != REG_NOERROR, 0)) 3271 return err; 3272 } 3273 } 3274 while (ent++->more); 3275 return REG_NOERROR; 3276} 3277 3278/* Build transition table for the state. 3279 Return 1 if succeeded, otherwise return NULL. */ 3280 3281static int 3282build_trtable (dfa, state) 3283 re_dfa_t *dfa; 3284 re_dfastate_t *state; 3285{ 3286 reg_errcode_t err; 3287 int i, j, ch, need_word_trtable = 0; 3288 unsigned int elem, mask; 3289 int dests_node_malloced = 0, dest_states_malloced = 0; 3290 int ndests; /* Number of the destination states from `state'. */ 3291 re_dfastate_t **trtable; 3292 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3293 re_node_set follows, *dests_node; 3294 bitset *dests_ch; 3295 bitset acceptable; 3296 3297 /* We build DFA states which corresponds to the destination nodes 3298 from `state'. `dests_node[i]' represents the nodes which i-th 3299 destination state contains, and `dests_ch[i]' represents the 3300 characters which i-th destination state accepts. */ 3301#ifdef _LIBC 3302 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) 3303 dests_node = (re_node_set *) 3304 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3305 else 3306#endif 3307 { 3308 dests_node = (re_node_set *) 3309 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3310 if (BE (dests_node == NULL, 0)) 3311 return 0; 3312 dests_node_malloced = 1; 3313 } 3314 dests_ch = (bitset *) (dests_node + SBC_MAX); 3315 3316 /* Initialize transiton table. */ 3317 state->word_trtable = state->trtable = NULL; 3318 3319 /* At first, group all nodes belonging to `state' into several 3320 destinations. */ 3321 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3322 if (BE (ndests <= 0, 0)) 3323 { 3324 if (dests_node_malloced) 3325 free (dests_node); 3326 /* Return 0 in case of an error, 1 otherwise. */ 3327 if (ndests == 0) 3328 { 3329 state->trtable = (re_dfastate_t **) 3330 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3331 return 1; 3332 } 3333 return 0; 3334 } 3335 3336 err = re_node_set_alloc (&follows, ndests + 1); 3337 if (BE (err != REG_NOERROR, 0)) 3338 goto out_free; 3339 3340#ifdef _LIBC 3341 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX 3342 + ndests * 3 * sizeof (re_dfastate_t *))) 3343 dest_states = (re_dfastate_t **) 3344 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3345 else 3346#endif 3347 { 3348 dest_states = (re_dfastate_t **) 3349 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3350 if (BE (dest_states == NULL, 0)) 3351 { 3352out_free: 3353 if (dest_states_malloced) 3354 free (dest_states); 3355 re_node_set_free (&follows); 3356 for (i = 0; i < ndests; ++i) 3357 re_node_set_free (dests_node + i); 3358 if (dests_node_malloced) 3359 free (dests_node); 3360 return 0; 3361 } 3362 dest_states_malloced = 1; 3363 } 3364 dest_states_word = dest_states + ndests; 3365 dest_states_nl = dest_states_word + ndests; 3366 bitset_empty (acceptable); 3367 3368 /* Then build the states for all destinations. */ 3369 for (i = 0; i < ndests; ++i) 3370 { 3371 int next_node; 3372 re_node_set_empty (&follows); 3373 /* Merge the follows of this destination states. */ 3374 for (j = 0; j < dests_node[i].nelem; ++j) 3375 { 3376 next_node = dfa->nexts[dests_node[i].elems[j]]; 3377 if (next_node != -1) 3378 { 3379 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3380 if (BE (err != REG_NOERROR, 0)) 3381 goto out_free; 3382 } 3383 } 3384 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3385 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3386 goto out_free; 3387 /* If the new state has context constraint, 3388 build appropriate states for these contexts. */ 3389 if (dest_states[i]->has_constraint) 3390 { 3391 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3392 CONTEXT_WORD); 3393 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3394 goto out_free; 3395 3396 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3397 need_word_trtable = 1; 3398 3399 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3400 CONTEXT_NEWLINE); 3401 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3402 goto out_free; 3403 } 3404 else 3405 { 3406 dest_states_word[i] = dest_states[i]; 3407 dest_states_nl[i] = dest_states[i]; 3408 } 3409 bitset_merge (acceptable, dests_ch[i]); 3410 } 3411 3412 if (!BE (need_word_trtable, 0)) 3413 { 3414 /* We don't care about whether the following character is a word 3415 character, or we are in a single-byte character set so we can 3416 discern by looking at the character code: allocate a 3417 256-entry transition table. */ 3418 trtable = state->trtable = 3419 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3420 if (BE (trtable == NULL, 0)) 3421 goto out_free; 3422 3423 /* For all characters ch...: */ 3424 for (i = 0; i < BITSET_UINTS; ++i) 3425 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; 3426 elem; 3427 mask <<= 1, elem >>= 1, ++ch) 3428 if (BE (elem & 1, 0)) 3429 { 3430 /* There must be exactly one destination which accepts 3431 character ch. See group_nodes_into_DFAstates. */ 3432 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3433 ; 3434 3435 /* j-th destination accepts the word character ch. */ 3436 if (dfa->word_char[i] & mask) 3437 trtable[ch] = dest_states_word[j]; 3438 else 3439 trtable[ch] = dest_states[j]; 3440 } 3441 } 3442 else 3443 { 3444 /* We care about whether the following character is a word 3445 character, and we are in a multi-byte character set: discern 3446 by looking at the character code: build two 256-entry 3447 transition tables, one starting at trtable[0] and one 3448 starting at trtable[SBC_MAX]. */ 3449 trtable = state->word_trtable = 3450 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3451 if (BE (trtable == NULL, 0)) 3452 goto out_free; 3453 3454 /* For all characters ch...: */ 3455 for (i = 0; i < BITSET_UINTS; ++i) 3456 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1; 3457 elem; 3458 mask <<= 1, elem >>= 1, ++ch) 3459 if (BE (elem & 1, 0)) 3460 { 3461 /* There must be exactly one destination which accepts 3462 character ch. See group_nodes_into_DFAstates. */ 3463 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3464 ; 3465 3466 /* j-th destination accepts the word character ch. */ 3467 trtable[ch] = dest_states[j]; 3468 trtable[ch + SBC_MAX] = dest_states_word[j]; 3469 } 3470 } 3471 3472 /* new line */ 3473 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3474 { 3475 /* The current state accepts newline character. */ 3476 for (j = 0; j < ndests; ++j) 3477 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3478 { 3479 /* k-th destination accepts newline character. */ 3480 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3481 if (need_word_trtable) 3482 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3483 /* There must be only one destination which accepts 3484 newline. See group_nodes_into_DFAstates. */ 3485 break; 3486 } 3487 } 3488 3489 if (dest_states_malloced) 3490 free (dest_states); 3491 3492 re_node_set_free (&follows); 3493 for (i = 0; i < ndests; ++i) 3494 re_node_set_free (dests_node + i); 3495 3496 if (dests_node_malloced) 3497 free (dests_node); 3498 3499 return 1; 3500} 3501 3502/* Group all nodes belonging to STATE into several destinations. 3503 Then for all destinations, set the nodes belonging to the destination 3504 to DESTS_NODE[i] and set the characters accepted by the destination 3505 to DEST_CH[i]. This function return the number of destinations. */ 3506 3507static int 3508group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) 3509 re_dfa_t *dfa; 3510 const re_dfastate_t *state; 3511 re_node_set *dests_node; 3512 bitset *dests_ch; 3513{ 3514 reg_errcode_t err; 3515 int result; 3516 int i, j, k; 3517 int ndests; /* Number of the destinations from `state'. */ 3518 bitset accepts; /* Characters a node can accept. */ 3519 const re_node_set *cur_nodes = &state->nodes; 3520 bitset_empty (accepts); 3521 ndests = 0; 3522 3523 /* For all the nodes belonging to `state', */ 3524 for (i = 0; i < cur_nodes->nelem; ++i) 3525 { 3526 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3527 re_token_type_t type = node->type; 3528 unsigned int constraint = node->constraint; 3529 3530 /* Enumerate all single byte character this node can accept. */ 3531 if (type == CHARACTER) 3532 bitset_set (accepts, node->opr.c); 3533 else if (type == SIMPLE_BRACKET) 3534 { 3535 bitset_merge (accepts, node->opr.sbcset); 3536 } 3537 else if (type == OP_PERIOD) 3538 { 3539#ifdef RE_ENABLE_I18N 3540 if (dfa->mb_cur_max > 1) 3541 bitset_merge (accepts, dfa->sb_char); 3542 else 3543#endif 3544 bitset_set_all (accepts); 3545 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3546 bitset_clear (accepts, '\n'); 3547 if (dfa->syntax & RE_DOT_NOT_NULL) 3548 bitset_clear (accepts, '\0'); 3549 } 3550#ifdef RE_ENABLE_I18N 3551 else if (type == OP_UTF8_PERIOD) 3552 { 3553 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2); 3554 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3555 bitset_clear (accepts, '\n'); 3556 if (dfa->syntax & RE_DOT_NOT_NULL) 3557 bitset_clear (accepts, '\0'); 3558 } 3559#endif 3560 else 3561 continue; 3562 3563 /* Check the `accepts' and sift the characters which are not 3564 match it the context. */ 3565 if (constraint) 3566 { 3567 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3568 { 3569 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3570 bitset_empty (accepts); 3571 if (accepts_newline) 3572 bitset_set (accepts, NEWLINE_CHAR); 3573 else 3574 continue; 3575 } 3576 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3577 { 3578 bitset_empty (accepts); 3579 continue; 3580 } 3581 3582 if (constraint & NEXT_WORD_CONSTRAINT) 3583 { 3584 unsigned int any_set = 0; 3585 if (type == CHARACTER && !node->word_char) 3586 { 3587 bitset_empty (accepts); 3588 continue; 3589 } 3590#ifdef RE_ENABLE_I18N 3591 if (dfa->mb_cur_max > 1) 3592 for (j = 0; j < BITSET_UINTS; ++j) 3593 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3594 else 3595#endif 3596 for (j = 0; j < BITSET_UINTS; ++j) 3597 any_set |= (accepts[j] &= dfa->word_char[j]); 3598 if (!any_set) 3599 continue; 3600 } 3601 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3602 { 3603 unsigned int any_set = 0; 3604 if (type == CHARACTER && node->word_char) 3605 { 3606 bitset_empty (accepts); 3607 continue; 3608 } 3609#ifdef RE_ENABLE_I18N 3610 if (dfa->mb_cur_max > 1) 3611 for (j = 0; j < BITSET_UINTS; ++j) 3612 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3613 else 3614#endif 3615 for (j = 0; j < BITSET_UINTS; ++j) 3616 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3617 if (!any_set) 3618 continue; 3619 } 3620 } 3621 3622 /* Then divide `accepts' into DFA states, or create a new 3623 state. Above, we make sure that accepts is not empty. */ 3624 for (j = 0; j < ndests; ++j) 3625 { 3626 bitset intersec; /* Intersection sets, see below. */ 3627 bitset remains; 3628 /* Flags, see below. */ 3629 int has_intersec, not_subset, not_consumed; 3630 3631 /* Optimization, skip if this state doesn't accept the character. */ 3632 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3633 continue; 3634 3635 /* Enumerate the intersection set of this state and `accepts'. */ 3636 has_intersec = 0; 3637 for (k = 0; k < BITSET_UINTS; ++k) 3638 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3639 /* And skip if the intersection set is empty. */ 3640 if (!has_intersec) 3641 continue; 3642 3643 /* Then check if this state is a subset of `accepts'. */ 3644 not_subset = not_consumed = 0; 3645 for (k = 0; k < BITSET_UINTS; ++k) 3646 { 3647 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3648 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3649 } 3650 3651 /* If this state isn't a subset of `accepts', create a 3652 new group state, which has the `remains'. */ 3653 if (not_subset) 3654 { 3655 bitset_copy (dests_ch[ndests], remains); 3656 bitset_copy (dests_ch[j], intersec); 3657 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3658 if (BE (err != REG_NOERROR, 0)) 3659 goto error_return; 3660 ++ndests; 3661 } 3662 3663 /* Put the position in the current group. */ 3664 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3665 if (BE (result < 0, 0)) 3666 goto error_return; 3667 3668 /* If all characters are consumed, go to next node. */ 3669 if (!not_consumed) 3670 break; 3671 } 3672 /* Some characters remain, create a new group. */ 3673 if (j == ndests) 3674 { 3675 bitset_copy (dests_ch[ndests], accepts); 3676 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3677 if (BE (err != REG_NOERROR, 0)) 3678 goto error_return; 3679 ++ndests; 3680 bitset_empty (accepts); 3681 } 3682 } 3683 return ndests; 3684 error_return: 3685 for (j = 0; j < ndests; ++j) 3686 re_node_set_free (dests_node + j); 3687 return -1; 3688} 3689 3690#ifdef RE_ENABLE_I18N 3691/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3692 Return the number of the bytes the node accepts. 3693 STR_IDX is the current index of the input string. 3694 3695 This function handles the nodes which can accept one character, or 3696 one collating element like '.', '[a-z]', opposite to the other nodes 3697 can only accept one byte. */ 3698 3699static int 3700check_node_accept_bytes (dfa, node_idx, input, str_idx) 3701 re_dfa_t *dfa; 3702 int node_idx, str_idx; 3703 const re_string_t *input; 3704{ 3705 const re_token_t *node = dfa->nodes + node_idx; 3706 int char_len, elem_len; 3707 int i; 3708 3709 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3710 { 3711 unsigned char c = re_string_byte_at (input, str_idx), d; 3712 if (BE (c < 0xc2, 1)) 3713 return 0; 3714 3715 if (str_idx + 2 > input->len) 3716 return 0; 3717 3718 d = re_string_byte_at (input, str_idx + 1); 3719 if (c < 0xe0) 3720 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3721 else if (c < 0xf0) 3722 { 3723 char_len = 3; 3724 if (c == 0xe0 && d < 0xa0) 3725 return 0; 3726 } 3727 else if (c < 0xf8) 3728 { 3729 char_len = 4; 3730 if (c == 0xf0 && d < 0x90) 3731 return 0; 3732 } 3733 else if (c < 0xfc) 3734 { 3735 char_len = 5; 3736 if (c == 0xf8 && d < 0x88) 3737 return 0; 3738 } 3739 else if (c < 0xfe) 3740 { 3741 char_len = 6; 3742 if (c == 0xfc && d < 0x84) 3743 return 0; 3744 } 3745 else 3746 return 0; 3747 3748 if (str_idx + char_len > input->len) 3749 return 0; 3750 3751 for (i = 1; i < char_len; ++i) 3752 { 3753 d = re_string_byte_at (input, str_idx + i); 3754 if (d < 0x80 || d > 0xbf) 3755 return 0; 3756 } 3757 return char_len; 3758 } 3759 3760 char_len = re_string_char_size_at (input, str_idx); 3761 if (node->type == OP_PERIOD) 3762 { 3763 if (char_len <= 1) 3764 return 0; 3765 /* FIXME: I don't think this if is needed, as both '\n' 3766 and '\0' are char_len == 1. */ 3767 /* '.' accepts any one character except the following two cases. */ 3768 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3769 re_string_byte_at (input, str_idx) == '\n') || 3770 ((dfa->syntax & RE_DOT_NOT_NULL) && 3771 re_string_byte_at (input, str_idx) == '\0')) 3772 return 0; 3773 return char_len; 3774 } 3775 3776 elem_len = re_string_elem_size_at (input, str_idx); 3777 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3778 return 0; 3779 3780 if (node->type == COMPLEX_BRACKET) 3781 { 3782 const re_charset_t *cset = node->opr.mbcset; 3783# ifdef _LIBC 3784 const unsigned char *pin 3785 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3786 int j; 3787 uint32_t nrules; 3788# endif /* _LIBC */ 3789 int match_len = 0; 3790 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3791 ? re_string_wchar_at (input, str_idx) : 0); 3792 3793 /* match with multibyte character? */ 3794 for (i = 0; i < cset->nmbchars; ++i) 3795 if (wc == cset->mbchars[i]) 3796 { 3797 match_len = char_len; 3798 goto check_node_accept_bytes_match; 3799 } 3800 /* match with character_class? */ 3801 for (i = 0; i < cset->nchar_classes; ++i) 3802 { 3803 wctype_t wt = cset->char_classes[i]; 3804 if (__iswctype (wc, wt)) 3805 { 3806 match_len = char_len; 3807 goto check_node_accept_bytes_match; 3808 } 3809 } 3810 3811# ifdef _LIBC 3812 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3813 if (nrules != 0) 3814 { 3815 unsigned int in_collseq = 0; 3816 const int32_t *table, *indirect; 3817 const unsigned char *weights, *extra; 3818 const char *collseqwc; 3819 int32_t idx; 3820 /* This #include defines a local function! */ 3821# include <locale/weight.h> 3822 3823 /* match with collating_symbol? */ 3824 if (cset->ncoll_syms) 3825 extra = (const unsigned char *) 3826 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3827 for (i = 0; i < cset->ncoll_syms; ++i) 3828 { 3829 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3830 /* Compare the length of input collating element and 3831 the length of current collating element. */ 3832 if (*coll_sym != elem_len) 3833 continue; 3834 /* Compare each bytes. */ 3835 for (j = 0; j < *coll_sym; j++) 3836 if (pin[j] != coll_sym[1 + j]) 3837 break; 3838 if (j == *coll_sym) 3839 { 3840 /* Match if every bytes is equal. */ 3841 match_len = j; 3842 goto check_node_accept_bytes_match; 3843 } 3844 } 3845 3846 if (cset->nranges) 3847 { 3848 if (elem_len <= char_len) 3849 { 3850 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3851 in_collseq = __collseq_table_lookup (collseqwc, wc); 3852 } 3853 else 3854 in_collseq = find_collation_sequence_value (pin, elem_len); 3855 } 3856 /* match with range expression? */ 3857 for (i = 0; i < cset->nranges; ++i) 3858 if (cset->range_starts[i] <= in_collseq 3859 && in_collseq <= cset->range_ends[i]) 3860 { 3861 match_len = elem_len; 3862 goto check_node_accept_bytes_match; 3863 } 3864 3865 /* match with equivalence_class? */ 3866 if (cset->nequiv_classes) 3867 { 3868 const unsigned char *cp = pin; 3869 table = (const int32_t *) 3870 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3871 weights = (const unsigned char *) 3872 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3873 extra = (const unsigned char *) 3874 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3875 indirect = (const int32_t *) 3876 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3877 idx = findidx (&cp); 3878 if (idx > 0) 3879 for (i = 0; i < cset->nequiv_classes; ++i) 3880 { 3881 int32_t equiv_class_idx = cset->equiv_classes[i]; 3882 size_t weight_len = weights[idx]; 3883 if (weight_len == weights[equiv_class_idx]) 3884 { 3885 int cnt = 0; 3886 while (cnt <= weight_len 3887 && (weights[equiv_class_idx + 1 + cnt] 3888 == weights[idx + 1 + cnt])) 3889 ++cnt; 3890 if (cnt > weight_len) 3891 { 3892 match_len = elem_len; 3893 goto check_node_accept_bytes_match; 3894 } 3895 } 3896 } 3897 } 3898 } 3899 else 3900# endif /* _LIBC */ 3901 { 3902 /* match with range expression? */ 3903#if __GNUC__ >= 2 3904 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3905#else 3906 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3907 cmp_buf[2] = wc; 3908#endif 3909 for (i = 0; i < cset->nranges; ++i) 3910 { 3911 cmp_buf[0] = cset->range_starts[i]; 3912 cmp_buf[4] = cset->range_ends[i]; 3913 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3914 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3915 { 3916 match_len = char_len; 3917 goto check_node_accept_bytes_match; 3918 } 3919 } 3920 } 3921 check_node_accept_bytes_match: 3922 if (!cset->non_match) 3923 return match_len; 3924 else 3925 { 3926 if (match_len > 0) 3927 return 0; 3928 else 3929 return (elem_len > char_len) ? elem_len : char_len; 3930 } 3931 } 3932 return 0; 3933} 3934 3935# ifdef _LIBC 3936static unsigned int 3937find_collation_sequence_value (mbs, mbs_len) 3938 const unsigned char *mbs; 3939 size_t mbs_len; 3940{ 3941 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3942 if (nrules == 0) 3943 { 3944 if (mbs_len == 1) 3945 { 3946 /* No valid character. Match it as a single byte character. */ 3947 const unsigned char *collseq = (const unsigned char *) 3948 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3949 return collseq[mbs[0]]; 3950 } 3951 return UINT_MAX; 3952 } 3953 else 3954 { 3955 int32_t idx; 3956 const unsigned char *extra = (const unsigned char *) 3957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3958 int32_t extrasize = (const unsigned char *) 3959 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 3960 3961 for (idx = 0; idx < extrasize;) 3962 { 3963 int mbs_cnt, found = 0; 3964 int32_t elem_mbs_len; 3965 /* Skip the name of collating element name. */ 3966 idx = idx + extra[idx] + 1; 3967 elem_mbs_len = extra[idx++]; 3968 if (mbs_len == elem_mbs_len) 3969 { 3970 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 3971 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 3972 break; 3973 if (mbs_cnt == elem_mbs_len) 3974 /* Found the entry. */ 3975 found = 1; 3976 } 3977 /* Skip the byte sequence of the collating element. */ 3978 idx += elem_mbs_len; 3979 /* Adjust for the alignment. */ 3980 idx = (idx + 3) & ~3; 3981 /* Skip the collation sequence value. */ 3982 idx += sizeof (uint32_t); 3983 /* Skip the wide char sequence of the collating element. */ 3984 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 3985 /* If we found the entry, return the sequence value. */ 3986 if (found) 3987 return *(uint32_t *) (extra + idx); 3988 /* Skip the collation sequence value. */ 3989 idx += sizeof (uint32_t); 3990 } 3991 return UINT_MAX; 3992 } 3993} 3994# endif /* _LIBC */ 3995#endif /* RE_ENABLE_I18N */ 3996 3997/* Check whether the node accepts the byte which is IDX-th 3998 byte of the INPUT. */ 3999 4000static int 4001check_node_accept (mctx, node, idx) 4002 const re_match_context_t *mctx; 4003 const re_token_t *node; 4004 int idx; 4005{ 4006 unsigned char ch; 4007 ch = re_string_byte_at (&mctx->input, idx); 4008 switch (node->type) 4009 { 4010 case CHARACTER: 4011 if (node->opr.c != ch) 4012 return 0; 4013 break; 4014 4015 case SIMPLE_BRACKET: 4016 if (!bitset_contain (node->opr.sbcset, ch)) 4017 return 0; 4018 break; 4019 4020#ifdef RE_ENABLE_I18N 4021 case OP_UTF8_PERIOD: 4022 if (ch >= 0x80) 4023 return 0; 4024 /* FALLTHROUGH */ 4025#endif 4026 case OP_PERIOD: 4027 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4028 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4029 return 0; 4030 break; 4031 4032 default: 4033 return 0; 4034 } 4035 4036 if (node->constraint) 4037 { 4038 /* The node has constraints. Check whether the current context 4039 satisfies the constraints. */ 4040 unsigned int context = re_string_context_at (&mctx->input, idx, 4041 mctx->eflags); 4042 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4043 return 0; 4044 } 4045 4046 return 1; 4047} 4048 4049/* Extend the buffers, if the buffers have run out. */ 4050 4051static reg_errcode_t 4052extend_buffers (mctx) 4053 re_match_context_t *mctx; 4054{ 4055 reg_errcode_t ret; 4056 re_string_t *pstr = &mctx->input; 4057 4058 /* Double the lengthes of the buffers. */ 4059 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4060 if (BE (ret != REG_NOERROR, 0)) 4061 return ret; 4062 4063 if (mctx->state_log != NULL) 4064 { 4065 /* And double the length of state_log. */ 4066 /* XXX We have no indication of the size of this buffer. If this 4067 allocation fail we have no indication that the state_log array 4068 does not have the right size. */ 4069 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4070 pstr->bufs_len + 1); 4071 if (BE (new_array == NULL, 0)) 4072 return REG_ESPACE; 4073 mctx->state_log = new_array; 4074 } 4075 4076 /* Then reconstruct the buffers. */ 4077 if (pstr->icase) 4078 { 4079#ifdef RE_ENABLE_I18N 4080 if (pstr->mb_cur_max > 1) 4081 { 4082 ret = build_wcs_upper_buffer (pstr); 4083 if (BE (ret != REG_NOERROR, 0)) 4084 return ret; 4085 } 4086 else 4087#endif /* RE_ENABLE_I18N */ 4088 build_upper_buffer (pstr); 4089 } 4090 else 4091 { 4092#ifdef RE_ENABLE_I18N 4093 if (pstr->mb_cur_max > 1) 4094 build_wcs_buffer (pstr); 4095 else 4096#endif /* RE_ENABLE_I18N */ 4097 { 4098 if (pstr->trans != NULL) 4099 re_string_translate_buffer (pstr); 4100 } 4101 } 4102 return REG_NOERROR; 4103} 4104 4105 4106/* Functions for matching context. */ 4107 4108/* Initialize MCTX. */ 4109 4110static reg_errcode_t 4111match_ctx_init (mctx, eflags, n) 4112 re_match_context_t *mctx; 4113 int eflags, n; 4114{ 4115 mctx->eflags = eflags; 4116 mctx->match_last = -1; 4117 if (n > 0) 4118 { 4119 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4120 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4121 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4122 return REG_ESPACE; 4123 } 4124 /* Already zero-ed by the caller. 4125 else 4126 mctx->bkref_ents = NULL; 4127 mctx->nbkref_ents = 0; 4128 mctx->nsub_tops = 0; */ 4129 mctx->abkref_ents = n; 4130 mctx->max_mb_elem_len = 1; 4131 mctx->asub_tops = n; 4132 return REG_NOERROR; 4133} 4134 4135/* Clean the entries which depend on the current input in MCTX. 4136 This function must be invoked when the matcher changes the start index 4137 of the input, or changes the input string. */ 4138 4139static void 4140match_ctx_clean (mctx) 4141 re_match_context_t *mctx; 4142{ 4143 int st_idx; 4144 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4145 { 4146 int sl_idx; 4147 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4148 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4149 { 4150 re_sub_match_last_t *last = top->lasts[sl_idx]; 4151 re_free (last->path.array); 4152 re_free (last); 4153 } 4154 re_free (top->lasts); 4155 if (top->path) 4156 { 4157 re_free (top->path->array); 4158 re_free (top->path); 4159 } 4160 free (top); 4161 } 4162 4163 mctx->nsub_tops = 0; 4164 mctx->nbkref_ents = 0; 4165} 4166 4167/* Free all the memory associated with MCTX. */ 4168 4169static void 4170match_ctx_free (mctx) 4171 re_match_context_t *mctx; 4172{ 4173 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4174 match_ctx_clean (mctx); 4175 re_free (mctx->sub_tops); 4176 re_free (mctx->bkref_ents); 4177} 4178 4179/* Add a new backreference entry to MCTX. 4180 Note that we assume that caller never call this function with duplicate 4181 entry, and call with STR_IDX which isn't smaller than any existing entry. 4182*/ 4183 4184static reg_errcode_t 4185match_ctx_add_entry (mctx, node, str_idx, from, to) 4186 re_match_context_t *mctx; 4187 int node, str_idx, from, to; 4188{ 4189 if (mctx->nbkref_ents >= mctx->abkref_ents) 4190 { 4191 struct re_backref_cache_entry* new_entry; 4192 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4193 mctx->abkref_ents * 2); 4194 if (BE (new_entry == NULL, 0)) 4195 { 4196 re_free (mctx->bkref_ents); 4197 return REG_ESPACE; 4198 } 4199 mctx->bkref_ents = new_entry; 4200 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4201 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4202 mctx->abkref_ents *= 2; 4203 } 4204 if (mctx->nbkref_ents > 0 4205 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4206 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4207 4208 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4209 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4210 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4211 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4212 4213 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4214 If bit N is clear, means that this entry won't epsilon-transition to 4215 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4216 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4217 such node. 4218 4219 A backreference does not epsilon-transition unless it is empty, so set 4220 to all zeros if FROM != TO. */ 4221 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4222 = (from == to ? ~0 : 0); 4223 4224 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4225 if (mctx->max_mb_elem_len < to - from) 4226 mctx->max_mb_elem_len = to - from; 4227 return REG_NOERROR; 4228} 4229 4230/* Search for the first entry which has the same str_idx, or -1 if none is 4231 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4232 4233static int 4234search_cur_bkref_entry (mctx, str_idx) 4235 re_match_context_t *mctx; 4236 int str_idx; 4237{ 4238 int left, right, mid, last; 4239 last = right = mctx->nbkref_ents; 4240 for (left = 0; left < right;) 4241 { 4242 mid = (left + right) / 2; 4243 if (mctx->bkref_ents[mid].str_idx < str_idx) 4244 left = mid + 1; 4245 else 4246 right = mid; 4247 } 4248 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4249 return left; 4250 else 4251 return -1; 4252} 4253 4254/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4255 at STR_IDX. */ 4256 4257static reg_errcode_t 4258match_ctx_add_subtop (mctx, node, str_idx) 4259 re_match_context_t *mctx; 4260 int node, str_idx; 4261{ 4262#ifdef DEBUG 4263 assert (mctx->sub_tops != NULL); 4264 assert (mctx->asub_tops > 0); 4265#endif 4266 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4267 { 4268 int new_asub_tops = mctx->asub_tops * 2; 4269 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4270 re_sub_match_top_t *, 4271 new_asub_tops); 4272 if (BE (new_array == NULL, 0)) 4273 return REG_ESPACE; 4274 mctx->sub_tops = new_array; 4275 mctx->asub_tops = new_asub_tops; 4276 } 4277 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4278 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4279 return REG_ESPACE; 4280 mctx->sub_tops[mctx->nsub_tops]->node = node; 4281 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4282 return REG_NOERROR; 4283} 4284 4285/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4286 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4287 4288static re_sub_match_last_t * 4289match_ctx_add_sublast (subtop, node, str_idx) 4290 re_sub_match_top_t *subtop; 4291 int node, str_idx; 4292{ 4293 re_sub_match_last_t *new_entry; 4294 if (BE (subtop->nlasts == subtop->alasts, 0)) 4295 { 4296 int new_alasts = 2 * subtop->alasts + 1; 4297 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4298 re_sub_match_last_t *, 4299 new_alasts); 4300 if (BE (new_array == NULL, 0)) 4301 return NULL; 4302 subtop->lasts = new_array; 4303 subtop->alasts = new_alasts; 4304 } 4305 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4306 if (BE (new_entry != NULL, 1)) 4307 { 4308 subtop->lasts[subtop->nlasts] = new_entry; 4309 new_entry->node = node; 4310 new_entry->str_idx = str_idx; 4311 ++subtop->nlasts; 4312 } 4313 return new_entry; 4314} 4315 4316static void 4317sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx) 4318 re_sift_context_t *sctx; 4319 re_dfastate_t **sifted_sts, **limited_sts; 4320 int last_node, last_str_idx; 4321{ 4322 sctx->sifted_states = sifted_sts; 4323 sctx->limited_states = limited_sts; 4324 sctx->last_node = last_node; 4325 sctx->last_str_idx = last_str_idx; 4326 re_node_set_init_empty (&sctx->limits); 4327} 4328