1/* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21static reg_errcode_t match_ctx_init _RE_ARGS((re_match_context_t *cache, int eflags, 22 re_string_t *input, int n)); 23static void match_ctx_clean _RE_ARGS((re_match_context_t *mctx)); 24static void match_ctx_free _RE_ARGS((re_match_context_t *cache)); 25static void match_ctx_free_subtops _RE_ARGS((re_match_context_t *mctx)); 26static reg_errcode_t match_ctx_add_entry _RE_ARGS((re_match_context_t *cache, int node, 27 int str_idx, int from, int to)); 28static int search_cur_bkref_entry _RE_ARGS((re_match_context_t *mctx, int str_idx)); 29static void match_ctx_clear_flag _RE_ARGS((re_match_context_t *mctx)); 30static reg_errcode_t match_ctx_add_subtop _RE_ARGS((re_match_context_t *mctx, int node, 31 int str_idx)); 32static re_sub_match_last_t * match_ctx_add_sublast _RE_ARGS((re_sub_match_top_t *subtop, 33 int node, int str_idx)); 34static void sift_ctx_init _RE_ARGS((re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 35 re_dfastate_t **limited_sts, int last_node, 36 int last_str_idx, int check_subexp)); 37static reg_errcode_t re_search_internal _RE_ARGS((const regex_t *preg, 38 const char *string, int length, 39 int start, int range, int stop, 40 size_t nmatch, regmatch_t pmatch[], 41 int eflags)); 42static int re_search_2_stub _RE_ARGS((struct re_pattern_buffer *bufp, 43 const char *string1, int length1, 44 const char *string2, int length2, 45 int start, int range, struct re_registers *regs, 46 int stop, int ret_len)); 47static int re_search_stub _RE_ARGS((struct re_pattern_buffer *bufp, 48 const char *string, int length, int start, 49 int range, int stop, struct re_registers *regs, 50 int ret_len)); 51static unsigned re_copy_regs _RE_ARGS((struct re_registers *regs, regmatch_t *pmatch, 52 int nregs, int regs_allocated)); 53static inline re_dfastate_t *acquire_init_state_context _RE_ARGS((reg_errcode_t *err, 54 const regex_t *preg, 55 const re_match_context_t *mctx, 56 int idx)); 57static reg_errcode_t prune_impossible_nodes _RE_ARGS((const regex_t *preg, 58 re_match_context_t *mctx)); 59static int check_matching _RE_ARGS((const regex_t *preg, re_match_context_t *mctx, 60 int fl_search, int fl_longest_match)); 61static int check_halt_node_context _RE_ARGS((const re_dfa_t *dfa, int node, 62 unsigned int context)); 63static int check_halt_state_context _RE_ARGS((const regex_t *preg, 64 const re_dfastate_t *state, 65 const re_match_context_t *mctx, int idx)); 66static void update_regs _RE_ARGS((re_dfa_t *dfa, regmatch_t *pmatch, int cur_node, 67 int cur_idx, int nmatch)); 68static int proceed_next_node _RE_ARGS((const regex_t *preg, int nregs, regmatch_t *regs, 69 const re_match_context_t *mctx, 70 int *pidx, int node, re_node_set *eps_via_nodes, 71 struct re_fail_stack_t *fs)); 72static reg_errcode_t push_fail_stack _RE_ARGS((struct re_fail_stack_t *fs, 73 int str_idx, int *dests, int nregs, 74 regmatch_t *regs, 75 re_node_set *eps_via_nodes)); 76static int pop_fail_stack _RE_ARGS((struct re_fail_stack_t *fs, int *pidx, int nregs, 77 regmatch_t *regs, re_node_set *eps_via_nodes)); 78static reg_errcode_t set_regs _RE_ARGS((const regex_t *preg, 79 const re_match_context_t *mctx, 80 size_t nmatch, regmatch_t *pmatch, 81 int fl_backtrack)); 82static reg_errcode_t free_fail_stack_return _RE_ARGS((struct re_fail_stack_t *fs)); 83 84#ifdef RE_ENABLE_I18N 85static int sift_states_iter_mb _RE_ARGS((const regex_t *preg, 86 const re_match_context_t *mctx, 87 re_sift_context_t *sctx, 88 int node_idx, int str_idx, int max_str_idx)); 89#endif /* RE_ENABLE_I18N */ 90static reg_errcode_t sift_states_backward _RE_ARGS((const regex_t *preg, 91 re_match_context_t *mctx, 92 re_sift_context_t *sctx)); 93static reg_errcode_t update_cur_sifted_state _RE_ARGS((const regex_t *preg, 94 re_match_context_t *mctx, 95 re_sift_context_t *sctx, 96 int str_idx, 97 re_node_set *dest_nodes)); 98static reg_errcode_t add_epsilon_src_nodes _RE_ARGS((re_dfa_t *dfa, 99 re_node_set *dest_nodes, 100 const re_node_set *candidates)); 101static reg_errcode_t sub_epsilon_src_nodes _RE_ARGS((re_dfa_t *dfa, int node, 102 re_node_set *dest_nodes, 103 const re_node_set *and_nodes)); 104static int check_dst_limits _RE_ARGS((re_dfa_t *dfa, re_node_set *limits, 105 re_match_context_t *mctx, int dst_node, 106 int dst_idx, int src_node, int src_idx)); 107static int check_dst_limits_calc_pos _RE_ARGS((re_dfa_t *dfa, re_match_context_t *mctx, 108 int limit, re_node_set *eclosures, 109 int subexp_idx, int node, int str_idx)); 110static reg_errcode_t check_subexp_limits _RE_ARGS((re_dfa_t *dfa, 111 re_node_set *dest_nodes, 112 const re_node_set *candidates, 113 re_node_set *limits, 114 struct re_backref_cache_entry *bkref_ents, 115 int str_idx)); 116static reg_errcode_t sift_states_bkref _RE_ARGS((const regex_t *preg, 117 re_match_context_t *mctx, 118 re_sift_context_t *sctx, 119 int str_idx, re_node_set *dest_nodes)); 120static reg_errcode_t clean_state_log_if_need _RE_ARGS((re_match_context_t *mctx, 121 int next_state_log_idx)); 122static reg_errcode_t merge_state_array _RE_ARGS((re_dfa_t *dfa, re_dfastate_t **dst, 123 re_dfastate_t **src, int num)); 124static re_dfastate_t *transit_state _RE_ARGS((reg_errcode_t *err, const regex_t *preg, 125 re_match_context_t *mctx, 126 re_dfastate_t *state, int fl_search)); 127static reg_errcode_t check_subexp_matching_top _RE_ARGS((re_dfa_t *dfa, 128 re_match_context_t *mctx, 129 re_node_set *cur_nodes, 130 int str_idx)); 131static re_dfastate_t *transit_state_sb _RE_ARGS((reg_errcode_t *err, const regex_t *preg, 132 re_dfastate_t *pstate, 133 int fl_search, 134 re_match_context_t *mctx)); 135#ifdef RE_ENABLE_I18N 136static reg_errcode_t transit_state_mb _RE_ARGS((const regex_t *preg, 137 re_dfastate_t *pstate, 138 re_match_context_t *mctx)); 139#endif /* RE_ENABLE_I18N */ 140static reg_errcode_t transit_state_bkref _RE_ARGS((const regex_t *preg, 141 re_node_set *nodes, 142 re_match_context_t *mctx)); 143static reg_errcode_t get_subexp _RE_ARGS((const regex_t *preg, re_match_context_t *mctx, 144 int bkref_node, int bkref_str_idx)); 145static reg_errcode_t get_subexp_sub _RE_ARGS((const regex_t *preg, 146 re_match_context_t *mctx, 147 re_sub_match_top_t *sub_top, 148 re_sub_match_last_t *sub_last, 149 int bkref_node, int bkref_str)); 150static int find_subexp_node _RE_ARGS((re_dfa_t *dfa, re_node_set *nodes, 151 int subexp_idx, int fl_open)); 152static reg_errcode_t check_arrival _RE_ARGS((const regex_t *preg, 153 re_match_context_t *mctx, 154 state_array_t *path, int top_node, 155 int top_str, int last_node, int last_str, 156 int fl_open)); 157static reg_errcode_t check_arrival_add_next_nodes _RE_ARGS((const regex_t *preg, 158 re_dfa_t *dfa, 159 re_match_context_t *mctx, 160 int str_idx, 161 re_node_set *cur_nodes, 162 re_node_set *next_nodes)); 163static reg_errcode_t check_arrival_expand_ecl _RE_ARGS((re_dfa_t *dfa, 164 re_node_set *cur_nodes, 165 int ex_subexp, int fl_open)); 166static reg_errcode_t check_arrival_expand_ecl_sub _RE_ARGS((re_dfa_t *dfa, 167 re_node_set *dst_nodes, 168 int target, int ex_subexp, 169 int fl_open)); 170static reg_errcode_t expand_bkref_cache _RE_ARGS((const regex_t *preg, 171 re_match_context_t *mctx, 172 re_node_set *cur_nodes, int cur_str, 173 int last_str, int subexp_num, 174 int fl_open)); 175static re_dfastate_t **build_trtable _RE_ARGS((const regex_t *dfa, 176 const re_dfastate_t *state, 177 int fl_search)); 178#ifdef RE_ENABLE_I18N 179static int check_node_accept_bytes _RE_ARGS((const regex_t *preg, int node_idx, 180 const re_string_t *input, int idx)); 181# ifdef _LIBC 182static unsigned int find_collation_sequence_value _RE_ARGS((const unsigned char *mbs, 183 size_t name_len)); 184# endif /* _LIBC */ 185#endif /* RE_ENABLE_I18N */ 186static int group_nodes_into_DFAstates _RE_ARGS((const regex_t *dfa, 187 const re_dfastate_t *state, 188 re_node_set *states_node, 189 bitset *states_ch)); 190static int check_node_accept _RE_ARGS((const regex_t *preg, const re_token_t *node, 191 const re_match_context_t *mctx, int idx)); 192static reg_errcode_t extend_buffers _RE_ARGS((re_match_context_t *mctx)); 193 194/* Entry point for POSIX code. */ 195 196/* regexec searches for a given pattern, specified by PREG, in the 197 string STRING. 198 199 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 200 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 201 least NMATCH elements, and we set them to the offsets of the 202 corresponding matched substrings. 203 204 EFLAGS specifies `execution flags' which affect matching: if 205 REG_NOTBOL is set, then ^ does not match at the beginning of the 206 string; if REG_NOTEOL is set, then $ does not match at the end. 207 208 We return 0 if we find a match and REG_NOMATCH if not. */ 209 210int 211regexec (preg, string, nmatch, pmatch, eflags) 212 const regex_t *__restrict preg; 213 const char *__restrict string; 214 size_t nmatch; 215 regmatch_t pmatch[]; 216 int eflags; 217{ 218 reg_errcode_t err; 219 int length = strlen (string); 220 if (preg->no_sub) 221 err = re_search_internal (preg, string, length, 0, length, length, 0, 222 NULL, eflags); 223 else 224 err = re_search_internal (preg, string, length, 0, length, length, nmatch, 225 pmatch, eflags); 226 return err != REG_NOERROR; 227} 228#ifdef _LIBC 229weak_alias (__regexec, regexec) 230#endif 231 232/* Entry points for GNU code. */ 233 234/* re_match, re_search, re_match_2, re_search_2 235 236 The former two functions operate on STRING with length LENGTH, 237 while the later two operate on concatenation of STRING1 and STRING2 238 with lengths LENGTH1 and LENGTH2, respectively. 239 240 re_match() matches the compiled pattern in BUFP against the string, 241 starting at index START. 242 243 re_search() first tries matching at index START, then it tries to match 244 starting from index START + 1, and so on. The last start position tried 245 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 246 way as re_match().) 247 248 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 249 the first STOP characters of the concatenation of the strings should be 250 concerned. 251 252 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 253 and all groups is stroed in REGS. (For the "_2" variants, the offsets are 254 computed relative to the concatenation, not relative to the individual 255 strings.) 256 257 On success, re_match* functions return the length of the match, re_search* 258 return the position of the start of the match. Return value -1 means no 259 match was found and -2 indicates an internal error. */ 260 261int 262re_match (bufp, string, length, start, regs) 263 struct re_pattern_buffer *bufp; 264 const char *string; 265 int length, start; 266 struct re_registers *regs; 267{ 268 return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 269} 270#ifdef _LIBC 271weak_alias (__re_match, re_match) 272#endif 273 274int 275re_search (bufp, string, length, start, range, regs) 276 struct re_pattern_buffer *bufp; 277 const char *string; 278 int length, start, range; 279 struct re_registers *regs; 280{ 281 return re_search_stub (bufp, string, length, start, range, length, regs, 0); 282} 283#ifdef _LIBC 284weak_alias (__re_search, re_search) 285#endif 286 287int 288re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 289 struct re_pattern_buffer *bufp; 290 const char *string1, *string2; 291 int length1, length2, start, stop; 292 struct re_registers *regs; 293{ 294 return re_search_2_stub (bufp, string1, length1, string2, length2, 295 start, 0, regs, stop, 1); 296} 297#ifdef _LIBC 298weak_alias (__re_match_2, re_match_2) 299#endif 300 301int 302re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 303 struct re_pattern_buffer *bufp; 304 const char *string1, *string2; 305 int length1, length2, start, range, stop; 306 struct re_registers *regs; 307{ 308 return re_search_2_stub (bufp, string1, length1, string2, length2, 309 start, range, regs, stop, 0); 310} 311#ifdef _LIBC 312weak_alias (__re_search_2, re_search_2) 313#endif 314 315static int 316re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 317 stop, ret_len) 318 struct re_pattern_buffer *bufp; 319 const char *string1, *string2; 320 int length1, length2, start, range, stop, ret_len; 321 struct re_registers *regs; 322{ 323 const char *str; 324 int rval; 325 int len = length1 + length2; 326 int free_str = 0; 327 328 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 329 return -2; 330 331 /* Concatenate the strings. */ 332 if (length2 > 0) 333 if (length1 > 0) 334 { 335 char *s = re_malloc (char, len); 336 337 if (BE (s == NULL, 0)) 338 return -2; 339 memcpy (s, string1, length1); 340 memcpy (s + length1, string2, length2); 341 str = s; 342 free_str = 1; 343 } 344 else 345 str = string2; 346 else 347 str = string1; 348 349 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 350 ret_len); 351 if (free_str) 352 re_free ((char *) str); 353 return rval; 354} 355 356/* The parameters have the same meaning as those of re_search. 357 Additional parameters: 358 If RET_LEN is nonzero the length of the match is returned (re_match style); 359 otherwise the position of the match is returned. */ 360 361static int 362re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 363 struct re_pattern_buffer *bufp; 364 const char *string; 365 int length, start, range, stop, ret_len; 366 struct re_registers *regs; 367{ 368 reg_errcode_t result; 369 regmatch_t *pmatch; 370 int nregs, rval; 371 int eflags = 0; 372 373 /* Check for out-of-range. */ 374 if (BE (start < 0 || start > length, 0)) 375 return -1; 376 if (BE (start + range > length, 0)) 377 range = length - start; 378 else if (BE (start + range < 0, 0)) 379 range = -start; 380 381 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 382 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 383 384 /* Compile fastmap if we haven't yet. */ 385 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 386 re_compile_fastmap (bufp); 387 388 if (BE (bufp->no_sub, 0)) 389 regs = NULL; 390 391 /* We need at least 1 register. */ 392 if (regs == NULL) 393 nregs = 1; 394 else if (BE (bufp->regs_allocated == REGS_FIXED && 395 regs->num_regs < bufp->re_nsub + 1, 0)) 396 { 397 nregs = regs->num_regs; 398 if (BE (nregs < 1, 0)) 399 { 400 /* Nothing can be copied to regs. */ 401 regs = NULL; 402 nregs = 1; 403 } 404 } 405 else 406 nregs = bufp->re_nsub + 1; 407 pmatch = re_malloc (regmatch_t, nregs); 408 if (BE (pmatch == NULL, 0)) 409 return -2; 410 411 result = re_search_internal (bufp, string, length, start, range, stop, 412 nregs, pmatch, eflags); 413 414 rval = 0; 415 416 /* I hope we needn't fill ther regs with -1's when no match was found. */ 417 if (result != REG_NOERROR) 418 rval = -1; 419 else if (regs != NULL) 420 { 421 /* If caller wants register contents data back, copy them. */ 422 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 423 bufp->regs_allocated); 424 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 425 rval = -2; 426 } 427 428 if (BE (rval == 0, 1)) 429 { 430 if (ret_len) 431 { 432 assert (pmatch[0].rm_so == start); 433 rval = pmatch[0].rm_eo - start; 434 } 435 else 436 rval = pmatch[0].rm_so; 437 } 438 re_free (pmatch); 439 return rval; 440} 441 442static unsigned 443re_copy_regs (regs, pmatch, nregs, regs_allocated) 444 struct re_registers *regs; 445 regmatch_t *pmatch; 446 int nregs, regs_allocated; 447{ 448 int rval = REGS_REALLOCATE; 449 int i; 450 int need_regs = nregs + 1; 451 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 452 uses. */ 453 454 /* Have the register data arrays been allocated? */ 455 if (regs_allocated == REGS_UNALLOCATED) 456 { /* No. So allocate them with malloc. */ 457 regs->start = re_malloc (regoff_t, need_regs); 458 if (BE (regs->start == NULL, 0)) 459 return REGS_UNALLOCATED; 460 regs->end = re_malloc (regoff_t, need_regs); 461 if (BE (regs->end == NULL, 0)) 462 { 463 re_free (regs->start); 464 return REGS_UNALLOCATED; 465 } 466 regs->num_regs = need_regs; 467 } 468 else if (regs_allocated == REGS_REALLOCATE) 469 { /* Yes. If we need more elements than were already 470 allocated, reallocate them. If we need fewer, just 471 leave it alone. */ 472 if (need_regs > regs->num_regs) 473 { 474 regs->start = re_realloc (regs->start, regoff_t, need_regs); 475 if (BE (regs->start == NULL, 0)) 476 { 477 if (regs->end != NULL) 478 re_free (regs->end); 479 return REGS_UNALLOCATED; 480 } 481 regs->end = re_realloc (regs->end, regoff_t, need_regs); 482 if (BE (regs->end == NULL, 0)) 483 { 484 re_free (regs->start); 485 return REGS_UNALLOCATED; 486 } 487 regs->num_regs = need_regs; 488 } 489 } 490 else 491 { 492 assert (regs_allocated == REGS_FIXED); 493 /* This function may not be called with REGS_FIXED and nregs too big. */ 494 assert (regs->num_regs >= nregs); 495 rval = REGS_FIXED; 496 } 497 498 /* Copy the regs. */ 499 for (i = 0; i < nregs; ++i) 500 { 501 regs->start[i] = pmatch[i].rm_so; 502 regs->end[i] = pmatch[i].rm_eo; 503 } 504 for ( ; i < regs->num_regs; ++i) 505 regs->start[i] = regs->end[i] = -1; 506 507 return rval; 508} 509 510/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 511 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 512 this memory for recording register information. STARTS and ENDS 513 must be allocated using the malloc library routine, and must each 514 be at least NUM_REGS * sizeof (regoff_t) bytes long. 515 516 If NUM_REGS == 0, then subsequent matches should allocate their own 517 register data. 518 519 Unless this function is called, the first search or match using 520 PATTERN_BUFFER will allocate its own register data, without 521 freeing the old data. */ 522 523void 524re_set_registers (bufp, regs, num_regs, starts, ends) 525 struct re_pattern_buffer *bufp; 526 struct re_registers *regs; 527 unsigned num_regs; 528 regoff_t *starts, *ends; 529{ 530 if (num_regs) 531 { 532 bufp->regs_allocated = REGS_REALLOCATE; 533 regs->num_regs = num_regs; 534 regs->start = starts; 535 regs->end = ends; 536 } 537 else 538 { 539 bufp->regs_allocated = REGS_UNALLOCATED; 540 regs->num_regs = 0; 541 regs->start = regs->end = (regoff_t *) 0; 542 } 543} 544#ifdef _LIBC 545weak_alias (__re_set_registers, re_set_registers) 546#endif 547 548/* Entry points compatible with 4.2 BSD regex library. We don't define 549 them unless specifically requested. */ 550 551#if defined _REGEX_RE_COMP || defined _LIBC 552int 553# ifdef _LIBC 554weak_function 555# endif 556re_exec (s) 557 const char *s; 558{ 559 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 560} 561#endif /* _REGEX_RE_COMP */ 562 563static re_node_set empty_set; 564 565/* Internal entry point. */ 566 567/* Searches for a compiled pattern PREG in the string STRING, whose 568 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 569 mingings with regexec. START, and RANGE have the same meanings 570 with re_search. 571 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 572 otherwise return the error code. 573 Note: We assume front end functions already check ranges. 574 (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 575 576static reg_errcode_t 577re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 578 eflags) 579 const regex_t *preg; 580 const char *string; 581 int length, start, range, stop, eflags; 582 size_t nmatch; 583 regmatch_t pmatch[]; 584{ 585 reg_errcode_t err; 586 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 587 re_string_t input; 588 int left_lim, right_lim, incr; 589 int fl_longest_match, match_first, match_last = -1; 590 int fast_translate, sb; 591 re_match_context_t mctx; 592 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate 593 && range && !preg->can_be_null) ? preg->fastmap : NULL); 594 595 /* Check if the DFA haven't been compiled. */ 596 if (BE (preg->used == 0 || dfa->init_state == NULL 597 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 598 || dfa->init_state_begbuf == NULL, 0)) 599 return REG_NOMATCH; 600 601 re_node_set_init_empty (&empty_set); 602 memset (&mctx, '\0', sizeof (re_match_context_t)); 603 604 /* We must check the longest matching, if nmatch > 0. */ 605 fl_longest_match = (nmatch != 0 || dfa->nbackref); 606 607 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1, 608 preg->translate, preg->syntax & RE_ICASE); 609 if (BE (err != REG_NOERROR, 0)) 610 goto free_return; 611 input.stop = stop; 612 613 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2); 614 if (BE (err != REG_NOERROR, 0)) 615 goto free_return; 616 617 /* We will log all the DFA states through which the dfa pass, 618 if nmatch > 1, or this dfa has "multibyte node", which is a 619 back-reference or a node which can accept multibyte character or 620 multi character collating element. */ 621 if (nmatch > 1 || dfa->has_mb_node) 622 { 623 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1); 624 if (BE (mctx.state_log == NULL, 0)) 625 { 626 err = REG_ESPACE; 627 goto free_return; 628 } 629 } 630 else 631 mctx.state_log = NULL; 632 633#ifdef DEBUG 634 /* We assume front-end functions already check them. */ 635 assert (start + range >= 0 && start + range <= length); 636#endif 637 638 match_first = start; 639 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 640 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 641 642 /* Check incrementally whether of not the input string match. */ 643 incr = (range < 0) ? -1 : 1; 644 left_lim = (range < 0) ? start + range : start; 645 right_lim = (range < 0) ? start : start + range; 646#ifdef RE_ENABLE_I18N 647 sb = re_mb_cur_max == 1; 648#else 649 sb = 1; 650#endif 651 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate); 652 653 for (;;) 654 { 655 /* At first get the current byte from input string. */ 656 if (fastmap) 657 { 658 if (BE (fast_translate, 1)) 659 { 660 unsigned RE_TRANSLATE_TYPE t 661 = (unsigned RE_TRANSLATE_TYPE) preg->translate; 662 if (BE (range >= 0, 1)) 663 { 664 if (BE (t != NULL, 0)) 665 { 666 while (BE (match_first < right_lim, 1) 667 && !fastmap[t[(unsigned char) string[match_first]]]) 668 ++match_first; 669 } 670 else 671 { 672 while (BE (match_first < right_lim, 1) 673 && !fastmap[(unsigned char) string[match_first]]) 674 ++match_first; 675 } 676 if (BE (match_first == right_lim, 0)) 677 { 678 int ch = match_first >= length 679 ? 0 : (unsigned char) string[match_first]; 680 if (!fastmap[t ? t[ch] : ch]) 681 break; 682 } 683 } 684 else 685 { 686 while (match_first >= left_lim) 687 { 688 int ch = match_first >= length 689 ? 0 : (unsigned char) string[match_first]; 690 if (fastmap[t ? t[ch] : ch]) 691 break; 692 --match_first; 693 } 694 if (match_first < left_lim) 695 break; 696 } 697 } 698 else 699 { 700 int ch; 701 702 do 703 { 704 /* In this case, we can't determine easily the current byte, 705 since it might be a component byte of a multibyte 706 character. Then we use the constructed buffer 707 instead. */ 708 /* If MATCH_FIRST is out of the valid range, reconstruct the 709 buffers. */ 710 if (input.raw_mbs_idx + input.valid_len <= match_first 711 || match_first < input.raw_mbs_idx) 712 { 713 err = re_string_reconstruct (&input, match_first, eflags, 714 preg->newline_anchor); 715 if (BE (err != REG_NOERROR, 0)) 716 goto free_return; 717 } 718 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 719 Note that MATCH_FIRST must not be smaller than 0. */ 720 ch = ((match_first >= length) ? 0 721 : re_string_byte_at (&input, 722 match_first - input.raw_mbs_idx)); 723 if (fastmap[ch]) 724 break; 725 match_first += incr; 726 } 727 while (match_first >= left_lim && match_first <= right_lim); 728 if (! fastmap[ch]) 729 break; 730 } 731 } 732 733 /* Reconstruct the buffers so that the matcher can assume that 734 the matching starts from the begining of the buffer. */ 735 err = re_string_reconstruct (&input, match_first, eflags, 736 preg->newline_anchor); 737 if (BE (err != REG_NOERROR, 0)) 738 goto free_return; 739#ifdef RE_ENABLE_I18N 740 /* Eliminate it when it is a component of a multibyte character 741 and isn't the head of a multibyte character. */ 742 if (sb || re_string_first_byte (&input, 0)) 743#endif 744 { 745 /* It seems to be appropriate one, then use the matcher. */ 746 /* We assume that the matching starts from 0. */ 747 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 748 match_last = check_matching (preg, &mctx, 0, fl_longest_match); 749 if (match_last != -1) 750 { 751 if (BE (match_last == -2, 0)) 752 { 753 err = REG_ESPACE; 754 goto free_return; 755 } 756 else 757 { 758 mctx.match_last = match_last; 759 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 760 { 761 re_dfastate_t *pstate = mctx.state_log[match_last]; 762 mctx.last_node = check_halt_state_context (preg, pstate, 763 &mctx, match_last); 764 } 765 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 766 || dfa->nbackref) 767 { 768 err = prune_impossible_nodes (preg, &mctx); 769 if (err == REG_NOERROR) 770 break; 771 if (BE (err != REG_NOMATCH, 0)) 772 goto free_return; 773 } 774 else 775 break; /* We found a matching. */ 776 } 777 } 778 match_ctx_clean (&mctx); 779 } 780 /* Update counter. */ 781 match_first += incr; 782 if (match_first < left_lim || right_lim < match_first) 783 break; 784 } 785 786 /* Set pmatch[] if we need. */ 787 if (match_last != -1 && nmatch > 0) 788 { 789 int reg_idx; 790 791 /* Initialize registers. */ 792 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 793 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 794 795 /* Set the points where matching start/end. */ 796 pmatch[0].rm_so = 0; 797 pmatch[0].rm_eo = mctx.match_last; 798 799 if (!preg->no_sub && nmatch > 1) 800 { 801 err = set_regs (preg, &mctx, nmatch, pmatch, 802 dfa->has_plural_match && dfa->nbackref > 0); 803 if (BE (err != REG_NOERROR, 0)) 804 goto free_return; 805 } 806 807 /* At last, add the offset to the each registers, since we slided 808 the buffers so that We can assume that the matching starts from 0. */ 809 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 810 if (pmatch[reg_idx].rm_so != -1) 811 { 812 pmatch[reg_idx].rm_so += match_first; 813 pmatch[reg_idx].rm_eo += match_first; 814 } 815 } 816 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR; 817 free_return: 818 re_free (mctx.state_log); 819 if (dfa->nbackref) 820 match_ctx_free (&mctx); 821 re_string_destruct (&input); 822 return err; 823} 824 825static reg_errcode_t 826prune_impossible_nodes (preg, mctx) 827 const regex_t *preg; 828 re_match_context_t *mctx; 829{ 830 int halt_node, match_last; 831 reg_errcode_t ret; 832 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 833 re_dfastate_t **sifted_states; 834 re_dfastate_t **lim_states = NULL; 835 re_sift_context_t sctx; 836#ifdef DEBUG 837 assert (mctx->state_log != NULL); 838#endif 839 match_last = mctx->match_last; 840 halt_node = mctx->last_node; 841 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 842 if (BE (sifted_states == NULL, 0)) 843 { 844 ret = REG_ESPACE; 845 goto free_return; 846 } 847 if (dfa->nbackref) 848 { 849 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 850 if (BE (lim_states == NULL, 0)) 851 { 852 ret = REG_ESPACE; 853 goto free_return; 854 } 855 while (1) 856 { 857 memset (lim_states, '\0', 858 sizeof (re_dfastate_t *) * (match_last + 1)); 859 match_ctx_clear_flag (mctx); 860 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 861 match_last, 0); 862 ret = sift_states_backward (preg, mctx, &sctx); 863 re_node_set_free (&sctx.limits); 864 if (BE (ret != REG_NOERROR, 0)) 865 goto free_return; 866 if (sifted_states[0] != NULL || lim_states[0] != NULL) 867 break; 868 do 869 { 870 --match_last; 871 if (match_last < 0) 872 { 873 ret = REG_NOMATCH; 874 goto free_return; 875 } 876 } while (!mctx->state_log[match_last]->halt); 877 halt_node = check_halt_state_context (preg, 878 mctx->state_log[match_last], 879 mctx, match_last); 880 } 881 ret = merge_state_array (dfa, sifted_states, lim_states, 882 match_last + 1); 883 re_free (lim_states); 884 lim_states = NULL; 885 if (BE (ret != REG_NOERROR, 0)) 886 goto free_return; 887 } 888 else 889 { 890 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 891 match_last, 0); 892 ret = sift_states_backward (preg, mctx, &sctx); 893 re_node_set_free (&sctx.limits); 894 if (BE (ret != REG_NOERROR, 0)) 895 goto free_return; 896 } 897 re_free (mctx->state_log); 898 mctx->state_log = sifted_states; 899 sifted_states = NULL; 900 mctx->last_node = halt_node; 901 mctx->match_last = match_last; 902 ret = REG_NOERROR; 903 free_return: 904 re_free (sifted_states); 905 re_free (lim_states); 906 return ret; 907} 908 909/* Acquire an initial state and return it. 910 We must select appropriate initial state depending on the context, 911 since initial states may have constraints like "\<", "^", etc.. */ 912 913static inline re_dfastate_t * 914acquire_init_state_context (err, preg, mctx, idx) 915 reg_errcode_t *err; 916 const regex_t *preg; 917 const re_match_context_t *mctx; 918 int idx; 919{ 920 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 921 922 *err = REG_NOERROR; 923 if (dfa->init_state->has_constraint) 924 { 925 unsigned int context; 926 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, 927 preg->newline_anchor); 928 if (IS_WORD_CONTEXT (context)) 929 return dfa->init_state_word; 930 else if (IS_ORDINARY_CONTEXT (context)) 931 return dfa->init_state; 932 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 933 return dfa->init_state_begbuf; 934 else if (IS_NEWLINE_CONTEXT (context)) 935 return dfa->init_state_nl; 936 else if (IS_BEGBUF_CONTEXT (context)) 937 { 938 /* It is relatively rare case, then calculate on demand. */ 939 return re_acquire_state_context (err, dfa, 940 dfa->init_state->entrance_nodes, 941 context); 942 } 943 else 944 /* Must not happen? */ 945 return dfa->init_state; 946 } 947 else 948 return dfa->init_state; 949} 950 951/* Check whether the regular expression match input string INPUT or not, 952 and return the index where the matching end, return -1 if not match, 953 or return -2 in case of an error. 954 FL_SEARCH means we must search where the matching starts, 955 FL_LONGEST_MATCH means we want the POSIX longest matching. 956 Note that the matcher assume that the maching starts from the current 957 index of the buffer. */ 958 959static int 960check_matching (preg, mctx, fl_search, fl_longest_match) 961 const regex_t *preg; 962 re_match_context_t *mctx; 963 int fl_search, fl_longest_match; 964{ 965 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 966 reg_errcode_t err; 967 int match = 0; 968 int match_last = -1; 969 int cur_str_idx = re_string_cur_idx (mctx->input); 970 re_dfastate_t *cur_state; 971 972 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx); 973 /* An initial state must not be NULL(invalid state). */ 974 if (BE (cur_state == NULL, 0)) 975 return -2; 976 if (mctx->state_log != NULL) 977 mctx->state_log[cur_str_idx] = cur_state; 978 979 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 980 later. E.g. Processing back references. */ 981 if (dfa->nbackref) 982 { 983 err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0); 984 if (BE (err != REG_NOERROR, 0)) 985 return err; 986 } 987 988 if (cur_state->has_backref) 989 { 990 err = transit_state_bkref (preg, &cur_state->nodes, mctx); 991 if (BE (err != REG_NOERROR, 0)) 992 return err; 993 } 994 995 /* If the RE accepts NULL string. */ 996 if (cur_state->halt) 997 { 998 if (!cur_state->has_constraint 999 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx)) 1000 { 1001 if (!fl_longest_match) 1002 return cur_str_idx; 1003 else 1004 { 1005 match_last = cur_str_idx; 1006 match = 1; 1007 } 1008 } 1009 } 1010 1011 while (!re_string_eoi (mctx->input)) 1012 { 1013 cur_state = transit_state (&err, preg, mctx, cur_state, 1014 fl_search && !match); 1015 if (cur_state == NULL) /* Reached at the invalid state or an error. */ 1016 { 1017 cur_str_idx = re_string_cur_idx (mctx->input); 1018 if (BE (err != REG_NOERROR, 0)) 1019 return -2; 1020 if (fl_search && !match) 1021 { 1022 /* Restart from initial state, since we are searching 1023 the point from where matching start. */ 1024#ifdef RE_ENABLE_I18N 1025 if (re_mb_cur_max == 1 1026 || re_string_first_byte (mctx->input, cur_str_idx)) 1027#endif /* RE_ENABLE_I18N */ 1028 cur_state = acquire_init_state_context (&err, preg, mctx, 1029 cur_str_idx); 1030 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 1031 return -2; 1032 if (mctx->state_log != NULL) 1033 mctx->state_log[cur_str_idx] = cur_state; 1034 } 1035 else if (!fl_longest_match && match) 1036 break; 1037 else /* (fl_longest_match && match) || (!fl_search && !match) */ 1038 { 1039 if (mctx->state_log == NULL) 1040 break; 1041 else 1042 { 1043 int max = mctx->state_log_top; 1044 for (; cur_str_idx <= max; ++cur_str_idx) 1045 if (mctx->state_log[cur_str_idx] != NULL) 1046 break; 1047 if (cur_str_idx > max) 1048 break; 1049 } 1050 } 1051 } 1052 1053 if (cur_state != NULL && cur_state->halt) 1054 { 1055 /* Reached at a halt state. 1056 Check the halt state can satisfy the current context. */ 1057 if (!cur_state->has_constraint 1058 || check_halt_state_context (preg, cur_state, mctx, 1059 re_string_cur_idx (mctx->input))) 1060 { 1061 /* We found an appropriate halt state. */ 1062 match_last = re_string_cur_idx (mctx->input); 1063 match = 1; 1064 if (!fl_longest_match) 1065 break; 1066 } 1067 } 1068 } 1069 return match_last; 1070} 1071 1072/* Check NODE match the current context. */ 1073 1074static int check_halt_node_context (dfa, node, context) 1075 const re_dfa_t *dfa; 1076 int node; 1077 unsigned int context; 1078{ 1079 re_token_type_t type = dfa->nodes[node].type; 1080 unsigned int constraint = dfa->nodes[node].constraint; 1081 if (type != END_OF_RE) 1082 return 0; 1083 if (!constraint) 1084 return 1; 1085 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1086 return 0; 1087 return 1; 1088} 1089 1090/* Check the halt state STATE match the current context. 1091 Return 0 if not match, if the node, STATE has, is a halt node and 1092 match the context, return the node. */ 1093 1094static int 1095check_halt_state_context (preg, state, mctx, idx) 1096 const regex_t *preg; 1097 const re_dfastate_t *state; 1098 const re_match_context_t *mctx; 1099 int idx; 1100{ 1101 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1102 int i; 1103 unsigned int context; 1104#ifdef DEBUG 1105 assert (state->halt); 1106#endif 1107 context = re_string_context_at (mctx->input, idx, mctx->eflags, 1108 preg->newline_anchor); 1109 for (i = 0; i < state->nodes.nelem; ++i) 1110 if (check_halt_node_context (dfa, state->nodes.elems[i], context)) 1111 return state->nodes.elems[i]; 1112 return 0; 1113} 1114 1115/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1116 corresponding to the DFA). 1117 Return the destination node, and update EPS_VIA_NODES, return -1 in case 1118 of errors. */ 1119 1120static int 1121proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs) 1122 const regex_t *preg; 1123 regmatch_t *regs; 1124 const re_match_context_t *mctx; 1125 int nregs, *pidx, node; 1126 re_node_set *eps_via_nodes; 1127 struct re_fail_stack_t *fs; 1128{ 1129 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1130 int i, err, dest_node; 1131 dest_node = -1; 1132 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1133 { 1134 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1135 int ndest, dest_nodes[2]; 1136 err = re_node_set_insert (eps_via_nodes, node); 1137 if (BE (err < 0, 0)) 1138 return -1; 1139 /* Pick up valid destinations. */ 1140 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i) 1141 { 1142 int candidate = dfa->edests[node].elems[i]; 1143 if (!re_node_set_contains (cur_nodes, candidate)) 1144 continue; 1145 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0]; 1146 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1]; 1147 ++ndest; 1148 } 1149 if (ndest <= 1) 1150 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0); 1151 /* In order to avoid infinite loop like "(a*)*". */ 1152 if (re_node_set_contains (eps_via_nodes, dest_nodes[0])) 1153 return dest_nodes[1]; 1154 if (fs != NULL) 1155 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes); 1156 return dest_nodes[0]; 1157 } 1158 else 1159 { 1160 int naccepted = 0; 1161 re_token_type_t type = dfa->nodes[node].type; 1162 1163#ifdef RE_ENABLE_I18N 1164 if (ACCEPT_MB_NODE (type)) 1165 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx); 1166 else 1167#endif /* RE_ENABLE_I18N */ 1168 if (type == OP_BACK_REF) 1169 { 1170 int subexp_idx = dfa->nodes[node].opr.idx; 1171 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1172 if (fs != NULL) 1173 { 1174 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1175 return -1; 1176 else if (naccepted) 1177 { 1178 char *buf = (char *) re_string_get_buffer (mctx->input); 1179 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1180 naccepted) != 0) 1181 return -1; 1182 } 1183 } 1184 1185 if (naccepted == 0) 1186 { 1187 err = re_node_set_insert (eps_via_nodes, node); 1188 if (BE (err < 0, 0)) 1189 return -2; 1190 dest_node = dfa->edests[node].elems[0]; 1191 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1192 dest_node)) 1193 return dest_node; 1194 } 1195 } 1196 1197 if (naccepted != 0 1198 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx)) 1199 { 1200 dest_node = dfa->nexts[node]; 1201 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1202 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1203 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1204 dest_node))) 1205 return -1; 1206 re_node_set_empty (eps_via_nodes); 1207 return dest_node; 1208 } 1209 } 1210 return -1; 1211} 1212 1213static reg_errcode_t 1214push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes) 1215 struct re_fail_stack_t *fs; 1216 int str_idx, *dests, nregs; 1217 regmatch_t *regs; 1218 re_node_set *eps_via_nodes; 1219{ 1220 reg_errcode_t err; 1221 int num = fs->num++; 1222 if (fs->num == fs->alloc) 1223 { 1224 struct re_fail_stack_ent_t *new_array; 1225 fs->alloc *= 2; 1226 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1227 * fs->alloc)); 1228 if (new_array == NULL) 1229 return REG_ESPACE; 1230 fs->stack = new_array; 1231 } 1232 fs->stack[num].idx = str_idx; 1233 fs->stack[num].node = dests[1]; 1234 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1235 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1236 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1237 return err; 1238} 1239 1240static int 1241pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) 1242 struct re_fail_stack_t *fs; 1243 int *pidx, nregs; 1244 regmatch_t *regs; 1245 re_node_set *eps_via_nodes; 1246{ 1247 int num = --fs->num; 1248 assert (num >= 0); 1249 *pidx = fs->stack[num].idx; 1250 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1251 re_node_set_free (eps_via_nodes); 1252 re_free (fs->stack[num].regs); 1253 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1254 return fs->stack[num].node; 1255} 1256 1257/* Set the positions where the subexpressions are starts/ends to registers 1258 PMATCH. 1259 Note: We assume that pmatch[0] is already set, and 1260 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */ 1261 1262static reg_errcode_t 1263set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) 1264 const regex_t *preg; 1265 const re_match_context_t *mctx; 1266 size_t nmatch; 1267 regmatch_t *pmatch; 1268 int fl_backtrack; 1269{ 1270 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1271 int idx, cur_node, real_nmatch; 1272 re_node_set eps_via_nodes; 1273 struct re_fail_stack_t *fs; 1274 struct re_fail_stack_t fs_body; 1275 fs_body.num = 0; 1276 fs_body.alloc = 2; 1277 fs_body.stack = NULL; 1278#ifdef DEBUG 1279 assert (nmatch > 1); 1280 assert (mctx->state_log != NULL); 1281#endif 1282 if (fl_backtrack) 1283 { 1284 fs = &fs_body; 1285 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1286 } 1287 else 1288 fs = NULL; 1289 cur_node = dfa->init_node; 1290 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1; 1291 re_node_set_init_empty (&eps_via_nodes); 1292 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1293 { 1294 update_regs (dfa, pmatch, cur_node, idx, real_nmatch); 1295 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1296 { 1297 int reg_idx; 1298 if (fs) 1299 { 1300 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1301 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1302 break; 1303 if (reg_idx == nmatch) 1304 { 1305 re_node_set_free (&eps_via_nodes); 1306 return free_fail_stack_return (fs); 1307 } 1308 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1309 &eps_via_nodes); 1310 } 1311 else 1312 { 1313 re_node_set_free (&eps_via_nodes); 1314 return REG_NOERROR; 1315 } 1316 } 1317 1318 /* Proceed to next node. */ 1319 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node, 1320 &eps_via_nodes, fs); 1321 1322 if (BE (cur_node < 0, 0)) 1323 { 1324 if (cur_node == -2) 1325 return REG_ESPACE; 1326 if (fs) 1327 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1328 &eps_via_nodes); 1329 else 1330 { 1331 re_node_set_free (&eps_via_nodes); 1332 return REG_NOMATCH; 1333 } 1334 } 1335 } 1336 re_node_set_free (&eps_via_nodes); 1337 return free_fail_stack_return (fs); 1338} 1339 1340static reg_errcode_t 1341free_fail_stack_return (fs) 1342 struct re_fail_stack_t *fs; 1343{ 1344 if (fs) 1345 { 1346 int fs_idx; 1347 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1348 { 1349 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1350 re_free (fs->stack[fs_idx].regs); 1351 } 1352 re_free (fs->stack); 1353 } 1354 return REG_NOERROR; 1355} 1356 1357static void 1358update_regs (dfa, pmatch, cur_node, cur_idx, nmatch) 1359 re_dfa_t *dfa; 1360 regmatch_t *pmatch; 1361 int cur_node, cur_idx, nmatch; 1362{ 1363 int type = dfa->nodes[cur_node].type; 1364 int reg_num; 1365 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP) 1366 return; 1367 reg_num = dfa->nodes[cur_node].opr.idx + 1; 1368 if (reg_num >= nmatch) 1369 return; 1370 if (type == OP_OPEN_SUBEXP) 1371 { 1372 /* We are at the first node of this sub expression. */ 1373 pmatch[reg_num].rm_so = cur_idx; 1374 pmatch[reg_num].rm_eo = -1; 1375 } 1376 else if (type == OP_CLOSE_SUBEXP) 1377 /* We are at the first node of this sub expression. */ 1378 pmatch[reg_num].rm_eo = cur_idx; 1379} 1380 1381#define NUMBER_OF_STATE 1 1382 1383/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1384 and sift the nodes in each states according to the following rules. 1385 Updated state_log will be wrote to STATE_LOG. 1386 1387 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1388 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1389 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1390 the LAST_NODE, we throw away the node `a'. 1391 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1392 string `s' and transit to `b': 1393 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1394 away the node `a'. 1395 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1396 throwed away, we throw away the node `a'. 1397 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b': 1398 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1399 node `a'. 1400 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away, 1401 we throw away the node `a'. */ 1402 1403#define STATE_NODE_CONTAINS(state,node) \ 1404 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1405 1406static reg_errcode_t 1407sift_states_backward (preg, mctx, sctx) 1408 const regex_t *preg; 1409 re_match_context_t *mctx; 1410 re_sift_context_t *sctx; 1411{ 1412 reg_errcode_t err; 1413 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1414 int null_cnt = 0; 1415 int str_idx = sctx->last_str_idx; 1416 re_node_set cur_dest; 1417 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */ 1418 1419#ifdef DEBUG 1420 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1421#endif 1422 cur_src = &mctx->state_log[str_idx]->nodes; 1423 1424 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1425 transit to the last_node and the last_node itself. */ 1426 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1427 if (BE (err != REG_NOERROR, 0)) 1428 return err; 1429 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest); 1430 if (BE (err != REG_NOERROR, 0)) 1431 goto free_return; 1432 1433 /* Then check each states in the state_log. */ 1434 while (str_idx > 0) 1435 { 1436 int i, ret; 1437 /* Update counters. */ 1438 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1439 if (null_cnt > mctx->max_mb_elem_len) 1440 { 1441 memset (sctx->sifted_states, '\0', 1442 sizeof (re_dfastate_t *) * str_idx); 1443 re_node_set_free (&cur_dest); 1444 return REG_NOERROR; 1445 } 1446 re_node_set_empty (&cur_dest); 1447 --str_idx; 1448 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1449 : &mctx->state_log[str_idx]->nodes); 1450 1451 /* Then build the next sifted state. 1452 We build the next sifted state on `cur_dest', and update 1453 `sifted_states[str_idx]' with `cur_dest'. 1454 Note: 1455 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1456 `cur_src' points the node_set of the old `state_log[str_idx]'. */ 1457 for (i = 0; i < cur_src->nelem; i++) 1458 { 1459 int prev_node = cur_src->elems[i]; 1460 int naccepted = 0; 1461 re_token_type_t type = dfa->nodes[prev_node].type; 1462 1463 if (IS_EPSILON_NODE(type)) 1464 continue; 1465#ifdef RE_ENABLE_I18N 1466 /* If the node may accept `multi byte'. */ 1467 if (ACCEPT_MB_NODE (type)) 1468 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node, 1469 str_idx, sctx->last_str_idx); 1470 1471#endif /* RE_ENABLE_I18N */ 1472 /* We don't check backreferences here. 1473 See update_cur_sifted_state(). */ 1474 1475 if (!naccepted 1476 && check_node_accept (preg, dfa->nodes + prev_node, mctx, 1477 str_idx) 1478 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1479 dfa->nexts[prev_node])) 1480 naccepted = 1; 1481 1482 if (naccepted == 0) 1483 continue; 1484 1485 if (sctx->limits.nelem) 1486 { 1487 int to_idx = str_idx + naccepted; 1488 if (check_dst_limits (dfa, &sctx->limits, mctx, 1489 dfa->nexts[prev_node], to_idx, 1490 prev_node, str_idx)) 1491 continue; 1492 } 1493 ret = re_node_set_insert (&cur_dest, prev_node); 1494 if (BE (ret == -1, 0)) 1495 { 1496 err = REG_ESPACE; 1497 goto free_return; 1498 } 1499 } 1500 1501 /* Add all the nodes which satisfy the following conditions: 1502 - It can epsilon transit to a node in CUR_DEST. 1503 - It is in CUR_SRC. 1504 And update state_log. */ 1505 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest); 1506 if (BE (err != REG_NOERROR, 0)) 1507 goto free_return; 1508 } 1509 err = REG_NOERROR; 1510 free_return: 1511 re_node_set_free (&cur_dest); 1512 return err; 1513} 1514 1515/* Helper functions. */ 1516 1517static inline reg_errcode_t 1518clean_state_log_if_need (mctx, next_state_log_idx) 1519 re_match_context_t *mctx; 1520 int next_state_log_idx; 1521{ 1522 int top = mctx->state_log_top; 1523 1524 if (next_state_log_idx >= mctx->input->bufs_len 1525 || (next_state_log_idx >= mctx->input->valid_len 1526 && mctx->input->valid_len < mctx->input->len)) 1527 { 1528 reg_errcode_t err; 1529 err = extend_buffers (mctx); 1530 if (BE (err != REG_NOERROR, 0)) 1531 return err; 1532 } 1533 1534 if (top < next_state_log_idx) 1535 { 1536 memset (mctx->state_log + top + 1, '\0', 1537 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1538 mctx->state_log_top = next_state_log_idx; 1539 } 1540 return REG_NOERROR; 1541} 1542 1543static reg_errcode_t 1544merge_state_array (dfa, dst, src, num) 1545 re_dfa_t *dfa; 1546 re_dfastate_t **dst; 1547 re_dfastate_t **src; 1548 int num; 1549{ 1550 int st_idx; 1551 reg_errcode_t err; 1552 for (st_idx = 0; st_idx < num; ++st_idx) 1553 { 1554 if (dst[st_idx] == NULL) 1555 dst[st_idx] = src[st_idx]; 1556 else if (src[st_idx] != NULL) 1557 { 1558 re_node_set merged_set; 1559 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1560 &src[st_idx]->nodes); 1561 if (BE (err != REG_NOERROR, 0)) 1562 return err; 1563 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1564 re_node_set_free (&merged_set); 1565 if (BE (err != REG_NOERROR, 0)) 1566 return err; 1567 } 1568 } 1569 return REG_NOERROR; 1570} 1571 1572static reg_errcode_t 1573update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes) 1574 const regex_t *preg; 1575 re_match_context_t *mctx; 1576 re_sift_context_t *sctx; 1577 int str_idx; 1578 re_node_set *dest_nodes; 1579{ 1580 reg_errcode_t err; 1581 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1582 const re_node_set *candidates; 1583 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1584 : &mctx->state_log[str_idx]->nodes); 1585 1586 /* At first, add the nodes which can epsilon transit to a node in 1587 DEST_NODE. */ 1588 if (dest_nodes->nelem) 1589 { 1590 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1591 if (BE (err != REG_NOERROR, 0)) 1592 return err; 1593 } 1594 1595 /* Then, check the limitations in the current sift_context. */ 1596 if (dest_nodes->nelem && sctx->limits.nelem) 1597 { 1598 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1599 mctx->bkref_ents, str_idx); 1600 if (BE (err != REG_NOERROR, 0)) 1601 return err; 1602 } 1603 1604 /* Update state_log. */ 1605 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1606 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0)) 1607 return err; 1608 1609 if ((mctx->state_log[str_idx] != NULL 1610 && mctx->state_log[str_idx]->has_backref)) 1611 { 1612 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes); 1613 if (BE (err != REG_NOERROR, 0)) 1614 return err; 1615 } 1616 return REG_NOERROR; 1617} 1618 1619static reg_errcode_t 1620add_epsilon_src_nodes (dfa, dest_nodes, candidates) 1621 re_dfa_t *dfa; 1622 re_node_set *dest_nodes; 1623 const re_node_set *candidates; 1624{ 1625 reg_errcode_t err; 1626 int src_idx; 1627 re_node_set src_copy; 1628 1629 err = re_node_set_init_copy (&src_copy, dest_nodes); 1630 if (BE (err != REG_NOERROR, 0)) 1631 return err; 1632 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx) 1633 { 1634 err = re_node_set_add_intersect (dest_nodes, candidates, 1635 dfa->inveclosures 1636 + src_copy.elems[src_idx]); 1637 if (BE (err != REG_NOERROR, 0)) 1638 { 1639 re_node_set_free (&src_copy); 1640 return err; 1641 } 1642 } 1643 re_node_set_free (&src_copy); 1644 return REG_NOERROR; 1645} 1646 1647static reg_errcode_t 1648sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) 1649 re_dfa_t *dfa; 1650 int node; 1651 re_node_set *dest_nodes; 1652 const re_node_set *candidates; 1653{ 1654 int ecl_idx; 1655 reg_errcode_t err; 1656 re_node_set *inv_eclosure = dfa->inveclosures + node; 1657 re_node_set except_nodes; 1658 re_node_set_init_empty (&except_nodes); 1659 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1660 { 1661 int cur_node = inv_eclosure->elems[ecl_idx]; 1662 if (cur_node == node) 1663 continue; 1664 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1665 { 1666 int edst1 = dfa->edests[cur_node].elems[0]; 1667 int edst2 = ((dfa->edests[cur_node].nelem > 1) 1668 ? dfa->edests[cur_node].elems[1] : -1); 1669 if ((!re_node_set_contains (inv_eclosure, edst1) 1670 && re_node_set_contains (dest_nodes, edst1)) 1671 || (edst2 > 0 1672 && !re_node_set_contains (inv_eclosure, edst2) 1673 && re_node_set_contains (dest_nodes, edst2))) 1674 { 1675 err = re_node_set_add_intersect (&except_nodes, candidates, 1676 dfa->inveclosures + cur_node); 1677 if (BE (err != REG_NOERROR, 0)) 1678 { 1679 re_node_set_free (&except_nodes); 1680 return err; 1681 } 1682 } 1683 } 1684 } 1685 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1686 { 1687 int cur_node = inv_eclosure->elems[ecl_idx]; 1688 if (!re_node_set_contains (&except_nodes, cur_node)) 1689 { 1690 int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1691 re_node_set_remove_at (dest_nodes, idx); 1692 } 1693 } 1694 re_node_set_free (&except_nodes); 1695 return REG_NOERROR; 1696} 1697 1698static int 1699check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx) 1700 re_dfa_t *dfa; 1701 re_node_set *limits; 1702 re_match_context_t *mctx; 1703 int dst_node, dst_idx, src_node, src_idx; 1704{ 1705 int lim_idx, src_pos, dst_pos; 1706 1707 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1708 { 1709 int subexp_idx; 1710 struct re_backref_cache_entry *ent; 1711 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1712 subexp_idx = dfa->nodes[ent->node].opr.idx - 1; 1713 1714 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx], 1715 dfa->eclosures + dst_node, 1716 subexp_idx, dst_node, dst_idx); 1717 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx], 1718 dfa->eclosures + src_node, 1719 subexp_idx, src_node, src_idx); 1720 1721 /* In case of: 1722 <src> <dst> ( <subexp> ) 1723 ( <subexp> ) <src> <dst> 1724 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1725 if (src_pos == dst_pos) 1726 continue; /* This is unrelated limitation. */ 1727 else 1728 return 1; 1729 } 1730 return 0; 1731} 1732 1733static int 1734check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node, 1735 str_idx) 1736 re_dfa_t *dfa; 1737 re_match_context_t *mctx; 1738 re_node_set *eclosures; 1739 int limit, subexp_idx, node, str_idx; 1740{ 1741 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 1742 int pos = (str_idx < lim->subexp_from ? -1 1743 : (lim->subexp_to < str_idx ? 1 : 0)); 1744 if (pos == 0 1745 && (str_idx == lim->subexp_from || str_idx == lim->subexp_to)) 1746 { 1747 int node_idx; 1748 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1749 { 1750 int node = eclosures->elems[node_idx]; 1751 re_token_type_t type= dfa->nodes[node].type; 1752 if (type == OP_BACK_REF) 1753 { 1754 int bi = search_cur_bkref_entry (mctx, str_idx); 1755 for (; bi < mctx->nbkref_ents; ++bi) 1756 { 1757 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi; 1758 if (ent->str_idx > str_idx) 1759 break; 1760 if (ent->node == node && ent->subexp_from == ent->subexp_to) 1761 { 1762 int cpos, dst; 1763 dst = dfa->edests[node].elems[0]; 1764 cpos = check_dst_limits_calc_pos (dfa, mctx, limit, 1765 dfa->eclosures + dst, 1766 subexp_idx, dst, 1767 str_idx); 1768 if ((str_idx == lim->subexp_from && cpos == -1) 1769 || (str_idx == lim->subexp_to && cpos == 0)) 1770 return cpos; 1771 } 1772 } 1773 } 1774 if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx 1775 && str_idx == lim->subexp_from) 1776 { 1777 pos = -1; 1778 break; 1779 } 1780 if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx 1781 && str_idx == lim->subexp_to) 1782 break; 1783 } 1784 if (node_idx == eclosures->nelem && str_idx == lim->subexp_to) 1785 pos = 1; 1786 } 1787 return pos; 1788} 1789 1790/* Check the limitations of sub expressions LIMITS, and remove the nodes 1791 which are against limitations from DEST_NODES. */ 1792 1793static reg_errcode_t 1794check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) 1795 re_dfa_t *dfa; 1796 re_node_set *dest_nodes; 1797 const re_node_set *candidates; 1798 re_node_set *limits; 1799 struct re_backref_cache_entry *bkref_ents; 1800 int str_idx; 1801{ 1802 reg_errcode_t err; 1803 int node_idx, lim_idx; 1804 1805 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1806 { 1807 int subexp_idx; 1808 struct re_backref_cache_entry *ent; 1809 ent = bkref_ents + limits->elems[lim_idx]; 1810 1811 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 1812 continue; /* This is unrelated limitation. */ 1813 1814 subexp_idx = dfa->nodes[ent->node].opr.idx - 1; 1815 if (ent->subexp_to == str_idx) 1816 { 1817 int ops_node = -1; 1818 int cls_node = -1; 1819 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1820 { 1821 int node = dest_nodes->elems[node_idx]; 1822 re_token_type_t type= dfa->nodes[node].type; 1823 if (type == OP_OPEN_SUBEXP 1824 && subexp_idx == dfa->nodes[node].opr.idx) 1825 ops_node = node; 1826 else if (type == OP_CLOSE_SUBEXP 1827 && subexp_idx == dfa->nodes[node].opr.idx) 1828 cls_node = node; 1829 } 1830 1831 /* Check the limitation of the open subexpression. */ 1832 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 1833 if (ops_node >= 0) 1834 { 1835 err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes, 1836 candidates); 1837 if (BE (err != REG_NOERROR, 0)) 1838 return err; 1839 } 1840 /* Check the limitation of the close subexpression. */ 1841 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1842 { 1843 int node = dest_nodes->elems[node_idx]; 1844 if (!re_node_set_contains (dfa->inveclosures + node, cls_node) 1845 && !re_node_set_contains (dfa->eclosures + node, cls_node)) 1846 { 1847 /* It is against this limitation. 1848 Remove it form the current sifted state. */ 1849 err = sub_epsilon_src_nodes(dfa, node, dest_nodes, 1850 candidates); 1851 if (BE (err != REG_NOERROR, 0)) 1852 return err; 1853 --node_idx; 1854 } 1855 } 1856 } 1857 else /* (ent->subexp_to != str_idx) */ 1858 { 1859 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1860 { 1861 int node = dest_nodes->elems[node_idx]; 1862 re_token_type_t type= dfa->nodes[node].type; 1863 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 1864 { 1865 if (subexp_idx != dfa->nodes[node].opr.idx) 1866 continue; 1867 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx) 1868 || (type == OP_OPEN_SUBEXP)) 1869 { 1870 /* It is against this limitation. 1871 Remove it form the current sifted state. */ 1872 err = sub_epsilon_src_nodes(dfa, node, dest_nodes, 1873 candidates); 1874 if (BE (err != REG_NOERROR, 0)) 1875 return err; 1876 } 1877 } 1878 } 1879 } 1880 } 1881 return REG_NOERROR; 1882} 1883 1884static reg_errcode_t 1885sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes) 1886 const regex_t *preg; 1887 re_match_context_t *mctx; 1888 re_sift_context_t *sctx; 1889 int str_idx; 1890 re_node_set *dest_nodes; 1891{ 1892 reg_errcode_t err; 1893 re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1894 int node_idx, node; 1895 re_sift_context_t local_sctx; 1896 const re_node_set *candidates; 1897 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1898 : &mctx->state_log[str_idx]->nodes); 1899 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 1900 1901 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 1902 { 1903 int cur_bkref_idx = re_string_cur_idx (mctx->input); 1904 re_token_type_t type; 1905 node = candidates->elems[node_idx]; 1906 type = dfa->nodes[node].type; 1907 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx) 1908 continue; 1909 /* Avoid infinite loop for the REs like "()\1+". */ 1910 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 1911 continue; 1912 if (type == OP_BACK_REF) 1913 { 1914 int enabled_idx = search_cur_bkref_entry (mctx, str_idx); 1915 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx) 1916 { 1917 int disabled_idx, subexp_len, to_idx, dst_node; 1918 struct re_backref_cache_entry *entry; 1919 entry = mctx->bkref_ents + enabled_idx; 1920 if (entry->str_idx > str_idx) 1921 break; 1922 if (entry->node != node) 1923 continue; 1924 subexp_len = entry->subexp_to - entry->subexp_from; 1925 to_idx = str_idx + subexp_len; 1926 dst_node = (subexp_len ? dfa->nexts[node] 1927 : dfa->edests[node].elems[0]); 1928 1929 if (to_idx > sctx->last_str_idx 1930 || sctx->sifted_states[to_idx] == NULL 1931 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], 1932 dst_node) 1933 || check_dst_limits (dfa, &sctx->limits, mctx, node, 1934 str_idx, dst_node, to_idx)) 1935 continue; 1936 { 1937 re_dfastate_t *cur_state; 1938 entry->flag = 0; 1939 for (disabled_idx = enabled_idx + 1; 1940 disabled_idx < mctx->nbkref_ents; ++disabled_idx) 1941 { 1942 struct re_backref_cache_entry *entry2; 1943 entry2 = mctx->bkref_ents + disabled_idx; 1944 if (entry2->str_idx > str_idx) 1945 break; 1946 entry2->flag = (entry2->node == node) ? 1 : entry2->flag; 1947 } 1948 1949 if (local_sctx.sifted_states == NULL) 1950 { 1951 local_sctx = *sctx; 1952 err = re_node_set_init_copy (&local_sctx.limits, 1953 &sctx->limits); 1954 if (BE (err != REG_NOERROR, 0)) 1955 goto free_return; 1956 } 1957 local_sctx.last_node = node; 1958 local_sctx.last_str_idx = str_idx; 1959 err = re_node_set_insert (&local_sctx.limits, enabled_idx); 1960 if (BE (err < 0, 0)) 1961 { 1962 err = REG_ESPACE; 1963 goto free_return; 1964 } 1965 cur_state = local_sctx.sifted_states[str_idx]; 1966 err = sift_states_backward (preg, mctx, &local_sctx); 1967 if (BE (err != REG_NOERROR, 0)) 1968 goto free_return; 1969 if (sctx->limited_states != NULL) 1970 { 1971 err = merge_state_array (dfa, sctx->limited_states, 1972 local_sctx.sifted_states, 1973 str_idx + 1); 1974 if (BE (err != REG_NOERROR, 0)) 1975 goto free_return; 1976 } 1977 local_sctx.sifted_states[str_idx] = cur_state; 1978 re_node_set_remove (&local_sctx.limits, enabled_idx); 1979 /* We must not use the variable entry here, since 1980 mctx->bkref_ents might be realloced. */ 1981 mctx->bkref_ents[enabled_idx].flag = 1; 1982 } 1983 } 1984 enabled_idx = search_cur_bkref_entry (mctx, str_idx); 1985 for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx) 1986 { 1987 struct re_backref_cache_entry *entry; 1988 entry = mctx->bkref_ents + enabled_idx; 1989 if (entry->str_idx > str_idx) 1990 break; 1991 if (entry->node == node) 1992 entry->flag = 0; 1993 } 1994 } 1995 } 1996 err = REG_NOERROR; 1997 free_return: 1998 if (local_sctx.sifted_states != NULL) 1999 { 2000 re_node_set_free (&local_sctx.limits); 2001 } 2002 2003 return err; 2004} 2005 2006 2007#ifdef RE_ENABLE_I18N 2008static int 2009sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx) 2010 const regex_t *preg; 2011 const re_match_context_t *mctx; 2012 re_sift_context_t *sctx; 2013 int node_idx, str_idx, max_str_idx; 2014{ 2015 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2016 int naccepted; 2017 /* Check the node can accept `multi byte'. */ 2018 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx); 2019 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2020 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2021 dfa->nexts[node_idx])) 2022 /* The node can't accept the `multi byte', or the 2023 destination was already throwed away, then the node 2024 could't accept the current input `multi byte'. */ 2025 naccepted = 0; 2026 /* Otherwise, it is sure that the node could accept 2027 `naccepted' bytes input. */ 2028 return naccepted; 2029} 2030#endif /* RE_ENABLE_I18N */ 2031 2032 2033/* Functions for state transition. */ 2034 2035/* Return the next state to which the current state STATE will transit by 2036 accepting the current input byte, and update STATE_LOG if necessary. 2037 If STATE can accept a multibyte char/collating element/back reference 2038 update the destination of STATE_LOG. */ 2039 2040static re_dfastate_t * 2041transit_state (err, preg, mctx, state, fl_search) 2042 reg_errcode_t *err; 2043 const regex_t *preg; 2044 re_match_context_t *mctx; 2045 re_dfastate_t *state; 2046 int fl_search; 2047{ 2048 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2049 re_dfastate_t **trtable, *next_state; 2050 unsigned char ch; 2051 int cur_idx; 2052 2053 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len 2054 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len 2055 && mctx->input->valid_len < mctx->input->len)) 2056 { 2057 *err = extend_buffers (mctx); 2058 if (BE (*err != REG_NOERROR, 0)) 2059 return NULL; 2060 } 2061 2062 *err = REG_NOERROR; 2063 if (state == NULL) 2064 { 2065 next_state = state; 2066 re_string_skip_bytes (mctx->input, 1); 2067 } 2068 else 2069 { 2070#ifdef RE_ENABLE_I18N 2071 /* If the current state can accept multibyte. */ 2072 if (state->accept_mb) 2073 { 2074 *err = transit_state_mb (preg, state, mctx); 2075 if (BE (*err != REG_NOERROR, 0)) 2076 return NULL; 2077 } 2078#endif /* RE_ENABLE_I18N */ 2079 2080 /* Then decide the next state with the single byte. */ 2081 if (1) 2082 { 2083 /* Use transition table */ 2084 ch = re_string_fetch_byte (mctx->input); 2085 trtable = fl_search ? state->trtable_search : state->trtable; 2086 if (trtable == NULL) 2087 { 2088 trtable = build_trtable (preg, state, fl_search); 2089 if (fl_search) 2090 state->trtable_search = trtable; 2091 else 2092 state->trtable = trtable; 2093 } 2094 next_state = trtable[ch]; 2095 } 2096 else 2097 { 2098 /* don't use transition table */ 2099 next_state = transit_state_sb (err, preg, state, fl_search, mctx); 2100 if (BE (next_state == NULL && err != REG_NOERROR, 0)) 2101 return NULL; 2102 } 2103 } 2104 2105 cur_idx = re_string_cur_idx (mctx->input); 2106 /* Update the state_log if we need. */ 2107 if (mctx->state_log != NULL) 2108 { 2109 if (cur_idx > mctx->state_log_top) 2110 { 2111 mctx->state_log[cur_idx] = next_state; 2112 mctx->state_log_top = cur_idx; 2113 } 2114 else if (mctx->state_log[cur_idx] == 0) 2115 { 2116 mctx->state_log[cur_idx] = next_state; 2117 } 2118 else 2119 { 2120 re_dfastate_t *pstate; 2121 unsigned int context; 2122 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2123 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2124 the destination of a multibyte char/collating element/ 2125 back reference. Then the next state is the union set of 2126 these destinations and the results of the transition table. */ 2127 pstate = mctx->state_log[cur_idx]; 2128 log_nodes = pstate->entrance_nodes; 2129 if (next_state != NULL) 2130 { 2131 table_nodes = next_state->entrance_nodes; 2132 *err = re_node_set_init_union (&next_nodes, table_nodes, 2133 log_nodes); 2134 if (BE (*err != REG_NOERROR, 0)) 2135 return NULL; 2136 } 2137 else 2138 next_nodes = *log_nodes; 2139 /* Note: We already add the nodes of the initial state, 2140 then we don't need to add them here. */ 2141 2142 context = re_string_context_at (mctx->input, 2143 re_string_cur_idx (mctx->input) - 1, 2144 mctx->eflags, preg->newline_anchor); 2145 next_state = mctx->state_log[cur_idx] 2146 = re_acquire_state_context (err, dfa, &next_nodes, context); 2147 /* We don't need to check errors here, since the return value of 2148 this function is next_state and ERR is already set. */ 2149 2150 if (table_nodes != NULL) 2151 re_node_set_free (&next_nodes); 2152 } 2153 } 2154 2155 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2156 later. We must check them here, since the back references in the 2157 next state might use them. */ 2158 if (dfa->nbackref && next_state/* && fl_process_bkref */) 2159 { 2160 *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes, 2161 cur_idx); 2162 if (BE (*err != REG_NOERROR, 0)) 2163 return NULL; 2164 } 2165 2166 /* If the next state has back references. */ 2167 if (next_state != NULL && next_state->has_backref) 2168 { 2169 *err = transit_state_bkref (preg, &next_state->nodes, mctx); 2170 if (BE (*err != REG_NOERROR, 0)) 2171 return NULL; 2172 next_state = mctx->state_log[cur_idx]; 2173 } 2174 return next_state; 2175} 2176 2177/* Helper functions for transit_state. */ 2178 2179/* From the node set CUR_NODES, pick up the nodes whose types are 2180 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2181 expression. And register them to use them later for evaluating the 2182 correspoding back references. */ 2183 2184static reg_errcode_t 2185check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx) 2186 re_dfa_t *dfa; 2187 re_match_context_t *mctx; 2188 re_node_set *cur_nodes; 2189 int str_idx; 2190{ 2191 int node_idx; 2192 reg_errcode_t err; 2193 2194 /* TODO: This isn't efficient. 2195 Because there might be more than one nodes whose types are 2196 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2197 nodes. 2198 E.g. RE: (a){2} */ 2199 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2200 { 2201 int node = cur_nodes->elems[node_idx]; 2202 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2203 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) 2204 { 2205 err = match_ctx_add_subtop (mctx, node, str_idx); 2206 if (BE (err != REG_NOERROR, 0)) 2207 return err; 2208 } 2209 } 2210 return REG_NOERROR; 2211} 2212 2213/* Return the next state to which the current state STATE will transit by 2214 accepting the current input byte. */ 2215 2216static re_dfastate_t * 2217transit_state_sb (err, preg, state, fl_search, mctx) 2218 reg_errcode_t *err; 2219 const regex_t *preg; 2220 re_dfastate_t *state; 2221 int fl_search; 2222 re_match_context_t *mctx; 2223{ 2224 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2225 re_node_set next_nodes; 2226 re_dfastate_t *next_state; 2227 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input); 2228 unsigned int context; 2229 2230 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2231 if (BE (*err != REG_NOERROR, 0)) 2232 return NULL; 2233 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2234 { 2235 int cur_node = state->nodes.elems[node_cnt]; 2236 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx)) 2237 { 2238 *err = re_node_set_merge (&next_nodes, 2239 dfa->eclosures + dfa->nexts[cur_node]); 2240 if (BE (*err != REG_NOERROR, 0)) 2241 { 2242 re_node_set_free (&next_nodes); 2243 return NULL; 2244 } 2245 } 2246 } 2247 if (fl_search) 2248 { 2249#ifdef RE_ENABLE_I18N 2250 int not_initial = 0; 2251 if (re_mb_cur_max > 1) 2252 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt) 2253 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER) 2254 { 2255 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial; 2256 break; 2257 } 2258 if (!not_initial) 2259#endif 2260 { 2261 *err = re_node_set_merge (&next_nodes, 2262 dfa->init_state->entrance_nodes); 2263 if (BE (*err != REG_NOERROR, 0)) 2264 { 2265 re_node_set_free (&next_nodes); 2266 return NULL; 2267 } 2268 } 2269 } 2270 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags, 2271 preg->newline_anchor); 2272 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2273 /* We don't need to check errors here, since the return value of 2274 this function is next_state and ERR is already set. */ 2275 2276 re_node_set_free (&next_nodes); 2277 re_string_skip_bytes (mctx->input, 1); 2278 return next_state; 2279} 2280 2281#ifdef RE_ENABLE_I18N 2282static reg_errcode_t 2283transit_state_mb (preg, pstate, mctx) 2284 const regex_t *preg; 2285 re_dfastate_t *pstate; 2286 re_match_context_t *mctx; 2287{ 2288 reg_errcode_t err; 2289 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2290 int i; 2291 2292 for (i = 0; i < pstate->nodes.nelem; ++i) 2293 { 2294 re_node_set dest_nodes, *new_nodes; 2295 int cur_node_idx = pstate->nodes.elems[i]; 2296 int naccepted = 0, dest_idx; 2297 unsigned int context; 2298 re_dfastate_t *dest_state; 2299 2300 if (dfa->nodes[cur_node_idx].constraint) 2301 { 2302 context = re_string_context_at (mctx->input, 2303 re_string_cur_idx (mctx->input), 2304 mctx->eflags, preg->newline_anchor); 2305 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2306 context)) 2307 continue; 2308 } 2309 2310 /* How many bytes the node can accepts? */ 2311 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type)) 2312 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input, 2313 re_string_cur_idx (mctx->input)); 2314 if (naccepted == 0) 2315 continue; 2316 2317 /* The node can accepts `naccepted' bytes. */ 2318 dest_idx = re_string_cur_idx (mctx->input) + naccepted; 2319 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2320 : mctx->max_mb_elem_len); 2321 err = clean_state_log_if_need (mctx, dest_idx); 2322 if (BE (err != REG_NOERROR, 0)) 2323 return err; 2324#ifdef DEBUG 2325 assert (dfa->nexts[cur_node_idx] != -1); 2326#endif 2327 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE, 2328 then we use pstate->nodes.elems[i] instead. */ 2329 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]]; 2330 2331 dest_state = mctx->state_log[dest_idx]; 2332 if (dest_state == NULL) 2333 dest_nodes = *new_nodes; 2334 else 2335 { 2336 err = re_node_set_init_union (&dest_nodes, 2337 dest_state->entrance_nodes, new_nodes); 2338 if (BE (err != REG_NOERROR, 0)) 2339 return err; 2340 } 2341 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags, 2342 preg->newline_anchor); 2343 mctx->state_log[dest_idx] 2344 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2345 if (dest_state != NULL) 2346 re_node_set_free (&dest_nodes); 2347 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2348 return err; 2349 } 2350 return REG_NOERROR; 2351} 2352#endif /* RE_ENABLE_I18N */ 2353 2354static reg_errcode_t 2355transit_state_bkref (preg, nodes, mctx) 2356 const regex_t *preg; 2357 re_node_set *nodes; 2358 re_match_context_t *mctx; 2359{ 2360 reg_errcode_t err; 2361 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2362 int i; 2363 int cur_str_idx = re_string_cur_idx (mctx->input); 2364 2365 for (i = 0; i < nodes->nelem; ++i) 2366 { 2367 int dest_str_idx, prev_nelem, bkc_idx; 2368 int node_idx = nodes->elems[i]; 2369 unsigned int context; 2370 re_token_t *node = dfa->nodes + node_idx; 2371 re_node_set *new_dest_nodes; 2372 2373 /* Check whether `node' is a backreference or not. */ 2374 if (node->type != OP_BACK_REF) 2375 continue; 2376 2377 if (node->constraint) 2378 { 2379 context = re_string_context_at (mctx->input, cur_str_idx, 2380 mctx->eflags, preg->newline_anchor); 2381 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2382 continue; 2383 } 2384 2385 /* `node' is a backreference. 2386 Check the substring which the substring matched. */ 2387 bkc_idx = mctx->nbkref_ents; 2388 err = get_subexp (preg, mctx, node_idx, cur_str_idx); 2389 if (BE (err != REG_NOERROR, 0)) 2390 goto free_return; 2391 2392 /* And add the epsilon closures (which is `new_dest_nodes') of 2393 the backreference to appropriate state_log. */ 2394#ifdef DEBUG 2395 assert (dfa->nexts[node_idx] != -1); 2396#endif 2397 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2398 { 2399 int subexp_len; 2400 re_dfastate_t *dest_state; 2401 struct re_backref_cache_entry *bkref_ent; 2402 bkref_ent = mctx->bkref_ents + bkc_idx; 2403 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2404 continue; 2405 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2406 new_dest_nodes = (subexp_len == 0 2407 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2408 : dfa->eclosures + dfa->nexts[node_idx]); 2409 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2410 - bkref_ent->subexp_from); 2411 context = re_string_context_at (mctx->input, dest_str_idx - 1, 2412 mctx->eflags, preg->newline_anchor); 2413 dest_state = mctx->state_log[dest_str_idx]; 2414 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2415 : mctx->state_log[cur_str_idx]->nodes.nelem); 2416 /* Add `new_dest_node' to state_log. */ 2417 if (dest_state == NULL) 2418 { 2419 mctx->state_log[dest_str_idx] 2420 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2421 context); 2422 if (BE (mctx->state_log[dest_str_idx] == NULL 2423 && err != REG_NOERROR, 0)) 2424 goto free_return; 2425 } 2426 else 2427 { 2428 re_node_set dest_nodes; 2429 err = re_node_set_init_union (&dest_nodes, 2430 dest_state->entrance_nodes, 2431 new_dest_nodes); 2432 if (BE (err != REG_NOERROR, 0)) 2433 { 2434 re_node_set_free (&dest_nodes); 2435 goto free_return; 2436 } 2437 mctx->state_log[dest_str_idx] 2438 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2439 re_node_set_free (&dest_nodes); 2440 if (BE (mctx->state_log[dest_str_idx] == NULL 2441 && err != REG_NOERROR, 0)) 2442 goto free_return; 2443 } 2444 /* We need to check recursively if the backreference can epsilon 2445 transit. */ 2446 if (subexp_len == 0 2447 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2448 { 2449 err = check_subexp_matching_top (dfa, mctx, new_dest_nodes, 2450 cur_str_idx); 2451 if (BE (err != REG_NOERROR, 0)) 2452 goto free_return; 2453 err = transit_state_bkref (preg, new_dest_nodes, mctx); 2454 if (BE (err != REG_NOERROR, 0)) 2455 goto free_return; 2456 } 2457 } 2458 } 2459 err = REG_NOERROR; 2460 free_return: 2461 return err; 2462} 2463 2464/* Enumerate all the candidates which the backreference BKREF_NODE can match 2465 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2466 Note that we might collect inappropriate candidates here. 2467 However, the cost of checking them strictly here is too high, then we 2468 delay these checking for prune_impossible_nodes(). */ 2469 2470static reg_errcode_t 2471get_subexp (preg, mctx, bkref_node, bkref_str_idx) 2472 const regex_t *preg; 2473 re_match_context_t *mctx; 2474 int bkref_node, bkref_str_idx; 2475{ 2476 int subexp_num, sub_top_idx; 2477 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2478 char *buf = (char *) re_string_get_buffer (mctx->input); 2479 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2480 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2481 for (; cache_idx < mctx->nbkref_ents; ++cache_idx) 2482 { 2483 struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; 2484 if (entry->str_idx > bkref_str_idx) 2485 break; 2486 if (entry->node == bkref_node) 2487 return REG_NOERROR; /* We already checked it. */ 2488 } 2489 subexp_num = dfa->nodes[bkref_node].opr.idx - 1; 2490 2491 /* For each sub expression */ 2492 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2493 { 2494 reg_errcode_t err; 2495 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2496 re_sub_match_last_t *sub_last; 2497 int sub_last_idx, sl_str; 2498 char *bkref_str; 2499 2500 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2501 continue; /* It isn't related. */ 2502 2503 sl_str = sub_top->str_idx; 2504 bkref_str = buf + bkref_str_idx; 2505 /* At first, check the last node of sub expressions we already 2506 evaluated. */ 2507 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2508 { 2509 int sl_str_diff; 2510 sub_last = sub_top->lasts[sub_last_idx]; 2511 sl_str_diff = sub_last->str_idx - sl_str; 2512 /* The matched string by the sub expression match with the substring 2513 at the back reference? */ 2514 if (sl_str_diff > 0 2515 && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0) 2516 break; /* We don't need to search this sub expression any more. */ 2517 bkref_str += sl_str_diff; 2518 sl_str += sl_str_diff; 2519 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, 2520 bkref_str_idx); 2521 if (err == REG_NOMATCH) 2522 continue; 2523 if (BE (err != REG_NOERROR, 0)) 2524 return err; 2525 } 2526 if (sub_last_idx < sub_top->nlasts) 2527 continue; 2528 if (sub_last_idx > 0) 2529 ++sl_str; 2530 /* Then, search for the other last nodes of the sub expression. */ 2531 for (; sl_str <= bkref_str_idx; ++sl_str) 2532 { 2533 int cls_node, sl_str_off; 2534 re_node_set *nodes; 2535 sl_str_off = sl_str - sub_top->str_idx; 2536 /* The matched string by the sub expression match with the substring 2537 at the back reference? */ 2538 if (sl_str_off > 0 2539 && memcmp (bkref_str++, buf + sl_str - 1, 1) != 0) 2540 break; /* We don't need to search this sub expression any more. */ 2541 if (mctx->state_log[sl_str] == NULL) 2542 continue; 2543 /* Does this state have a ')' of the sub expression? */ 2544 nodes = &mctx->state_log[sl_str]->nodes; 2545 cls_node = find_subexp_node (dfa, nodes, subexp_num, 0); 2546 if (cls_node == -1) 2547 continue; /* No. */ 2548 if (sub_top->path == NULL) 2549 { 2550 sub_top->path = calloc (sizeof (state_array_t), 2551 sl_str - sub_top->str_idx + 1); 2552 if (sub_top->path == NULL) 2553 return REG_ESPACE; 2554 } 2555 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2556 in the current context? */ 2557 err = check_arrival (preg, mctx, sub_top->path, sub_top->node, 2558 sub_top->str_idx, cls_node, sl_str, 0); 2559 if (err == REG_NOMATCH) 2560 continue; 2561 if (BE (err != REG_NOERROR, 0)) 2562 return err; 2563 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2564 if (BE (sub_last == NULL, 0)) 2565 return REG_ESPACE; 2566 err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, 2567 bkref_str_idx); 2568 if (err == REG_NOMATCH) 2569 continue; 2570 } 2571 } 2572 return REG_NOERROR; 2573} 2574 2575/* Helper functions for get_subexp(). */ 2576 2577/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2578 If it can arrive, register the sub expression expressed with SUB_TOP 2579 and SUB_LAST. */ 2580 2581static reg_errcode_t 2582get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str) 2583 const regex_t *preg; 2584 re_match_context_t *mctx; 2585 re_sub_match_top_t *sub_top; 2586 re_sub_match_last_t *sub_last; 2587 int bkref_node, bkref_str; 2588{ 2589 reg_errcode_t err; 2590 int to_idx; 2591 /* Can the subexpression arrive the back reference? */ 2592 err = check_arrival (preg, mctx, &sub_last->path, sub_last->node, 2593 sub_last->str_idx, bkref_node, bkref_str, 1); 2594 if (err != REG_NOERROR) 2595 return err; 2596 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2597 sub_last->str_idx); 2598 if (BE (err != REG_NOERROR, 0)) 2599 return err; 2600 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2601 clean_state_log_if_need (mctx, to_idx); 2602 return REG_NOERROR; 2603} 2604 2605/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2606 Search '(' if FL_OPEN, or search ')' otherwise. 2607 TODO: This function isn't efficient... 2608 Because there might be more than one nodes whose types are 2609 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2610 nodes. 2611 E.g. RE: (a){2} */ 2612 2613static int 2614find_subexp_node (dfa, nodes, subexp_idx, fl_open) 2615 re_dfa_t *dfa; 2616 re_node_set *nodes; 2617 int subexp_idx, fl_open; 2618{ 2619 int cls_idx; 2620 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2621 { 2622 int cls_node = nodes->elems[cls_idx]; 2623 re_token_t *node = dfa->nodes + cls_node; 2624 if (((fl_open && node->type == OP_OPEN_SUBEXP) 2625 || (!fl_open && node->type == OP_CLOSE_SUBEXP)) 2626 && node->opr.idx == subexp_idx) 2627 return cls_node; 2628 } 2629 return -1; 2630} 2631 2632/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2633 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2634 heavily reused. 2635 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2636 2637static reg_errcode_t 2638check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str, 2639 fl_open) 2640 const regex_t *preg; 2641 re_match_context_t *mctx; 2642 state_array_t *path; 2643 int top_node, top_str, last_node, last_str, fl_open; 2644{ 2645 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2646 reg_errcode_t err; 2647 int subexp_num, backup_cur_idx, str_idx, null_cnt; 2648 re_dfastate_t *cur_state = NULL; 2649 re_node_set *cur_nodes, next_nodes; 2650 re_dfastate_t **backup_state_log; 2651 unsigned int context; 2652 2653 subexp_num = dfa->nodes[top_node].opr.idx; 2654 /* Extend the buffer if we need. */ 2655 if (path->alloc < last_str + mctx->max_mb_elem_len + 1) 2656 { 2657 re_dfastate_t **new_array; 2658 int old_alloc = path->alloc; 2659 path->alloc += last_str + mctx->max_mb_elem_len + 1; 2660 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2661 if (new_array == NULL) 2662 return REG_ESPACE; 2663 path->array = new_array; 2664 memset (new_array + old_alloc, '\0', 2665 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2666 } 2667 2668 str_idx = path->next_idx == 0 ? top_str : path->next_idx; 2669 2670 /* Temporary modify MCTX. */ 2671 backup_state_log = mctx->state_log; 2672 backup_cur_idx = mctx->input->cur_idx; 2673 mctx->state_log = path->array; 2674 mctx->input->cur_idx = str_idx; 2675 2676 /* Setup initial node set. */ 2677 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags, 2678 preg->newline_anchor); 2679 if (str_idx == top_str) 2680 { 2681 err = re_node_set_init_1 (&next_nodes, top_node); 2682 if (BE (err != REG_NOERROR, 0)) 2683 return err; 2684 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, fl_open); 2685 if (BE (err != REG_NOERROR, 0)) 2686 { 2687 re_node_set_free (&next_nodes); 2688 return err; 2689 } 2690 } 2691 else 2692 { 2693 cur_state = mctx->state_log[str_idx]; 2694 if (cur_state && cur_state->has_backref) 2695 { 2696 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2697 if (BE ( err != REG_NOERROR, 0)) 2698 return err; 2699 } 2700 else 2701 re_node_set_init_empty (&next_nodes); 2702 } 2703 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2704 { 2705 if (next_nodes.nelem) 2706 { 2707 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str, 2708 subexp_num, fl_open); 2709 if (BE ( err != REG_NOERROR, 0)) 2710 { 2711 re_node_set_free (&next_nodes); 2712 return err; 2713 } 2714 } 2715 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2716 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2717 { 2718 re_node_set_free (&next_nodes); 2719 return err; 2720 } 2721 mctx->state_log[str_idx] = cur_state; 2722 } 2723 2724 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2725 { 2726 re_node_set_empty (&next_nodes); 2727 if (mctx->state_log[str_idx + 1]) 2728 { 2729 err = re_node_set_merge (&next_nodes, 2730 &mctx->state_log[str_idx + 1]->nodes); 2731 if (BE (err != REG_NOERROR, 0)) 2732 { 2733 re_node_set_free (&next_nodes); 2734 return err; 2735 } 2736 } 2737 if (cur_state) 2738 { 2739 err = check_arrival_add_next_nodes(preg, dfa, mctx, str_idx, 2740 &cur_state->nodes, &next_nodes); 2741 if (BE (err != REG_NOERROR, 0)) 2742 { 2743 re_node_set_free (&next_nodes); 2744 return err; 2745 } 2746 } 2747 ++str_idx; 2748 if (next_nodes.nelem) 2749 { 2750 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, 2751 fl_open); 2752 if (BE (err != REG_NOERROR, 0)) 2753 { 2754 re_node_set_free (&next_nodes); 2755 return err; 2756 } 2757 err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str, 2758 subexp_num, fl_open); 2759 if (BE ( err != REG_NOERROR, 0)) 2760 { 2761 re_node_set_free (&next_nodes); 2762 return err; 2763 } 2764 } 2765 context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags, 2766 preg->newline_anchor); 2767 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2768 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2769 { 2770 re_node_set_free (&next_nodes); 2771 return err; 2772 } 2773 mctx->state_log[str_idx] = cur_state; 2774 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 2775 } 2776 re_node_set_free (&next_nodes); 2777 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 2778 : &mctx->state_log[last_str]->nodes); 2779 path->next_idx = str_idx; 2780 2781 /* Fix MCTX. */ 2782 mctx->state_log = backup_state_log; 2783 mctx->input->cur_idx = backup_cur_idx; 2784 2785 if (cur_nodes == NULL) 2786 return REG_NOMATCH; 2787 /* Then check the current node set has the node LAST_NODE. */ 2788 return (re_node_set_contains (cur_nodes, last_node) 2789 || re_node_set_contains (cur_nodes, last_node) ? REG_NOERROR 2790 : REG_NOMATCH); 2791} 2792 2793/* Helper functions for check_arrival. */ 2794 2795/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 2796 to NEXT_NODES. 2797 TODO: This function is similar to the functions transit_state*(), 2798 however this function has many additional works. 2799 Can't we unify them? */ 2800 2801static reg_errcode_t 2802check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes) 2803 const regex_t *preg; 2804 re_dfa_t *dfa; 2805 re_match_context_t *mctx; 2806 int str_idx; 2807 re_node_set *cur_nodes, *next_nodes; 2808{ 2809 int cur_idx; 2810 reg_errcode_t err; 2811 re_node_set union_set; 2812 re_node_set_init_empty (&union_set); 2813 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 2814 { 2815 int naccepted = 0; 2816 int cur_node = cur_nodes->elems[cur_idx]; 2817 re_token_type_t type = dfa->nodes[cur_node].type; 2818 if (IS_EPSILON_NODE(type)) 2819 continue; 2820#ifdef RE_ENABLE_I18N 2821 /* If the node may accept `multi byte'. */ 2822 if (ACCEPT_MB_NODE (type)) 2823 { 2824 naccepted = check_node_accept_bytes (preg, cur_node, mctx->input, 2825 str_idx); 2826 if (naccepted > 1) 2827 { 2828 re_dfastate_t *dest_state; 2829 int next_node = dfa->nexts[cur_node]; 2830 int next_idx = str_idx + naccepted; 2831 dest_state = mctx->state_log[next_idx]; 2832 re_node_set_empty (&union_set); 2833 if (dest_state) 2834 { 2835 err = re_node_set_merge (&union_set, &dest_state->nodes); 2836 if (BE (err != REG_NOERROR, 0)) 2837 { 2838 re_node_set_free (&union_set); 2839 return err; 2840 } 2841 err = re_node_set_insert (&union_set, next_node); 2842 if (BE (err < 0, 0)) 2843 { 2844 re_node_set_free (&union_set); 2845 return REG_ESPACE; 2846 } 2847 } 2848 else 2849 { 2850 err = re_node_set_insert (&union_set, next_node); 2851 if (BE (err < 0, 0)) 2852 { 2853 re_node_set_free (&union_set); 2854 return REG_ESPACE; 2855 } 2856 } 2857 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 2858 &union_set); 2859 if (BE (mctx->state_log[next_idx] == NULL 2860 && err != REG_NOERROR, 0)) 2861 { 2862 re_node_set_free (&union_set); 2863 return err; 2864 } 2865 } 2866 } 2867#endif /* RE_ENABLE_I18N */ 2868 if (naccepted 2869 || check_node_accept (preg, dfa->nodes + cur_node, mctx, 2870 str_idx)) 2871 { 2872 err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 2873 if (BE (err < 0, 0)) 2874 { 2875 re_node_set_free (&union_set); 2876 return REG_ESPACE; 2877 } 2878 } 2879 } 2880 re_node_set_free (&union_set); 2881 return REG_NOERROR; 2882} 2883 2884/* For all the nodes in CUR_NODES, add the epsilon closures of them to 2885 CUR_NODES, however exclude the nodes which are: 2886 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 2887 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 2888*/ 2889 2890static reg_errcode_t 2891check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, fl_open) 2892 re_dfa_t *dfa; 2893 re_node_set *cur_nodes; 2894 int ex_subexp, fl_open; 2895{ 2896 reg_errcode_t err; 2897 int idx, outside_node; 2898 re_node_set new_nodes; 2899#ifdef DEBUG 2900 assert (cur_nodes->nelem); 2901#endif 2902 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 2903 if (BE (err != REG_NOERROR, 0)) 2904 return err; 2905 /* Create a new node set NEW_NODES with the nodes which are epsilon 2906 closures of the node in CUR_NODES. */ 2907 2908 for (idx = 0; idx < cur_nodes->nelem; ++idx) 2909 { 2910 int cur_node = cur_nodes->elems[idx]; 2911 re_node_set *eclosure = dfa->eclosures + cur_node; 2912 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, fl_open); 2913 if (outside_node == -1) 2914 { 2915 /* There are no problematic nodes, just merge them. */ 2916 err = re_node_set_merge (&new_nodes, eclosure); 2917 if (BE (err != REG_NOERROR, 0)) 2918 { 2919 re_node_set_free (&new_nodes); 2920 return err; 2921 } 2922 } 2923 else 2924 { 2925 /* There are problematic nodes, re-calculate incrementally. */ 2926 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 2927 ex_subexp, fl_open); 2928 if (BE (err != REG_NOERROR, 0)) 2929 { 2930 re_node_set_free (&new_nodes); 2931 return err; 2932 } 2933 } 2934 } 2935 re_node_set_free (cur_nodes); 2936 *cur_nodes = new_nodes; 2937 return REG_NOERROR; 2938} 2939 2940/* Helper function for check_arrival_expand_ecl. 2941 Check incrementally the epsilon closure of TARGET, and if it isn't 2942 problematic append it to DST_NODES. */ 2943 2944static reg_errcode_t 2945check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, fl_open) 2946 re_dfa_t *dfa; 2947 int target, ex_subexp, fl_open; 2948 re_node_set *dst_nodes; 2949{ 2950 int cur_node, type; 2951 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 2952 { 2953 int err; 2954 type = dfa->nodes[cur_node].type; 2955 2956 if (((type == OP_OPEN_SUBEXP && fl_open) 2957 || (type == OP_CLOSE_SUBEXP && !fl_open)) 2958 && dfa->nodes[cur_node].opr.idx == ex_subexp) 2959 { 2960 if (!fl_open) 2961 { 2962 err = re_node_set_insert (dst_nodes, cur_node); 2963 if (BE (err == -1, 0)) 2964 return REG_ESPACE; 2965 } 2966 break; 2967 } 2968 err = re_node_set_insert (dst_nodes, cur_node); 2969 if (BE (err == -1, 0)) 2970 return REG_ESPACE; 2971 if (dfa->edests[cur_node].nelem == 0) 2972 break; 2973 if (dfa->edests[cur_node].nelem == 2) 2974 { 2975 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 2976 dfa->edests[cur_node].elems[1], 2977 ex_subexp, fl_open); 2978 if (BE (err != REG_NOERROR, 0)) 2979 return err; 2980 } 2981 cur_node = dfa->edests[cur_node].elems[0]; 2982 } 2983 return REG_NOERROR; 2984} 2985 2986 2987/* For all the back references in the current state, calculate the 2988 destination of the back references by the appropriate entry 2989 in MCTX->BKREF_ENTS. */ 2990 2991static reg_errcode_t 2992expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num, 2993 fl_open) 2994 const regex_t *preg; 2995 re_match_context_t *mctx; 2996 int cur_str, last_str, subexp_num, fl_open; 2997 re_node_set *cur_nodes; 2998{ 2999 reg_errcode_t err; 3000 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3001 int cache_idx, cache_idx_start; 3002 /* The current state. */ 3003 3004 cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3005 for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx) 3006 { 3007 int to_idx, next_node; 3008 struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx; 3009 if (ent->str_idx > cur_str) 3010 break; 3011 /* Is this entry ENT is appropriate? */ 3012 if (!re_node_set_contains (cur_nodes, ent->node)) 3013 continue; /* No. */ 3014 3015 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3016 /* Calculate the destination of the back reference, and append it 3017 to MCTX->STATE_LOG. */ 3018 if (to_idx == cur_str) 3019 { 3020 /* The backreference did epsilon transit, we must re-check all the 3021 node in the current state. */ 3022 re_node_set new_dests; 3023 reg_errcode_t err2, err3; 3024 next_node = dfa->edests[ent->node].elems[0]; 3025 if (re_node_set_contains (cur_nodes, next_node)) 3026 continue; 3027 err = re_node_set_init_1 (&new_dests, next_node); 3028 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, 3029 fl_open); 3030 err3 = re_node_set_merge (cur_nodes, &new_dests); 3031 re_node_set_free (&new_dests); 3032 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3033 || err3 != REG_NOERROR, 0)) 3034 { 3035 err = (err != REG_NOERROR ? err 3036 : (err2 != REG_NOERROR ? err2 : err3)); 3037 return err; 3038 } 3039 /* TODO: It is still inefficient... */ 3040 cache_idx = cache_idx_start - 1; 3041 continue; 3042 } 3043 else 3044 { 3045 re_node_set union_set; 3046 next_node = dfa->nexts[ent->node]; 3047 if (mctx->state_log[to_idx]) 3048 { 3049 int ret; 3050 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3051 next_node)) 3052 continue; 3053 err = re_node_set_init_copy (&union_set, 3054 &mctx->state_log[to_idx]->nodes); 3055 ret = re_node_set_insert (&union_set, next_node); 3056 if (BE (err != REG_NOERROR || ret < 0, 0)) 3057 { 3058 re_node_set_free (&union_set); 3059 err = err != REG_NOERROR ? err : REG_ESPACE; 3060 return err; 3061 } 3062 } 3063 else 3064 { 3065 err = re_node_set_init_1 (&union_set, next_node); 3066 if (BE (err != REG_NOERROR, 0)) 3067 return err; 3068 } 3069 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3070 re_node_set_free (&union_set); 3071 if (BE (mctx->state_log[to_idx] == NULL 3072 && err != REG_NOERROR, 0)) 3073 return err; 3074 } 3075 } 3076 return REG_NOERROR; 3077} 3078 3079/* Build transition table for the state. 3080 Return the new table if succeeded, otherwise return NULL. */ 3081 3082static re_dfastate_t ** 3083build_trtable (preg, state, fl_search) 3084 const regex_t *preg; 3085 const re_dfastate_t *state; 3086 int fl_search; 3087{ 3088 reg_errcode_t err; 3089 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3090 int i, j, k, ch; 3091 int dests_node_malloced = 0, dest_states_malloced = 0; 3092 int ndests; /* Number of the destination states from `state'. */ 3093 re_dfastate_t **trtable; 3094 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3095 re_node_set follows, *dests_node; 3096 bitset *dests_ch; 3097 bitset acceptable; 3098 3099 /* We build DFA states which corresponds to the destination nodes 3100 from `state'. `dests_node[i]' represents the nodes which i-th 3101 destination state contains, and `dests_ch[i]' represents the 3102 characters which i-th destination state accepts. */ 3103#ifdef _LIBC 3104 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) 3105 dests_node = (re_node_set *) 3106 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3107 else 3108#endif 3109 { 3110 dests_node = (re_node_set *) 3111 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3112 if (BE (dests_node == NULL, 0)) 3113 return NULL; 3114 dests_node_malloced = 1; 3115 } 3116 dests_ch = (bitset *) (dests_node + SBC_MAX); 3117 3118 /* Initialize transiton table. */ 3119 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3120 if (BE (trtable == NULL, 0)) 3121 { 3122 if (dests_node_malloced) 3123 free (dests_node); 3124 return NULL; 3125 } 3126 3127 /* At first, group all nodes belonging to `state' into several 3128 destinations. */ 3129 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch); 3130 if (BE (ndests <= 0, 0)) 3131 { 3132 if (dests_node_malloced) 3133 free (dests_node); 3134 /* Return NULL in case of an error, trtable otherwise. */ 3135 if (ndests == 0) 3136 return trtable; 3137 free (trtable); 3138 return NULL; 3139 } 3140 3141 err = re_node_set_alloc (&follows, ndests + 1); 3142 if (BE (err != REG_NOERROR, 0)) 3143 goto out_free; 3144 3145#ifdef _LIBC 3146 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX 3147 + ndests * 3 * sizeof (re_dfastate_t *))) 3148 dest_states = (re_dfastate_t **) 3149 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3150 else 3151#endif 3152 { 3153 dest_states = (re_dfastate_t **) 3154 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3155 if (BE (dest_states == NULL, 0)) 3156 { 3157out_free: 3158 if (dest_states_malloced) 3159 free (dest_states); 3160 re_node_set_free (&follows); 3161 for (i = 0; i < ndests; ++i) 3162 re_node_set_free (dests_node + i); 3163 free (trtable); 3164 if (dests_node_malloced) 3165 free (dests_node); 3166 return NULL; 3167 } 3168 dest_states_malloced = 1; 3169 } 3170 dest_states_word = dest_states + ndests; 3171 dest_states_nl = dest_states_word + ndests; 3172 bitset_empty (acceptable); 3173 3174 /* Then build the states for all destinations. */ 3175 for (i = 0; i < ndests; ++i) 3176 { 3177 int next_node; 3178 re_node_set_empty (&follows); 3179 /* Merge the follows of this destination states. */ 3180 for (j = 0; j < dests_node[i].nelem; ++j) 3181 { 3182 next_node = dfa->nexts[dests_node[i].elems[j]]; 3183 if (next_node != -1) 3184 { 3185 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3186 if (BE (err != REG_NOERROR, 0)) 3187 goto out_free; 3188 } 3189 } 3190 /* If search flag is set, merge the initial state. */ 3191 if (fl_search) 3192 { 3193#ifdef RE_ENABLE_I18N 3194 int not_initial = 0; 3195 for (j = 0; j < follows.nelem; ++j) 3196 if (dfa->nodes[follows.elems[j]].type == CHARACTER) 3197 { 3198 not_initial = dfa->nodes[follows.elems[j]].mb_partial; 3199 break; 3200 } 3201 if (!not_initial) 3202#endif 3203 { 3204 err = re_node_set_merge (&follows, 3205 dfa->init_state->entrance_nodes); 3206 if (BE (err != REG_NOERROR, 0)) 3207 goto out_free; 3208 } 3209 } 3210 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3211 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3212 goto out_free; 3213 /* If the new state has context constraint, 3214 build appropriate states for these contexts. */ 3215 if (dest_states[i]->has_constraint) 3216 { 3217 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3218 CONTEXT_WORD); 3219 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3220 goto out_free; 3221 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3222 CONTEXT_NEWLINE); 3223 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3224 goto out_free; 3225 } 3226 else 3227 { 3228 dest_states_word[i] = dest_states[i]; 3229 dest_states_nl[i] = dest_states[i]; 3230 } 3231 bitset_merge (acceptable, dests_ch[i]); 3232 } 3233 3234 /* Update the transition table. */ 3235 /* For all characters ch...: */ 3236 for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 3237 for (j = 0; j < UINT_BITS; ++j, ++ch) 3238 if ((acceptable[i] >> j) & 1) 3239 { 3240 /* The current state accepts the character ch. */ 3241 if (IS_WORD_CHAR (ch)) 3242 { 3243 for (k = 0; k < ndests; ++k) 3244 if ((dests_ch[k][i] >> j) & 1) 3245 { 3246 /* k-th destination accepts the word character ch. */ 3247 trtable[ch] = dest_states_word[k]; 3248 /* There must be only one destination which accepts 3249 character ch. See group_nodes_into_DFAstates. */ 3250 break; 3251 } 3252 } 3253 else /* not WORD_CHAR */ 3254 { 3255 for (k = 0; k < ndests; ++k) 3256 if ((dests_ch[k][i] >> j) & 1) 3257 { 3258 /* k-th destination accepts the non-word character ch. */ 3259 trtable[ch] = dest_states[k]; 3260 /* There must be only one destination which accepts 3261 character ch. See group_nodes_into_DFAstates. */ 3262 break; 3263 } 3264 } 3265 } 3266 /* new line */ 3267 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3268 { 3269 /* The current state accepts newline character. */ 3270 for (k = 0; k < ndests; ++k) 3271 if (bitset_contain (dests_ch[k], NEWLINE_CHAR)) 3272 { 3273 /* k-th destination accepts newline character. */ 3274 trtable[NEWLINE_CHAR] = dest_states_nl[k]; 3275 /* There must be only one destination which accepts 3276 newline. See group_nodes_into_DFAstates. */ 3277 break; 3278 } 3279 } 3280 3281 if (dest_states_malloced) 3282 free (dest_states); 3283 3284 re_node_set_free (&follows); 3285 for (i = 0; i < ndests; ++i) 3286 re_node_set_free (dests_node + i); 3287 3288 if (dests_node_malloced) 3289 free (dests_node); 3290 3291 return trtable; 3292} 3293 3294/* Group all nodes belonging to STATE into several destinations. 3295 Then for all destinations, set the nodes belonging to the destination 3296 to DESTS_NODE[i] and set the characters accepted by the destination 3297 to DEST_CH[i]. This function return the number of destinations. */ 3298 3299static int 3300group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) 3301 const regex_t *preg; 3302 const re_dfastate_t *state; 3303 re_node_set *dests_node; 3304 bitset *dests_ch; 3305{ 3306 reg_errcode_t err; 3307 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3308 int i, j, k; 3309 int ndests; /* Number of the destinations from `state'. */ 3310 bitset accepts; /* Characters a node can accept. */ 3311 const re_node_set *cur_nodes = &state->nodes; 3312 bitset_empty (accepts); 3313 ndests = 0; 3314 3315 /* For all the nodes belonging to `state', */ 3316 for (i = 0; i < cur_nodes->nelem; ++i) 3317 { 3318 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3319 re_token_type_t type = node->type; 3320 unsigned int constraint = node->constraint; 3321 3322 /* Enumerate all single byte character this node can accept. */ 3323 if (type == CHARACTER) 3324 bitset_set (accepts, node->opr.c); 3325 else if (type == SIMPLE_BRACKET) 3326 { 3327 bitset_merge (accepts, node->opr.sbcset); 3328 } 3329 else if (type == OP_PERIOD) 3330 { 3331 bitset_set_all (accepts); 3332 if (!(preg->syntax & RE_DOT_NEWLINE)) 3333 bitset_clear (accepts, '\n'); 3334 if (preg->syntax & RE_DOT_NOT_NULL) 3335 bitset_clear (accepts, '\0'); 3336 } 3337 else 3338 continue; 3339 3340 /* Check the `accepts' and sift the characters which are not 3341 match it the context. */ 3342 if (constraint) 3343 { 3344 if (constraint & NEXT_WORD_CONSTRAINT) 3345 for (j = 0; j < BITSET_UINTS; ++j) 3346 accepts[j] &= dfa->word_char[j]; 3347 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3348 for (j = 0; j < BITSET_UINTS; ++j) 3349 accepts[j] &= ~dfa->word_char[j]; 3350 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3351 { 3352 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3353 bitset_empty (accepts); 3354 if (accepts_newline) 3355 bitset_set (accepts, NEWLINE_CHAR); 3356 else 3357 continue; 3358 } 3359 } 3360 3361 /* Then divide `accepts' into DFA states, or create a new 3362 state. */ 3363 for (j = 0; j < ndests; ++j) 3364 { 3365 bitset intersec; /* Intersection sets, see below. */ 3366 bitset remains; 3367 /* Flags, see below. */ 3368 int has_intersec, not_subset, not_consumed; 3369 3370 /* Optimization, skip if this state doesn't accept the character. */ 3371 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3372 continue; 3373 3374 /* Enumerate the intersection set of this state and `accepts'. */ 3375 has_intersec = 0; 3376 for (k = 0; k < BITSET_UINTS; ++k) 3377 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3378 /* And skip if the intersection set is empty. */ 3379 if (!has_intersec) 3380 continue; 3381 3382 /* Then check if this state is a subset of `accepts'. */ 3383 not_subset = not_consumed = 0; 3384 for (k = 0; k < BITSET_UINTS; ++k) 3385 { 3386 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3387 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3388 } 3389 3390 /* If this state isn't a subset of `accepts', create a 3391 new group state, which has the `remains'. */ 3392 if (not_subset) 3393 { 3394 bitset_copy (dests_ch[ndests], remains); 3395 bitset_copy (dests_ch[j], intersec); 3396 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3397 if (BE (err != REG_NOERROR, 0)) 3398 goto error_return; 3399 ++ndests; 3400 } 3401 3402 /* Put the position in the current group. */ 3403 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3404 if (BE (err < 0, 0)) 3405 goto error_return; 3406 3407 /* If all characters are consumed, go to next node. */ 3408 if (!not_consumed) 3409 break; 3410 } 3411 /* Some characters remain, create a new group. */ 3412 if (j == ndests) 3413 { 3414 bitset_copy (dests_ch[ndests], accepts); 3415 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3416 if (BE (err != REG_NOERROR, 0)) 3417 goto error_return; 3418 ++ndests; 3419 bitset_empty (accepts); 3420 } 3421 } 3422 return ndests; 3423 error_return: 3424 for (j = 0; j < ndests; ++j) 3425 re_node_set_free (dests_node + j); 3426 return -1; 3427} 3428 3429#ifdef RE_ENABLE_I18N 3430/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3431 Return the number of the bytes the node accepts. 3432 STR_IDX is the current index of the input string. 3433 3434 This function handles the nodes which can accept one character, or 3435 one collating element like '.', '[a-z]', opposite to the other nodes 3436 can only accept one byte. */ 3437 3438static int 3439check_node_accept_bytes (preg, node_idx, input, str_idx) 3440 const regex_t *preg; 3441 int node_idx, str_idx; 3442 const re_string_t *input; 3443{ 3444 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3445 const re_token_t *node = dfa->nodes + node_idx; 3446 int elem_len = re_string_elem_size_at (input, str_idx); 3447 int char_len = re_string_char_size_at (input, str_idx); 3448 int i; 3449# ifdef _LIBC 3450 int j; 3451 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3452# endif /* _LIBC */ 3453 if (elem_len <= 1 && char_len <= 1) 3454 return 0; 3455 if (node->type == OP_PERIOD) 3456 { 3457 /* '.' accepts any one character except the following two cases. */ 3458 if ((!(preg->syntax & RE_DOT_NEWLINE) && 3459 re_string_byte_at (input, str_idx) == '\n') || 3460 ((preg->syntax & RE_DOT_NOT_NULL) && 3461 re_string_byte_at (input, str_idx) == '\0')) 3462 return 0; 3463 return char_len; 3464 } 3465 else if (node->type == COMPLEX_BRACKET) 3466 { 3467 const re_charset_t *cset = node->opr.mbcset; 3468# ifdef _LIBC 3469 const unsigned char *pin = re_string_get_buffer (input) + str_idx; 3470# endif /* _LIBC */ 3471 int match_len = 0; 3472 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3473 ? re_string_wchar_at (input, str_idx) : 0); 3474 3475 /* match with multibyte character? */ 3476 for (i = 0; i < cset->nmbchars; ++i) 3477 if (wc == cset->mbchars[i]) 3478 { 3479 match_len = char_len; 3480 goto check_node_accept_bytes_match; 3481 } 3482 /* match with character_class? */ 3483 for (i = 0; i < cset->nchar_classes; ++i) 3484 { 3485 wctype_t wt = cset->char_classes[i]; 3486 if (__iswctype (wc, wt)) 3487 { 3488 match_len = char_len; 3489 goto check_node_accept_bytes_match; 3490 } 3491 } 3492 3493# ifdef _LIBC 3494 if (nrules != 0) 3495 { 3496 unsigned int in_collseq = 0; 3497 const int32_t *table, *indirect; 3498 const unsigned char *weights, *extra; 3499 const char *collseqwc; 3500 int32_t idx; 3501 /* This #include defines a local function! */ 3502# include <locale/weight.h> 3503 3504 /* match with collating_symbol? */ 3505 if (cset->ncoll_syms) 3506 extra = (const unsigned char *) 3507 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3508 for (i = 0; i < cset->ncoll_syms; ++i) 3509 { 3510 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3511 /* Compare the length of input collating element and 3512 the length of current collating element. */ 3513 if (*coll_sym != elem_len) 3514 continue; 3515 /* Compare each bytes. */ 3516 for (j = 0; j < *coll_sym; j++) 3517 if (pin[j] != coll_sym[1 + j]) 3518 break; 3519 if (j == *coll_sym) 3520 { 3521 /* Match if every bytes is equal. */ 3522 match_len = j; 3523 goto check_node_accept_bytes_match; 3524 } 3525 } 3526 3527 if (cset->nranges) 3528 { 3529 if (elem_len <= char_len) 3530 { 3531 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3532 in_collseq = collseq_table_lookup (collseqwc, wc); 3533 } 3534 else 3535 in_collseq = find_collation_sequence_value (pin, elem_len); 3536 } 3537 /* match with range expression? */ 3538 for (i = 0; i < cset->nranges; ++i) 3539 if (cset->range_starts[i] <= in_collseq 3540 && in_collseq <= cset->range_ends[i]) 3541 { 3542 match_len = elem_len; 3543 goto check_node_accept_bytes_match; 3544 } 3545 3546 /* match with equivalence_class? */ 3547 if (cset->nequiv_classes) 3548 { 3549 const unsigned char *cp = pin; 3550 table = (const int32_t *) 3551 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3552 weights = (const unsigned char *) 3553 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3554 extra = (const unsigned char *) 3555 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3556 indirect = (const int32_t *) 3557 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3558 idx = findidx (&cp); 3559 if (idx > 0) 3560 for (i = 0; i < cset->nequiv_classes; ++i) 3561 { 3562 int32_t equiv_class_idx = cset->equiv_classes[i]; 3563 size_t weight_len = weights[idx]; 3564 if (weight_len == weights[equiv_class_idx]) 3565 { 3566 int cnt = 0; 3567 while (cnt <= weight_len 3568 && (weights[equiv_class_idx + 1 + cnt] 3569 == weights[idx + 1 + cnt])) 3570 ++cnt; 3571 if (cnt > weight_len) 3572 { 3573 match_len = elem_len; 3574 goto check_node_accept_bytes_match; 3575 } 3576 } 3577 } 3578 } 3579 } 3580 else 3581# endif /* _LIBC */ 3582 { 3583 /* match with range expression? */ 3584#if __GNUC__ >= 2 3585 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3586#else 3587 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3588 cmp_buf[2] = wc; 3589#endif 3590 for (i = 0; i < cset->nranges; ++i) 3591 { 3592 cmp_buf[0] = cset->range_starts[i]; 3593 cmp_buf[4] = cset->range_ends[i]; 3594 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3595 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3596 { 3597 match_len = char_len; 3598 goto check_node_accept_bytes_match; 3599 } 3600 } 3601 } 3602 check_node_accept_bytes_match: 3603 if (!cset->non_match) 3604 return match_len; 3605 else 3606 { 3607 if (match_len > 0) 3608 return 0; 3609 else 3610 return (elem_len > char_len) ? elem_len : char_len; 3611 } 3612 } 3613 return 0; 3614} 3615 3616# ifdef _LIBC 3617static unsigned int 3618find_collation_sequence_value (mbs, mbs_len) 3619 const unsigned char *mbs; 3620 size_t mbs_len; 3621{ 3622 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3623 if (nrules == 0) 3624 { 3625 if (mbs_len == 1) 3626 { 3627 /* No valid character. Match it as a single byte character. */ 3628 const unsigned char *collseq = (const unsigned char *) 3629 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3630 return collseq[mbs[0]]; 3631 } 3632 return UINT_MAX; 3633 } 3634 else 3635 { 3636 int32_t idx; 3637 const unsigned char *extra = (const unsigned char *) 3638 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3639 3640 for (idx = 0; ;) 3641 { 3642 int mbs_cnt, found = 0; 3643 int32_t elem_mbs_len; 3644 /* Skip the name of collating element name. */ 3645 idx = idx + extra[idx] + 1; 3646 elem_mbs_len = extra[idx++]; 3647 if (mbs_len == elem_mbs_len) 3648 { 3649 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 3650 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 3651 break; 3652 if (mbs_cnt == elem_mbs_len) 3653 /* Found the entry. */ 3654 found = 1; 3655 } 3656 /* Skip the byte sequence of the collating element. */ 3657 idx += elem_mbs_len; 3658 /* Adjust for the alignment. */ 3659 idx = (idx + 3) & ~3; 3660 /* Skip the collation sequence value. */ 3661 idx += sizeof (uint32_t); 3662 /* Skip the wide char sequence of the collating element. */ 3663 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 3664 /* If we found the entry, return the sequence value. */ 3665 if (found) 3666 return *(uint32_t *) (extra + idx); 3667 /* Skip the collation sequence value. */ 3668 idx += sizeof (uint32_t); 3669 } 3670 } 3671} 3672# endif /* _LIBC */ 3673#endif /* RE_ENABLE_I18N */ 3674 3675/* Check whether the node accepts the byte which is IDX-th 3676 byte of the INPUT. */ 3677 3678static int 3679check_node_accept (preg, node, mctx, idx) 3680 const regex_t *preg; 3681 const re_token_t *node; 3682 const re_match_context_t *mctx; 3683 int idx; 3684{ 3685 unsigned char ch; 3686 if (node->constraint) 3687 { 3688 /* The node has constraints. Check whether the current context 3689 satisfies the constraints. */ 3690 unsigned int context = re_string_context_at (mctx->input, idx, 3691 mctx->eflags, 3692 preg->newline_anchor); 3693 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 3694 return 0; 3695 } 3696 ch = re_string_byte_at (mctx->input, idx); 3697 if (node->type == CHARACTER) 3698 return node->opr.c == ch; 3699 else if (node->type == SIMPLE_BRACKET) 3700 return bitset_contain (node->opr.sbcset, ch); 3701 else if (node->type == OP_PERIOD) 3702 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE)) 3703 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL))); 3704 else 3705 return 0; 3706} 3707 3708/* Extend the buffers, if the buffers have run out. */ 3709 3710static reg_errcode_t 3711extend_buffers (mctx) 3712 re_match_context_t *mctx; 3713{ 3714 reg_errcode_t ret; 3715 re_string_t *pstr = mctx->input; 3716 3717 /* Double the lengthes of the buffers. */ 3718 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 3719 if (BE (ret != REG_NOERROR, 0)) 3720 return ret; 3721 3722 if (mctx->state_log != NULL) 3723 { 3724 /* And double the length of state_log. */ 3725 re_dfastate_t **new_array; 3726 new_array = re_realloc (mctx->state_log, re_dfastate_t *, 3727 pstr->bufs_len * 2); 3728 if (BE (new_array == NULL, 0)) 3729 return REG_ESPACE; 3730 mctx->state_log = new_array; 3731 } 3732 3733 /* Then reconstruct the buffers. */ 3734 if (pstr->icase) 3735 { 3736#ifdef RE_ENABLE_I18N 3737 if (re_mb_cur_max > 1) 3738 build_wcs_upper_buffer (pstr); 3739 else 3740#endif /* RE_ENABLE_I18N */ 3741 build_upper_buffer (pstr); 3742 } 3743 else 3744 { 3745#ifdef RE_ENABLE_I18N 3746 if (re_mb_cur_max > 1) 3747 build_wcs_buffer (pstr); 3748 else 3749#endif /* RE_ENABLE_I18N */ 3750 { 3751 if (pstr->trans != NULL) 3752 re_string_translate_buffer (pstr); 3753 else 3754 pstr->valid_len = pstr->bufs_len; 3755 } 3756 } 3757 return REG_NOERROR; 3758} 3759 3760 3761/* Functions for matching context. */ 3762 3763/* Initialize MCTX. */ 3764 3765static reg_errcode_t 3766match_ctx_init (mctx, eflags, input, n) 3767 re_match_context_t *mctx; 3768 int eflags, n; 3769 re_string_t *input; 3770{ 3771 mctx->eflags = eflags; 3772 mctx->input = input; 3773 mctx->match_last = -1; 3774 if (n > 0) 3775 { 3776 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 3777 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 3778 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 3779 return REG_ESPACE; 3780 } 3781 else 3782 mctx->bkref_ents = NULL; 3783 mctx->nbkref_ents = 0; 3784 mctx->abkref_ents = n; 3785 mctx->max_mb_elem_len = 1; 3786 mctx->nsub_tops = 0; 3787 mctx->asub_tops = n; 3788 return REG_NOERROR; 3789} 3790 3791/* Clean the entries which depend on the current input in MCTX. 3792 This function must be invoked when the matcher changes the start index 3793 of the input, or changes the input string. */ 3794 3795static void 3796match_ctx_clean (mctx) 3797 re_match_context_t *mctx; 3798{ 3799 match_ctx_free_subtops (mctx); 3800 mctx->nsub_tops = 0; 3801 mctx->nbkref_ents = 0; 3802} 3803 3804/* Free all the memory associated with MCTX. */ 3805 3806static void 3807match_ctx_free (mctx) 3808 re_match_context_t *mctx; 3809{ 3810 match_ctx_free_subtops (mctx); 3811 re_free (mctx->sub_tops); 3812 re_free (mctx->bkref_ents); 3813} 3814 3815/* Free all the memory associated with MCTX->SUB_TOPS. */ 3816 3817static void 3818match_ctx_free_subtops (mctx) 3819 re_match_context_t *mctx; 3820{ 3821 int st_idx; 3822 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 3823 { 3824 int sl_idx; 3825 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 3826 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 3827 { 3828 re_sub_match_last_t *last = top->lasts[sl_idx]; 3829 re_free (last->path.array); 3830 re_free (last); 3831 } 3832 re_free (top->lasts); 3833 if (top->path) 3834 { 3835 re_free (top->path->array); 3836 re_free (top->path); 3837 } 3838 free (top); 3839 } 3840} 3841 3842/* Add a new backreference entry to MCTX. 3843 Note that we assume that caller never call this function with duplicate 3844 entry, and call with STR_IDX which isn't smaller than any existing entry. 3845*/ 3846 3847static reg_errcode_t 3848match_ctx_add_entry (mctx, node, str_idx, from, to) 3849 re_match_context_t *mctx; 3850 int node, str_idx, from, to; 3851{ 3852 if (mctx->nbkref_ents >= mctx->abkref_ents) 3853 { 3854 struct re_backref_cache_entry* new_entry; 3855 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 3856 mctx->abkref_ents * 2); 3857 if (BE (new_entry == NULL, 0)) 3858 { 3859 re_free (mctx->bkref_ents); 3860 return REG_ESPACE; 3861 } 3862 mctx->bkref_ents = new_entry; 3863 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 3864 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 3865 mctx->abkref_ents *= 2; 3866 } 3867 mctx->bkref_ents[mctx->nbkref_ents].node = node; 3868 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 3869 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 3870 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 3871 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0; 3872 if (mctx->max_mb_elem_len < to - from) 3873 mctx->max_mb_elem_len = to - from; 3874 return REG_NOERROR; 3875} 3876 3877/* Search for the first entry which has the same str_idx. 3878 Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 3879 3880static int 3881search_cur_bkref_entry (mctx, str_idx) 3882 re_match_context_t *mctx; 3883 int str_idx; 3884{ 3885 int left, right, mid; 3886 right = mctx->nbkref_ents; 3887 for (left = 0; left < right;) 3888 { 3889 mid = (left + right) / 2; 3890 if (mctx->bkref_ents[mid].str_idx < str_idx) 3891 left = mid + 1; 3892 else 3893 right = mid; 3894 } 3895 return left; 3896} 3897 3898static void 3899match_ctx_clear_flag (mctx) 3900 re_match_context_t *mctx; 3901{ 3902 int i; 3903 for (i = 0; i < mctx->nbkref_ents; ++i) 3904 { 3905 mctx->bkref_ents[i].flag = 0; 3906 } 3907} 3908 3909/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 3910 at STR_IDX. */ 3911 3912static reg_errcode_t 3913match_ctx_add_subtop (mctx, node, str_idx) 3914 re_match_context_t *mctx; 3915 int node, str_idx; 3916{ 3917#ifdef DEBUG 3918 assert (mctx->sub_tops != NULL); 3919 assert (mctx->asub_tops > 0); 3920#endif 3921 if (mctx->nsub_tops == mctx->asub_tops) 3922 { 3923 re_sub_match_top_t **new_array; 3924 mctx->asub_tops *= 2; 3925 new_array = re_realloc (mctx->sub_tops, re_sub_match_top_t *, 3926 mctx->asub_tops); 3927 if (BE (new_array == NULL, 0)) 3928 return REG_ESPACE; 3929 mctx->sub_tops = new_array; 3930 } 3931 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 3932 if (mctx->sub_tops[mctx->nsub_tops] == NULL) 3933 return REG_ESPACE; 3934 mctx->sub_tops[mctx->nsub_tops]->node = node; 3935 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 3936 return REG_NOERROR; 3937} 3938 3939/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 3940 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 3941 3942static re_sub_match_last_t * 3943match_ctx_add_sublast (subtop, node, str_idx) 3944 re_sub_match_top_t *subtop; 3945 int node, str_idx; 3946{ 3947 re_sub_match_last_t *new_entry; 3948 if (subtop->nlasts == subtop->alasts) 3949 { 3950 re_sub_match_last_t **new_array; 3951 subtop->alasts = 2 * subtop->alasts + 1; 3952 new_array = re_realloc (subtop->lasts, re_sub_match_last_t *, 3953 subtop->alasts); 3954 if (BE (new_array == NULL, 0)) 3955 return NULL; 3956 subtop->lasts = new_array; 3957 } 3958 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 3959 if (BE (new_entry == NULL, 0)) 3960 return NULL; 3961 subtop->lasts[subtop->nlasts] = new_entry; 3962 new_entry->node = node; 3963 new_entry->str_idx = str_idx; 3964 ++subtop->nlasts; 3965 return new_entry; 3966} 3967 3968static void 3969sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx, 3970 check_subexp) 3971 re_sift_context_t *sctx; 3972 re_dfastate_t **sifted_sts, **limited_sts; 3973 int last_node, last_str_idx, check_subexp; 3974{ 3975 sctx->sifted_states = sifted_sts; 3976 sctx->limited_states = limited_sts; 3977 sctx->last_node = last_node; 3978 sctx->last_str_idx = last_str_idx; 3979 sctx->check_subexp = check_subexp; 3980 sctx->cur_bkref = -1; 3981 sctx->cls_subexp_idx = -1; 3982 re_node_set_init_empty (&sctx->limits); 3983} 3984