1/* 2 tre-match-backtrack.c - TRE backtracking regex matching engine 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7*/ 8 9/* 10 This matcher is for regexps that use back referencing. Regexp matching 11 with back referencing is an NP-complete problem on the number of back 12 references. The easiest way to match them is to use a backtracking 13 routine which basically goes through all possible paths in the TNFA 14 and chooses the one which results in the best (leftmost and longest) 15 match. This can be spectacularly expensive and may run out of stack 16 space, but there really is no better known generic algorithm. Quoting 17 Henry Spencer from comp.compilers: 18 <URL: http://compilers.iecc.com/comparch/article/93-03-102> 19 20 POSIX.2 REs require longest match, which is really exciting to 21 implement since the obsolete ("basic") variant also includes 22 \<digit>. I haven't found a better way of tackling this than doing 23 a preliminary match using a DFA (or simulation) on a modified RE 24 that just replicates subREs for \<digit>, and then doing a 25 backtracking match to determine whether the subRE matches were 26 right. This can be rather slow, but I console myself with the 27 thought that people who use \<digit> deserve very slow execution. 28 (Pun unintentional but very appropriate.) 29 30*/ 31 32 33#ifdef HAVE_CONFIG_H 34#include <config.h> 35#endif /* HAVE_CONFIG_H */ 36 37#include "tre-alloca.h" 38 39#include <assert.h> 40#include <stdlib.h> 41#include <string.h> 42#ifdef HAVE_WCHAR_H 43#include <wchar.h> 44#endif /* HAVE_WCHAR_H */ 45#ifdef HAVE_WCTYPE_H 46#include <wctype.h> 47#endif /* HAVE_WCTYPE_H */ 48#ifndef TRE_WCHAR 49#include <ctype.h> 50#endif /* !TRE_WCHAR */ 51#ifdef HAVE_MALLOC_H 52#include <malloc.h> 53#endif /* HAVE_MALLOC_H */ 54 55#include "tre-internal.h" 56#include "tre-mem.h" 57#include "tre-match-utils.h" 58#include "tre.h" 59#include "xmalloc.h" 60 61typedef struct { 62 int pos; 63 const char *str_byte; 64#ifdef TRE_WCHAR 65 const wchar_t *str_wide; 66#endif /* TRE_WCHAR */ 67 tre_tnfa_transition_t *state; 68 int state_id; 69 int next_c; 70 int *tags; 71#ifdef TRE_MBSTATE 72 mbstate_t mbstate; 73#endif /* TRE_MBSTATE */ 74} tre_backtrack_item_t; 75 76typedef struct tre_backtrack_struct { 77 tre_backtrack_item_t item; 78 struct tre_backtrack_struct *prev; 79 struct tre_backtrack_struct *next; 80} *tre_backtrack_t; 81 82#ifdef TRE_WCHAR 83#define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide) 84#define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide 85#else /* !TRE_WCHAR */ 86#define BT_STACK_WIDE_IN(_str_wide) 87#define BT_STACK_WIDE_OUT 88#endif /* !TRE_WCHAR */ 89 90#ifdef TRE_MBSTATE 91#define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) 92#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate 93#else /* !TRE_MBSTATE */ 94#define BT_STACK_MBSTATE_IN 95#define BT_STACK_MBSTATE_OUT 96#endif /* !TRE_MBSTATE */ 97 98 99#ifdef TRE_USE_ALLOCA 100#define tre_bt_mem_new tre_mem_newa 101#define tre_bt_mem_alloc tre_mem_alloca 102#define tre_bt_mem_destroy(obj) do { } while (/*CONSTCOND*/(void)0,0) 103#else /* !TRE_USE_ALLOCA */ 104#define tre_bt_mem_new tre_mem_new 105#define tre_bt_mem_alloc tre_mem_alloc 106#define tre_bt_mem_destroy tre_mem_destroy 107#endif /* !TRE_USE_ALLOCA */ 108 109 110#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ 111 do \ 112 { \ 113 size_t i; \ 114 if (!stack->next) \ 115 { \ 116 tre_backtrack_t s; \ 117 s = tre_bt_mem_alloc(mem, sizeof(*s)); \ 118 if (!s) \ 119 { \ 120 tre_bt_mem_destroy(mem); \ 121 if (tags) \ 122 xfree(tags); \ 123 if (pmatch) \ 124 xfree(pmatch); \ 125 if (states_seen) \ 126 xfree(states_seen); \ 127 return REG_ESPACE; \ 128 } \ 129 s->prev = stack; \ 130 s->next = NULL; \ 131 s->item.tags = tre_bt_mem_alloc(mem, \ 132 sizeof(*tags) * tnfa->num_tags); \ 133 if (!s->item.tags) \ 134 { \ 135 tre_bt_mem_destroy(mem); \ 136 if (tags) \ 137 xfree(tags); \ 138 if (pmatch) \ 139 xfree(pmatch); \ 140 if (states_seen) \ 141 xfree(states_seen); \ 142 return REG_ESPACE; \ 143 } \ 144 stack->next = s; \ 145 stack = s; \ 146 } \ 147 else \ 148 stack = stack->next; \ 149 stack->item.pos = (_pos); \ 150 stack->item.str_byte = (_str_byte); \ 151 BT_STACK_WIDE_IN(_str_wide); \ 152 stack->item.state = (_state); \ 153 stack->item.state_id = (_state_id); \ 154 stack->item.next_c = (_next_c); \ 155 for (i = 0; i < tnfa->num_tags; i++) \ 156 stack->item.tags[i] = (_tags)[i]; \ 157 BT_STACK_MBSTATE_IN; \ 158 } \ 159 while (/*CONSTCOND*/(void)0,0) 160 161#define BT_STACK_POP() \ 162 do \ 163 { \ 164 size_t i; \ 165 assert(stack->prev); \ 166 pos = stack->item.pos; \ 167 if (type == STR_USER) \ 168 str_source->rewind(pos + pos_add_next, str_source->context); \ 169 str_byte = stack->item.str_byte; \ 170 BT_STACK_WIDE_OUT; \ 171 state = stack->item.state; \ 172 next_c = (tre_char_t) stack->item.next_c; \ 173 for (i = 0; i < tnfa->num_tags; i++) \ 174 tags[i] = stack->item.tags[i]; \ 175 BT_STACK_MBSTATE_OUT; \ 176 stack = stack->prev; \ 177 } \ 178 while (/*CONSTCOND*/(void)0,0) 179 180#undef MIN 181#define MIN(a, b) ((a) <= (b) ? (a) : (b)) 182 183reg_errcode_t 184tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, 185 int len, tre_str_type_t type, int *match_tags, 186 int eflags, int *match_end_ofs) 187{ 188 /* State variables required by GET_NEXT_WCHAR. */ 189 tre_char_t prev_c = 0, next_c = 0; 190 const char *str_byte = string; 191 int pos = 0; 192 unsigned int pos_add_next = 1; 193#ifdef TRE_WCHAR 194 const wchar_t *str_wide = string; 195#ifdef TRE_MBSTATE 196 mbstate_t mbstate; 197#endif /* TRE_MBSTATE */ 198#endif /* TRE_WCHAR */ 199 int reg_notbol = eflags & REG_NOTBOL; 200 int reg_noteol = eflags & REG_NOTEOL; 201 int reg_newline = tnfa->cflags & REG_NEWLINE; 202 size_t str_user_end = 0; 203 204 /* These are used to remember the necessary values of the above 205 variables to return to the position where the current search 206 started from. */ 207 int next_c_start; 208 const char *str_byte_start; 209 int pos_start = -1; 210#ifdef TRE_WCHAR 211 const wchar_t *str_wide_start; 212#endif /* TRE_WCHAR */ 213#ifdef TRE_MBSTATE 214 mbstate_t mbstate_start; 215#endif /* TRE_MBSTATE */ 216 217 /* End offset of best match so far, or -1 if no match found yet. */ 218 int match_eo = -1; 219 /* Tag arrays. */ 220 int *next_tags, *tags = NULL; 221 /* Current TNFA state. */ 222 tre_tnfa_transition_t *state; 223 int *states_seen = NULL; 224 225 /* Memory allocator to for allocating the backtracking stack. */ 226 tre_mem_t mem = tre_bt_mem_new(); 227 228 /* The backtracking stack. */ 229 tre_backtrack_t stack; 230 231 tre_tnfa_transition_t *trans_i; 232 regmatch_t *pmatch = NULL; 233 reg_errcode_t ret; 234 235#ifdef TRE_MBSTATE 236 memset(&mbstate, '\0', sizeof(mbstate)); 237#endif /* TRE_MBSTATE */ 238 239 if (!mem) 240 return REG_ESPACE; 241 stack = tre_bt_mem_alloc(mem, sizeof(*stack)); 242 if (!stack) 243 { 244 ret = REG_ESPACE; 245 goto error_exit; 246 } 247 stack->prev = NULL; 248 stack->next = NULL; 249 250 DPRINT(("tnfa_execute_backtrack, input type %d\n", type)); 251 DPRINT(("len = %d\n", len)); 252 253#ifdef TRE_USE_ALLOCA 254 tags = alloca(sizeof(*tags) * tnfa->num_tags); 255 pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches); 256 states_seen = alloca(sizeof(*states_seen) * tnfa->num_states); 257#else /* !TRE_USE_ALLOCA */ 258 if (tnfa->num_tags) 259 { 260 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); 261 if (!tags) 262 { 263 ret = REG_ESPACE; 264 goto error_exit; 265 } 266 } 267 if (tnfa->num_submatches) 268 { 269 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); 270 if (!pmatch) 271 { 272 ret = REG_ESPACE; 273 goto error_exit; 274 } 275 } 276 if (tnfa->num_states) 277 { 278 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); 279 if (!states_seen) 280 { 281 ret = REG_ESPACE; 282 goto error_exit; 283 } 284 } 285#endif /* !TRE_USE_ALLOCA */ 286 287 retry: 288 { 289 size_t i; 290 for (i = 0; i < tnfa->num_tags; i++) 291 { 292 tags[i] = -1; 293 if (match_tags) 294 match_tags[i] = -1; 295 } 296 for (i = 0; i < tnfa->num_states; i++) 297 states_seen[i] = 0; 298 } 299 300 state = NULL; 301 pos = pos_start; 302 if (type == STR_USER) 303 str_source->rewind(pos + pos_add_next, str_source->context); 304 GET_NEXT_WCHAR(); 305 pos_start = pos; 306 next_c_start = next_c; 307 str_byte_start = str_byte; 308#ifdef TRE_WCHAR 309 str_wide_start = str_wide; 310#endif /* TRE_WCHAR */ 311#ifdef TRE_MBSTATE 312 mbstate_start = mbstate; 313#endif /* TRE_MBSTATE */ 314 315 /* Handle initial states. */ 316 next_tags = NULL; 317 for (trans_i = tnfa->initial; trans_i->state; trans_i++) 318 { 319 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); 320 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) 321 { 322 DPRINT(("assert failed\n")); 323 continue; 324 } 325 if (state == NULL) 326 { 327 /* Start from this state. */ 328 state = trans_i->state; 329 next_tags = trans_i->tags; 330 } 331 else 332 { 333 /* Backtrack to this state. */ 334 DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); 335 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, 336 trans_i->state_id, next_c, tags, mbstate); 337 { 338 int *tmp = trans_i->tags; 339 if (tmp) 340 while (*tmp >= 0) 341 stack->item.tags[*tmp++] = pos; 342 } 343 } 344 } 345 346 if (next_tags) 347 for (; *next_tags >= 0; next_tags++) 348 tags[*next_tags] = pos; 349 350 351 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); 352 DPRINT(("pos:chr/code | state and tags\n")); 353 DPRINT(("-------------+------------------------------------------------\n")); 354 355 if (state == NULL) 356 goto backtrack; 357 358 while (/*CONSTCOND*/(void)1,1) 359 { 360 tre_tnfa_transition_t *next_state; 361 int empty_br_match; 362 363 DPRINT(("start loop\n")); 364 if (state == tnfa->final) 365 { 366 DPRINT((" match found, %d %d\n", match_eo, pos)); 367 if (match_eo < pos 368 || (match_eo == pos 369 && match_tags 370 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, 371 tags, match_tags))) 372 { 373 size_t i; 374 /* This match wins the previous match. */ 375 DPRINT((" win previous\n")); 376 match_eo = pos; 377 if (match_tags) 378 for (i = 0; i < tnfa->num_tags; i++) 379 match_tags[i] = tags[i]; 380 } 381 /* Our TNFAs never have transitions leaving from the final state, 382 so we jump right to backtracking. */ 383 goto backtrack; 384 } 385 386#ifdef TRE_DEBUG 387 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, 388 state)); 389 { 390 int i; 391 for (i = 0; i < tnfa->num_tags; i++) 392 DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : "")); 393 DPRINT(("\n")); 394 } 395#endif /* TRE_DEBUG */ 396 397 /* Go to the next character in the input string. */ 398 empty_br_match = 0; 399 trans_i = state; 400 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) 401 { 402 /* This is a back reference state. All transitions leaving from 403 this state have the same back reference "assertion". Instead 404 of reading the next character, we match the back reference. */ 405 regoff_t so, eo, bt = trans_i->u.backref; 406 regoff_t bt_len; 407 regoff_t result; 408 409 DPRINT((" should match back reference %d\n", bt)); 410 /* Get the substring we need to match against. Remember to 411 turn off REG_NOSUB temporarily. */ 412 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, 413 tnfa, tags, pos); 414 /* LINTED */so = pmatch[bt].rm_so; 415 /* LINTED */eo = pmatch[bt].rm_eo; 416 bt_len = eo - so; 417 418#ifdef TRE_DEBUG 419 { 420 int slen; 421 if (len < 0) 422 slen = bt_len; 423 else 424 slen = MIN(bt_len, len - pos); 425 426 if (type == STR_BYTE) 427 { 428 DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n", 429 bt_len, so, eo, bt_len, (char*)string + so)); 430 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1)); 431 } 432#ifdef TRE_WCHAR 433 else if (type == STR_WIDE) 434 { 435 DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n", 436 bt_len, so, eo, bt_len, (wchar_t*)string + so)); 437 DPRINT((" current string is '%.*" STRF "'\n", 438 slen, str_wide - 1)); 439 } 440#endif /* TRE_WCHAR */ 441 } 442#endif 443 444 if (len < 0) 445 { 446 if (type == STR_USER) 447 result = str_source->compare((unsigned)so, (unsigned)pos, 448 (unsigned)bt_len, 449 str_source->context); 450#ifdef TRE_WCHAR 451 else if (type == STR_WIDE) 452 result = wcsncmp((const wchar_t*)string + so, str_wide - 1, 453 (size_t)bt_len); 454#endif /* TRE_WCHAR */ 455 else 456 /* LINTED */result = strncmp((const char*)string + so, str_byte - 1, 457 (size_t)bt_len); 458 } 459 else if (len - pos < bt_len) 460 result = 1; 461#ifdef TRE_WCHAR 462 else if (type == STR_WIDE) 463 result = wmemcmp((const wchar_t*)string + so, str_wide - 1, 464 (size_t)bt_len); 465#endif /* TRE_WCHAR */ 466 else 467 /* LINTED */result = memcmp((const char*)string + so, str_byte - 1, 468 (size_t)bt_len); 469 470 if (result == 0) 471 { 472 /* Back reference matched. Check for infinite loop. */ 473 if (bt_len == 0) 474 empty_br_match = 1; 475 if (empty_br_match && states_seen[trans_i->state_id]) 476 { 477 DPRINT((" avoid loop\n")); 478 goto backtrack; 479 } 480 481 states_seen[trans_i->state_id] = empty_br_match; 482 483 /* Advance in input string and resync `prev_c', `next_c' 484 and pos. */ 485 DPRINT((" back reference matched\n")); 486 /* LINTED */str_byte += bt_len - 1; 487#ifdef TRE_WCHAR 488 /* LINTED */str_wide += bt_len - 1; 489#endif /* TRE_WCHAR */ 490 /* LINTED */pos += bt_len - 1; 491 GET_NEXT_WCHAR(); 492 DPRINT((" pos now %d\n", pos)); 493 } 494 else 495 { 496 DPRINT((" back reference did not match\n")); 497 goto backtrack; 498 } 499 } 500 else 501 { 502 /* Check for end of string. */ 503 if (len < 0) 504 { 505 if (type == STR_USER) 506 { 507 if (str_user_end) 508 goto backtrack; 509 } 510 else if (next_c == L'\0') 511 goto backtrack; 512 } 513 else 514 { 515 if (pos >= len) 516 goto backtrack; 517 } 518 519 /* Read the next character. */ 520 GET_NEXT_WCHAR(); 521 } 522 523 next_state = NULL; 524 for (trans_i = state; trans_i->state; trans_i++) 525 { 526 DPRINT((" transition %d-%d (%c-%c) %d to %d\n", 527 trans_i->code_min, trans_i->code_max, 528 trans_i->code_min, trans_i->code_max, 529 trans_i->assertions, trans_i->state_id)); 530 if (trans_i->code_min <= (tre_cint_t)prev_c 531 && trans_i->code_max >= (tre_cint_t)prev_c) 532 { 533 if (trans_i->assertions 534 && (CHECK_ASSERTIONS(trans_i->assertions) 535 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 536 { 537 DPRINT((" assertion failed\n")); 538 continue; 539 } 540 541 if (next_state == NULL) 542 { 543 /* First matching transition. */ 544 DPRINT((" Next state is %d\n", trans_i->state_id)); 545 next_state = trans_i->state; 546 next_tags = trans_i->tags; 547 } 548 else 549 { 550 /* Second matching transition. We may need to backtrack here 551 to take this transition instead of the first one, so we 552 push this transition in the backtracking stack so we can 553 jump back here if needed. */ 554 DPRINT((" saving state %d for backtracking\n", 555 trans_i->state_id)); 556 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, 557 trans_i->state_id, next_c, tags, mbstate); 558 { 559 int *tmp; 560 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) 561 stack->item.tags[*tmp] = pos; 562 } 563#if 0 /* XXX - it's important not to look at all transitions here to keep 564 the stack small! */ 565 break; 566#endif 567 } 568 } 569 } 570 571 if (next_state != NULL) 572 { 573 /* Matching transitions were found. Take the first one. */ 574 state = next_state; 575 576 /* Update the tag values. */ 577 if (next_tags) 578 while (*next_tags >= 0) 579 tags[*next_tags++] = pos; 580 } 581 else 582 { 583 backtrack: 584 /* A matching transition was not found. Try to backtrack. */ 585 if (stack->prev) 586 { 587 DPRINT((" backtracking\n")); 588#if ASSERT_BACKREF 589 if (stack->item.state->assertions) 590 { 591 DPRINT((" states_seen[%d] = 0\n", 592 stack->item.state_id)); 593 states_seen[stack->item.state_id] = 0; 594 } 595#endif 596 597 BT_STACK_POP(); 598 } 599 else if (match_eo < 0) 600 { 601 /* Try starting from a later position in the input string. */ 602 /* Check for end of string. */ 603 if (len < 0) 604 { 605 if (next_c == L'\0') 606 { 607 DPRINT(("end of string.\n")); 608 break; 609 } 610 } 611 else 612 { 613 if (pos >= len) 614 { 615 DPRINT(("end of string.\n")); 616 break; 617 } 618 } 619 DPRINT(("restarting from next start position\n")); 620 next_c = (tre_char_t) next_c_start; 621#ifdef TRE_MBSTATE 622 mbstate = mbstate_start; 623#endif /* TRE_MBSTATE */ 624 str_byte = str_byte_start; 625#ifdef TRE_WCHAR 626 str_wide = str_wide_start; 627#endif /* TRE_WCHAR */ 628 goto retry; 629 } 630 else 631 { 632 DPRINT(("finished\n")); 633 break; 634 } 635 } 636 } 637 638 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; 639 *match_end_ofs = match_eo; 640 641 error_exit: 642 tre_bt_mem_destroy(mem); 643#ifndef TRE_USE_ALLOCA 644 if (tags) 645 xfree(tags); 646 if (pmatch) 647 xfree(pmatch); 648 if (states_seen) 649 xfree(states_seen); 650#endif /* !TRE_USE_ALLOCA */ 651 652 return ret; 653} 654