1/* 2 * re_*exec and friends - match REs 3 * 4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 5 * 6 * Development of this software was funded, in part, by Cray Research Inc., 7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 8 * Corporation, none of whom are responsible for the results. The author 9 * thanks all of them. 10 * 11 * Redistribution and use in source and binary forms -- with or without 12 * modification -- are permitted for any purpose, provided that 13 * redistributions in source form retain this entire copyright notice and 14 * indicate the origin and nature of any modifications. 15 * 16 * I'd appreciate being given credit for this package in the documentation 17 * of software which uses it, but that is not a requirement. 18 * 19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 * 30 */ 31 32#include "regguts.h" 33 34 35 36/* lazy-DFA representation */ 37struct arcp { /* "pointer" to an outarc */ 38 struct sset *ss; 39 color co; 40}; 41 42struct sset { /* state set */ 43 unsigned *states; /* pointer to bitvector */ 44 unsigned hash; /* hash of bitvector */ 45# define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) 46# define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ 47 memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) 48 int flags; 49# define STARTER 01 /* the initial state set */ 50# define POSTSTATE 02 /* includes the goal state */ 51# define LOCKED 04 /* locked in cache */ 52# define NOPROGRESS 010 /* zero-progress state set */ 53 struct arcp ins; /* chain of inarcs pointing here */ 54 chr *lastseen; /* last entered on arrival here */ 55 struct sset **outs; /* outarc vector indexed by color */ 56 struct arcp *inchain; /* chain-pointer vector for outarcs */ 57}; 58 59struct dfa { 60 int nssets; /* size of cache */ 61 int nssused; /* how many entries occupied yet */ 62 int nstates; /* number of states */ 63 int ncolors; /* length of outarc and inchain vectors */ 64 int wordsper; /* length of state-set bitvectors */ 65 struct sset *ssets; /* state-set cache */ 66 unsigned *statesarea; /* bitvector storage */ 67 unsigned *work; /* pointer to work area within statesarea */ 68 struct sset **outsarea; /* outarc-vector storage */ 69 struct arcp *incarea; /* inchain storage */ 70 struct cnfa *cnfa; 71 struct colormap *cm; 72 chr *lastpost; /* location of last cache-flushed success */ 73 chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ 74 struct sset *search; /* replacement-search-pointer memory */ 75 int cptsmalloced; /* were the areas individually malloced? */ 76 char *mallocarea; /* self, or master malloced area, or NULL */ 77}; 78 79#define WORK 1 /* number of work bitvectors needed */ 80 81/* setup for non-malloc allocation for small cases */ 82#define FEWSTATES 20 /* must be less than UBITS */ 83#define FEWCOLORS 15 84struct smalldfa { 85 struct dfa dfa; 86 struct sset ssets[FEWSTATES*2]; 87 unsigned statesarea[FEWSTATES*2 + WORK]; 88 struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; 89 struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; 90}; 91#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ 92 93 94 95/* internal variables, bundled for easy passing around */ 96struct vars { 97 regex_t *re; 98 struct guts *g; 99 int eflags; /* copies of arguments */ 100 size_t nmatch; 101 regmatch_t *pmatch; 102 rm_detail_t *details; 103 chr *start; /* start of string */ 104 chr *stop; /* just past end of string */ 105 int err; /* error code if any (0 none) */ 106 regoff_t *mem; /* memory vector for backtracking */ 107 struct smalldfa dfa1; 108 struct smalldfa dfa2; 109}; 110#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ 111#define ISERR() VISERR(v) 112#define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) 113#define ERR(e) VERR(v, e) /* record an error */ 114#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ 115#define OFF(p) ((p) - v->start) 116#define LOFF(p) ((long)OFF(p)) 117 118 119 120/* 121 * forward declarations 122 */ 123/* =====^!^===== begin forwards =====^!^===== */ 124/* automatically gathered by fwd; do not hand-edit */ 125/* === regexec.c === */ 126int exec _ANSI_ARGS_((regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int)); 127static int find _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); 128static int cfind _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *)); 129static int cfindloop _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **)); 130static VOID zapsubs _ANSI_ARGS_((regmatch_t *, size_t)); 131static VOID zapmem _ANSI_ARGS_((struct vars *, struct subre *)); 132static VOID subset _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 133static int dissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 134static int condissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 135static int altdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 136static int cdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 137static int ccondissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 138static int crevdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 139static int cbrdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 140static int caltdissect _ANSI_ARGS_((struct vars *, struct subre *, chr *, chr *)); 141/* === rege_dfa.c === */ 142static chr *longest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, int *)); 143static chr *shortest _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *)); 144static chr *lastcold _ANSI_ARGS_((struct vars *, struct dfa *)); 145static struct dfa *newdfa _ANSI_ARGS_((struct vars *, struct cnfa *, struct colormap *, struct smalldfa *)); 146static VOID freedfa _ANSI_ARGS_((struct dfa *)); 147static unsigned hash _ANSI_ARGS_((unsigned *, int)); 148static struct sset *initialize _ANSI_ARGS_((struct vars *, struct dfa *, chr *)); 149static struct sset *miss _ANSI_ARGS_((struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *)); 150static int lacon _ANSI_ARGS_((struct vars *, struct cnfa *, chr *, pcolor)); 151static struct sset *getvacant _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); 152static struct sset *pickss _ANSI_ARGS_((struct vars *, struct dfa *, chr *, chr *)); 153/* automatically gathered by fwd; do not hand-edit */ 154/* =====^!^===== end forwards =====^!^===== */ 155 156 157 158/* 159 - exec - match regular expression 160 ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, 161 ^ size_t, regmatch_t [], int); 162 */ 163int 164exec(re, string, len, details, nmatch, pmatch, flags) 165regex_t *re; 166CONST chr *string; 167size_t len; 168rm_detail_t *details; 169size_t nmatch; 170regmatch_t pmatch[]; 171int flags; 172{ 173 struct vars var; 174 register struct vars *v = &var; 175 int st; 176 size_t n; 177 int backref; 178# define LOCALMAT 20 179 regmatch_t mat[LOCALMAT]; 180# define LOCALMEM 40 181 regoff_t mem[LOCALMEM]; 182 183 /* sanity checks */ 184 if (re == NULL || string == NULL || re->re_magic != REMAGIC) 185 return REG_INVARG; 186 if (re->re_csize != sizeof(chr)) 187 return REG_MIXED; 188 189 /* setup */ 190 v->re = re; 191 v->g = (struct guts *)re->re_guts; 192 if ((v->g->cflags®_EXPECT) && details == NULL) 193 return REG_INVARG; 194 if (v->g->info®_UIMPOSSIBLE) 195 return REG_NOMATCH; 196 backref = (v->g->info®_UBACKREF) ? 1 : 0; 197 v->eflags = flags; 198 if (v->g->cflags®_NOSUB) 199 nmatch = 0; /* override client */ 200 v->nmatch = nmatch; 201 if (backref) { 202 /* need work area */ 203 if (v->g->nsub + 1 <= LOCALMAT) 204 v->pmatch = mat; 205 else 206 v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) * 207 sizeof(regmatch_t)); 208 if (v->pmatch == NULL) 209 return REG_ESPACE; 210 v->nmatch = v->g->nsub + 1; 211 } else 212 v->pmatch = pmatch; 213 v->details = details; 214 v->start = (chr *)string; 215 v->stop = (chr *)string + len; 216 v->err = 0; 217 if (backref) { 218 /* need retry memory */ 219 assert(v->g->ntree >= 0); 220 n = (size_t)v->g->ntree; 221 if (n <= LOCALMEM) 222 v->mem = mem; 223 else 224 v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t)); 225 if (v->mem == NULL) { 226 if (v->pmatch != pmatch && v->pmatch != mat) 227 FREE(v->pmatch); 228 return REG_ESPACE; 229 } 230 } else 231 v->mem = NULL; 232 233 /* do it */ 234 assert(v->g->tree != NULL); 235 if (backref) 236 st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); 237 else 238 st = find(v, &v->g->tree->cnfa, &v->g->cmap); 239 240 /* copy (portion of) match vector over if necessary */ 241 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { 242 zapsubs(pmatch, nmatch); 243 n = (nmatch < v->nmatch) ? nmatch : v->nmatch; 244 memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); 245 } 246 247 /* clean up */ 248 if (v->pmatch != pmatch && v->pmatch != mat) 249 FREE(v->pmatch); 250 if (v->mem != NULL && v->mem != mem) 251 FREE(v->mem); 252 return st; 253} 254 255/* 256 - find - find a match for the main NFA (no-complications case) 257 ^ static int find(struct vars *, struct cnfa *, struct colormap *); 258 */ 259static int 260find(v, cnfa, cm) 261struct vars *v; 262struct cnfa *cnfa; 263struct colormap *cm; 264{ 265 struct dfa *s; 266 struct dfa *d; 267 chr *begin; 268 chr *end = NULL; 269 chr *cold; 270 chr *open; /* open and close of range of possible starts */ 271 chr *close; 272 int hitend; 273 int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; 274 275 /* first, a shot with the search RE */ 276 s = newdfa(v, &v->g->search, cm, &v->dfa1); 277 assert(!(ISERR() && s != NULL)); 278 NOERR(); 279 MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); 280 cold = NULL; 281 close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)NULL); 282 freedfa(s); 283 NOERR(); 284 if (v->g->cflags®_EXPECT) { 285 assert(v->details != NULL); 286 if (cold != NULL) 287 v->details->rm_extend.rm_so = OFF(cold); 288 else 289 v->details->rm_extend.rm_so = OFF(v->stop); 290 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 291 } 292 if (close == NULL) /* not found */ 293 return REG_NOMATCH; 294 if (v->nmatch == 0) /* found, don't need exact location */ 295 return REG_OKAY; 296 297 /* find starting point and match */ 298 assert(cold != NULL); 299 open = cold; 300 cold = NULL; 301 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); 302 d = newdfa(v, cnfa, cm, &v->dfa1); 303 assert(!(ISERR() && d != NULL)); 304 NOERR(); 305 for (begin = open; begin <= close; begin++) { 306 MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); 307 if (shorter) 308 end = shortest(v, d, begin, begin, v->stop, 309 (chr **)NULL, &hitend); 310 else 311 end = longest(v, d, begin, v->stop, &hitend); 312 NOERR(); 313 if (hitend && cold == NULL) 314 cold = begin; 315 if (end != NULL) 316 break; /* NOTE BREAK OUT */ 317 } 318 assert(end != NULL); /* search RE succeeded so loop should */ 319 freedfa(d); 320 321 /* and pin down details */ 322 assert(v->nmatch > 0); 323 v->pmatch[0].rm_so = OFF(begin); 324 v->pmatch[0].rm_eo = OFF(end); 325 if (v->g->cflags®_EXPECT) { 326 if (cold != NULL) 327 v->details->rm_extend.rm_so = OFF(cold); 328 else 329 v->details->rm_extend.rm_so = OFF(v->stop); 330 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 331 } 332 if (v->nmatch == 1) /* no need for submatches */ 333 return REG_OKAY; 334 335 /* submatches */ 336 zapsubs(v->pmatch, v->nmatch); 337 return dissect(v, v->g->tree, begin, end); 338} 339 340/* 341 - cfind - find a match for the main NFA (with complications) 342 ^ static int cfind(struct vars *, struct cnfa *, struct colormap *); 343 */ 344static int 345cfind(v, cnfa, cm) 346struct vars *v; 347struct cnfa *cnfa; 348struct colormap *cm; 349{ 350 struct dfa *s; 351 struct dfa *d; 352 chr *cold = NULL; /* silence gcc 4 warning */ 353 int ret; 354 355 s = newdfa(v, &v->g->search, cm, &v->dfa1); 356 NOERR(); 357 d = newdfa(v, cnfa, cm, &v->dfa2); 358 if (ISERR()) { 359 assert(d == NULL); 360 freedfa(s); 361 return v->err; 362 } 363 364 ret = cfindloop(v, cnfa, cm, d, s, &cold); 365 366 freedfa(d); 367 freedfa(s); 368 NOERR(); 369 if (v->g->cflags®_EXPECT) { 370 assert(v->details != NULL); 371 if (cold != NULL) 372 v->details->rm_extend.rm_so = OFF(cold); 373 else 374 v->details->rm_extend.rm_so = OFF(v->stop); 375 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 376 } 377 return ret; 378} 379 380/* 381 - cfindloop - the heart of cfind 382 ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, 383 ^ struct dfa *, struct dfa *, chr **); 384 */ 385static int 386cfindloop(v, cnfa, cm, d, s, coldp) 387struct vars *v; 388struct cnfa *cnfa; 389struct colormap *cm; 390struct dfa *d; 391struct dfa *s; 392chr **coldp; /* where to put coldstart pointer */ 393{ 394 chr *begin; 395 chr *end; 396 chr *cold; 397 chr *open; /* open and close of range of possible starts */ 398 chr *close; 399 chr *estart; 400 chr *estop; 401 int er; 402 int shorter = v->g->tree->flags&SHORTER; 403 int hitend; 404 405 assert(d != NULL && s != NULL); 406 cold = NULL; 407 close = v->start; 408 do { 409 MDEBUG(("\ncsearch at %ld\n", LOFF(close))); 410 close = shortest(v, s, close, close, v->stop, &cold, (int *)NULL); 411 if (close == NULL) 412 break; /* NOTE BREAK */ 413 assert(cold != NULL); 414 open = cold; 415 cold = NULL; 416 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); 417 for (begin = open; begin <= close; begin++) { 418 MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); 419 estart = begin; 420 estop = v->stop; 421 for (;;) { 422 if (shorter) 423 end = shortest(v, d, begin, estart, 424 estop, (chr **)NULL, &hitend); 425 else 426 end = longest(v, d, begin, estop, 427 &hitend); 428 if (hitend && cold == NULL) 429 cold = begin; 430 if (end == NULL) 431 break; /* NOTE BREAK OUT */ 432 MDEBUG(("tentative end %ld\n", LOFF(end))); 433 zapsubs(v->pmatch, v->nmatch); 434 zapmem(v, v->g->tree); 435 er = cdissect(v, v->g->tree, begin, end); 436 if (er == REG_OKAY) { 437 if (v->nmatch > 0) { 438 v->pmatch[0].rm_so = OFF(begin); 439 v->pmatch[0].rm_eo = OFF(end); 440 } 441 *coldp = cold; 442 return REG_OKAY; 443 } 444 if (er != REG_NOMATCH) { 445 ERR(er); 446 return er; 447 } 448 if ((shorter) ? end == estop : end == begin) { 449 /* no point in trying again */ 450 *coldp = cold; 451 return REG_NOMATCH; 452 } 453 /* go around and try again */ 454 if (shorter) 455 estart = end + 1; 456 else 457 estop = end - 1; 458 } 459 } 460 } while (close < v->stop); 461 462 *coldp = cold; 463 return REG_NOMATCH; 464} 465 466/* 467 - zapsubs - initialize the subexpression matches to "no match" 468 ^ static VOID zapsubs(regmatch_t *, size_t); 469 */ 470static VOID 471zapsubs(p, n) 472regmatch_t *p; 473size_t n; 474{ 475 size_t i; 476 477 for (i = n-1; i > 0; i--) { 478 p[i].rm_so = -1; 479 p[i].rm_eo = -1; 480 } 481} 482 483/* 484 - zapmem - initialize the retry memory of a subtree to zeros 485 ^ static VOID zapmem(struct vars *, struct subre *); 486 */ 487static VOID 488zapmem(v, t) 489struct vars *v; 490struct subre *t; 491{ 492 if (t == NULL) 493 return; 494 495 assert(v->mem != NULL); 496 v->mem[t->retry] = 0; 497 if (t->op == '(') { 498 assert(t->subno > 0); 499 v->pmatch[t->subno].rm_so = -1; 500 v->pmatch[t->subno].rm_eo = -1; 501 } 502 503 if (t->left != NULL) 504 zapmem(v, t->left); 505 if (t->right != NULL) 506 zapmem(v, t->right); 507} 508 509/* 510 - subset - set any subexpression relevant to a successful subre 511 ^ static VOID subset(struct vars *, struct subre *, chr *, chr *); 512 */ 513static VOID 514subset(v, sub, begin, end) 515struct vars *v; 516struct subre *sub; 517chr *begin; 518chr *end; 519{ 520 int n = sub->subno; 521 522 assert(n > 0); 523 if ((size_t)n >= v->nmatch) 524 return; 525 526 MDEBUG(("setting %d\n", n)); 527 v->pmatch[n].rm_so = OFF(begin); 528 v->pmatch[n].rm_eo = OFF(end); 529} 530 531/* 532 - dissect - determine subexpression matches (uncomplicated case) 533 ^ static int dissect(struct vars *, struct subre *, chr *, chr *); 534 */ 535static int /* regexec return code */ 536dissect(v, t, begin, end) 537struct vars *v; 538struct subre *t; 539chr *begin; /* beginning of relevant substring */ 540chr *end; /* end of same */ 541{ 542 assert(t != NULL); 543 MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); 544 545 switch (t->op) { 546 case '=': /* terminal node */ 547 assert(t->left == NULL && t->right == NULL); 548 return REG_OKAY; /* no action, parent did the work */ 549 break; 550 case '|': /* alternation */ 551 assert(t->left != NULL); 552 return altdissect(v, t, begin, end); 553 break; 554 case 'b': /* back ref -- shouldn't be calling us! */ 555 return REG_ASSERT; 556 break; 557 case '.': /* concatenation */ 558 assert(t->left != NULL && t->right != NULL); 559 return condissect(v, t, begin, end); 560 break; 561 case '(': /* capturing */ 562 assert(t->left != NULL && t->right == NULL); 563 assert(t->subno > 0); 564 subset(v, t, begin, end); 565 return dissect(v, t->left, begin, end); 566 break; 567 default: 568 return REG_ASSERT; 569 break; 570 } 571} 572 573/* 574 - condissect - determine concatenation subexpression matches (uncomplicated) 575 ^ static int condissect(struct vars *, struct subre *, chr *, chr *); 576 */ 577static int /* regexec return code */ 578condissect(v, t, begin, end) 579struct vars *v; 580struct subre *t; 581chr *begin; /* beginning of relevant substring */ 582chr *end; /* end of same */ 583{ 584 struct dfa *d; 585 struct dfa *d2; 586 chr *mid; 587 int i; 588 int shorter = (t->left->flags&SHORTER) ? 1 : 0; 589 chr *stop = (shorter) ? end : begin; 590 591 assert(t->op == '.'); 592 assert(t->left != NULL && t->left->cnfa.nstates > 0); 593 assert(t->right != NULL && t->right->cnfa.nstates > 0); 594 595 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); 596 NOERR(); 597 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); 598 if (ISERR()) { 599 assert(d2 == NULL); 600 freedfa(d); 601 return v->err; 602 } 603 604 /* pick a tentative midpoint */ 605 if (shorter) 606 mid = shortest(v, d, begin, begin, end, (chr **)NULL, 607 (int *)NULL); 608 else 609 mid = longest(v, d, begin, end, (int *)NULL); 610 if (mid == NULL) { 611 freedfa(d); 612 freedfa(d2); 613 return REG_ASSERT; 614 } 615 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 616 617 /* iterate until satisfaction or failure */ 618 while (longest(v, d2, mid, end, (int *)NULL) != end) { 619 /* that midpoint didn't work, find a new one */ 620 if (mid == stop) { 621 /* all possibilities exhausted! */ 622 MDEBUG(("no midpoint!\n")); 623 freedfa(d); 624 freedfa(d2); 625 return REG_ASSERT; 626 } 627 if (shorter) 628 mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, 629 (int *)NULL); 630 else 631 mid = longest(v, d, begin, mid-1, (int *)NULL); 632 if (mid == NULL) { 633 /* failed to find a new one! */ 634 MDEBUG(("failed midpoint!\n")); 635 freedfa(d); 636 freedfa(d2); 637 return REG_ASSERT; 638 } 639 MDEBUG(("new midpoint %ld\n", LOFF(mid))); 640 } 641 642 /* satisfaction */ 643 MDEBUG(("successful\n")); 644 freedfa(d); 645 freedfa(d2); 646 i = dissect(v, t->left, begin, mid); 647 if (i != REG_OKAY) 648 return i; 649 return dissect(v, t->right, mid, end); 650} 651 652/* 653 - altdissect - determine alternative subexpression matches (uncomplicated) 654 ^ static int altdissect(struct vars *, struct subre *, chr *, chr *); 655 */ 656static int /* regexec return code */ 657altdissect(v, t, begin, end) 658struct vars *v; 659struct subre *t; 660chr *begin; /* beginning of relevant substring */ 661chr *end; /* end of same */ 662{ 663 struct dfa *d; 664 int i; 665 666 assert(t != NULL); 667 assert(t->op == '|'); 668 669 for (i = 0; t != NULL; t = t->right, i++) { 670 MDEBUG(("trying %dth\n", i)); 671 assert(t->left != NULL && t->left->cnfa.nstates > 0); 672 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); 673 if (ISERR()) 674 return v->err; 675 if (longest(v, d, begin, end, (int *)NULL) == end) { 676 MDEBUG(("success\n")); 677 freedfa(d); 678 return dissect(v, t->left, begin, end); 679 } 680 freedfa(d); 681 } 682 return REG_ASSERT; /* none of them matched?!? */ 683} 684 685/* 686 - cdissect - determine subexpression matches (with complications) 687 * The retry memory stores the offset of the trial midpoint from begin, 688 * plus 1 so that 0 uniquely means "clean slate". 689 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *); 690 */ 691static int /* regexec return code */ 692cdissect(v, t, begin, end) 693struct vars *v; 694struct subre *t; 695chr *begin; /* beginning of relevant substring */ 696chr *end; /* end of same */ 697{ 698 int er; 699 700 assert(t != NULL); 701 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); 702 703 switch (t->op) { 704 case '=': /* terminal node */ 705 assert(t->left == NULL && t->right == NULL); 706 return REG_OKAY; /* no action, parent did the work */ 707 break; 708 case '|': /* alternation */ 709 assert(t->left != NULL); 710 return caltdissect(v, t, begin, end); 711 break; 712 case 'b': /* back ref -- shouldn't be calling us! */ 713 assert(t->left == NULL && t->right == NULL); 714 return cbrdissect(v, t, begin, end); 715 break; 716 case '.': /* concatenation */ 717 assert(t->left != NULL && t->right != NULL); 718 return ccondissect(v, t, begin, end); 719 break; 720 case '(': /* capturing */ 721 assert(t->left != NULL && t->right == NULL); 722 assert(t->subno > 0); 723 er = cdissect(v, t->left, begin, end); 724 if (er == REG_OKAY) 725 subset(v, t, begin, end); 726 return er; 727 break; 728 default: 729 return REG_ASSERT; 730 break; 731 } 732} 733 734/* 735 - ccondissect - concatenation subexpression matches (with complications) 736 * The retry memory stores the offset of the trial midpoint from begin, 737 * plus 1 so that 0 uniquely means "clean slate". 738 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); 739 */ 740static int /* regexec return code */ 741ccondissect(v, t, begin, end) 742struct vars *v; 743struct subre *t; 744chr *begin; /* beginning of relevant substring */ 745chr *end; /* end of same */ 746{ 747 struct dfa *d; 748 struct dfa *d2; 749 chr *mid; 750 int er; 751 752 assert(t->op == '.'); 753 assert(t->left != NULL && t->left->cnfa.nstates > 0); 754 assert(t->right != NULL && t->right->cnfa.nstates > 0); 755 756 if (t->left->flags&SHORTER) /* reverse scan */ 757 return crevdissect(v, t, begin, end); 758 759 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 760 if (ISERR()) 761 return v->err; 762 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); 763 if (ISERR()) { 764 freedfa(d); 765 return v->err; 766 } 767 MDEBUG(("cconcat %d\n", t->retry)); 768 769 /* pick a tentative midpoint */ 770 if (v->mem[t->retry] == 0) { 771 mid = longest(v, d, begin, end, (int *)NULL); 772 if (mid == NULL) { 773 freedfa(d); 774 freedfa(d2); 775 return REG_NOMATCH; 776 } 777 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 778 v->mem[t->retry] = (mid - begin) + 1; 779 } else { 780 mid = begin + (v->mem[t->retry] - 1); 781 MDEBUG(("working midpoint %ld\n", LOFF(mid))); 782 } 783 784 /* iterate until satisfaction or failure */ 785 for (;;) { 786 /* try this midpoint on for size */ 787 if (longest(v, d2, mid, end, NULL) == end) { 788 er = cdissect(v, t->left, begin, mid); 789 if (er == REG_OKAY) { 790 er = cdissect(v, t->right, mid, end); 791 if (er == REG_OKAY) { 792 /* satisfaction */ 793 MDEBUG(("successful\n")); 794 freedfa(d); 795 freedfa(d2); 796 return REG_OKAY; 797 } 798 } 799 if ((er != REG_OKAY) && (er != REG_NOMATCH)) { 800 freedfa(d); 801 freedfa(d2); 802 return er; 803 } 804 } 805 806 /* that midpoint didn't work, find a new one */ 807 if (mid == begin) { 808 /* all possibilities exhausted */ 809 MDEBUG(("%d no midpoint\n", t->retry)); 810 freedfa(d); 811 freedfa(d2); 812 return REG_NOMATCH; 813 } 814 mid = longest(v, d, begin, mid-1, (int *)NULL); 815 if (mid == NULL) { 816 /* failed to find a new one */ 817 MDEBUG(("%d failed midpoint\n", t->retry)); 818 freedfa(d); 819 freedfa(d2); 820 return REG_NOMATCH; 821 } 822 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); 823 v->mem[t->retry] = (mid - begin) + 1; 824 zapmem(v, t->left); 825 zapmem(v, t->right); 826 } 827} 828 829/* 830 - crevdissect - determine backref shortest-first subexpression matches 831 * The retry memory stores the offset of the trial midpoint from begin, 832 * plus 1 so that 0 uniquely means "clean slate". 833 ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); 834 */ 835static int /* regexec return code */ 836crevdissect(v, t, begin, end) 837struct vars *v; 838struct subre *t; 839chr *begin; /* beginning of relevant substring */ 840chr *end; /* end of same */ 841{ 842 struct dfa *d; 843 struct dfa *d2; 844 chr *mid; 845 int er; 846 847 assert(t->op == '.'); 848 assert(t->left != NULL && t->left->cnfa.nstates > 0); 849 assert(t->right != NULL && t->right->cnfa.nstates > 0); 850 assert(t->left->flags&SHORTER); 851 852 /* concatenation -- need to split the substring between parts */ 853 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 854 if (ISERR()) 855 return v->err; 856 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); 857 if (ISERR()) { 858 freedfa(d); 859 return v->err; 860 } 861 MDEBUG(("crev %d\n", t->retry)); 862 863 /* pick a tentative midpoint */ 864 if (v->mem[t->retry] == 0) { 865 mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL); 866 if (mid == NULL) { 867 freedfa(d); 868 freedfa(d2); 869 return REG_NOMATCH; 870 } 871 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 872 v->mem[t->retry] = (mid - begin) + 1; 873 } else { 874 mid = begin + (v->mem[t->retry] - 1); 875 MDEBUG(("working midpoint %ld\n", LOFF(mid))); 876 } 877 878 /* iterate until satisfaction or failure */ 879 for (;;) { 880 /* try this midpoint on for size */ 881 if (longest(v, d2, mid, end, NULL) == end) { 882 er = cdissect(v, t->left, begin, mid); 883 if (er == REG_OKAY) { 884 er = cdissect(v, t->right, mid, end); 885 if (er == REG_OKAY) { 886 /* satisfaction */ 887 MDEBUG(("successful\n")); 888 freedfa(d); 889 freedfa(d2); 890 return REG_OKAY; 891 } 892 } 893 if (er != REG_OKAY && er != REG_NOMATCH) { 894 freedfa(d); 895 freedfa(d2); 896 return er; 897 } 898 } 899 900 /* that midpoint didn't work, find a new one */ 901 if (mid == end) { 902 /* all possibilities exhausted */ 903 MDEBUG(("%d no midpoint\n", t->retry)); 904 freedfa(d); 905 freedfa(d2); 906 return REG_NOMATCH; 907 } 908 mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL); 909 if (mid == NULL) { 910 /* failed to find a new one */ 911 MDEBUG(("%d failed midpoint\n", t->retry)); 912 freedfa(d); 913 freedfa(d2); 914 return REG_NOMATCH; 915 } 916 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); 917 v->mem[t->retry] = (mid - begin) + 1; 918 zapmem(v, t->left); 919 zapmem(v, t->right); 920 } 921} 922 923/* 924 - cbrdissect - determine backref subexpression matches 925 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); 926 */ 927static int /* regexec return code */ 928cbrdissect(v, t, begin, end) 929struct vars *v; 930struct subre *t; 931chr *begin; /* beginning of relevant substring */ 932chr *end; /* end of same */ 933{ 934 int i; 935 int n = t->subno; 936 size_t len; 937 chr *paren; 938 chr *p; 939 chr *stop; 940 int min = t->min; 941 int max = t->max; 942 943 assert(t != NULL); 944 assert(t->op == 'b'); 945 assert(n >= 0); 946 assert((size_t)n < v->nmatch); 947 948 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); 949 950 if (v->pmatch[n].rm_so == -1) 951 return REG_NOMATCH; 952 paren = v->start + v->pmatch[n].rm_so; 953 len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; 954 955 /* no room to maneuver -- retries are pointless */ 956 if (v->mem[t->retry]) 957 return REG_NOMATCH; 958 v->mem[t->retry] = 1; 959 960 /* special-case zero-length string */ 961 if (len == 0) { 962 if (begin == end) 963 return REG_OKAY; 964 return REG_NOMATCH; 965 } 966 967 /* and too-short string */ 968 assert(end >= begin); 969 if ((size_t)(end - begin) < len) 970 return REG_NOMATCH; 971 stop = end - len; 972 973 /* count occurrences */ 974 i = 0; 975 for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) { 976 if ((*v->g->compare)(paren, p, len) != 0) 977 break; 978 i++; 979 } 980 MDEBUG(("cbackref found %d\n", i)); 981 982 /* and sort it out */ 983 if (p != end) /* didn't consume all of it */ 984 return REG_NOMATCH; 985 if (min <= i && (i <= max || max == INFINITY)) 986 return REG_OKAY; 987 return REG_NOMATCH; /* out of range */ 988} 989 990/* 991 - caltdissect - determine alternative subexpression matches (w. complications) 992 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); 993 */ 994static int /* regexec return code */ 995caltdissect(v, t, begin, end) 996struct vars *v; 997struct subre *t; 998chr *begin; /* beginning of relevant substring */ 999chr *end; /* end of same */ 1000{ 1001 struct dfa *d; 1002 int er; 1003# define UNTRIED 0 /* not yet tried at all */ 1004# define TRYING 1 /* top matched, trying submatches */ 1005# define TRIED 2 /* top didn't match or submatches exhausted */ 1006 1007 if (t == NULL) 1008 return REG_NOMATCH; 1009 assert(t->op == '|'); 1010 if (v->mem[t->retry] == TRIED) 1011 return caltdissect(v, t->right, begin, end); 1012 1013 MDEBUG(("calt n%d\n", t->retry)); 1014 assert(t->left != NULL); 1015 1016 if (v->mem[t->retry] == UNTRIED) { 1017 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 1018 if (ISERR()) 1019 return v->err; 1020 if (longest(v, d, begin, end, (int *)NULL) != end) { 1021 freedfa(d); 1022 v->mem[t->retry] = TRIED; 1023 return caltdissect(v, t->right, begin, end); 1024 } 1025 freedfa(d); 1026 MDEBUG(("calt matched\n")); 1027 v->mem[t->retry] = TRYING; 1028 } 1029 1030 er = cdissect(v, t->left, begin, end); 1031 if (er != REG_NOMATCH) 1032 return er; 1033 1034 v->mem[t->retry] = TRIED; 1035 return caltdissect(v, t->right, begin, end); 1036} 1037 1038 1039 1040#include "rege_dfa.c" 1041