engine.c revision 1.23
1/* $OpenBSD: engine.c,v 1.23 2016/05/26 05:46:44 martijn Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#endif 57#ifdef LNAMES 58#define matcher lmatcher 59#define fast lfast 60#define slow lslow 61#define dissect ldissect 62#define backref lbackref 63#define step lstep 64#define print lprint 65#define at lat 66#define match lmat 67#define nope lnope 68#endif 69 70/* another structure passed up and down to avoid zillions of parameters */ 71struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85}; 86 87static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88static char *dissect(struct match *, char *, char *, sopno, sopno); 89static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90static char *fast(struct match *, char *, char *, sopno, sopno); 91static char *slow(struct match *, char *, char *, sopno, sopno); 92static states step(struct re_guts *, sopno, sopno, states, int, states); 93#define MAX_RECURSION 100 94#define BOL (OUT+1) 95#define EOL (BOL+1) 96#define BOLEOL (BOL+2) 97#define NOTHING (BOL+3) 98#define BOW (BOL+4) 99#define EOW (BOL+5) 100/* update nonchars[] array below when adding fake chars here */ 101#define CODEMAX (BOL+5) /* highest code used */ 102#define NONCHAR(c) ((c) > CHAR_MAX) 103#define NNONCHAR (CODEMAX-CHAR_MAX) 104#ifdef REDEBUG 105static void print(struct match *, char *, states, int, FILE *); 106#endif 107#ifdef REDEBUG 108static void at(struct match *, char *, char *, char *, sopno, sopno); 109#endif 110#ifdef REDEBUG 111static const char *pchar(int); 112#endif 113 114#ifdef REDEBUG 115#define SP(t, s, c) print(m, t, s, c, stdout) 116#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 117#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 118static int nope = 0; 119#else 120#define SP(t, s, c) /* nothing */ 121#define AT(t, p1, p2, s1, s2) /* nothing */ 122#define NOTE(s) /* nothing */ 123#endif 124 125/* 126 - matcher - the actual matching engine 127 */ 128static int /* 0 success, REG_NOMATCH failure */ 129matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 130 int eflags) 131{ 132 char *endp; 133 int i; 134 struct match mv; 135 struct match *m = &mv; 136 char *dp; 137 const sopno gf = g->firststate+1; /* +1 for OEND */ 138 const sopno gl = g->laststate; 139 char *start; 140 char *stop; 141 142 /* simplify the situation where possible */ 143 if (g->cflags®_NOSUB) 144 nmatch = 0; 145 if (eflags®_STARTEND) { 146 start = string + pmatch[0].rm_so; 147 stop = string + pmatch[0].rm_eo; 148 } else { 149 start = string; 150 stop = start + strlen(start); 151 } 152 if (stop < start) 153 return(REG_INVARG); 154 155 /* prescreening; this does wonders for this rather slow code */ 156 if (g->must != NULL) { 157 for (dp = start; dp < stop; dp++) 158 if (*dp == g->must[0] && stop - dp >= g->mlen && 159 memcmp(dp, g->must, (size_t)g->mlen) == 0) 160 break; 161 if (dp == stop) /* we didn't find g->must */ 162 return(REG_NOMATCH); 163 } 164 165 /* match struct setup */ 166 m->g = g; 167 m->eflags = eflags; 168 m->pmatch = NULL; 169 m->lastpos = NULL; 170 m->offp = string; 171 m->beginp = start; 172 m->endp = stop; 173 STATESETUP(m, 4); 174 SETUP(m->st); 175 SETUP(m->fresh); 176 SETUP(m->tmp); 177 SETUP(m->empty); 178 CLEAR(m->empty); 179 180 /* this loop does only one repetition except for backrefs */ 181 for (;;) { 182 endp = fast(m, start, stop, gf, gl); 183 if (endp == NULL) { /* a miss */ 184 free(m->pmatch); 185 free(m->lastpos); 186 STATETEARDOWN(m); 187 return(REG_NOMATCH); 188 } 189 if (nmatch == 0 && !g->backrefs) 190 break; /* no further info needed */ 191 192 /* where? */ 193 assert(m->coldp != NULL); 194 for (;;) { 195 NOTE("finding start"); 196 endp = slow(m, m->coldp, stop, gf, gl); 197 if (endp != NULL) 198 break; 199 assert(m->coldp < m->endp); 200 m->coldp++; 201 } 202 if (nmatch == 1 && !g->backrefs) 203 break; /* no further info needed */ 204 205 /* oh my, he wants the subexpressions... */ 206 if (m->pmatch == NULL) 207 m->pmatch = reallocarray(NULL, m->g->nsub + 1, 208 sizeof(regmatch_t)); 209 if (m->pmatch == NULL) { 210 STATETEARDOWN(m); 211 return(REG_ESPACE); 212 } 213 for (i = 1; i <= m->g->nsub; i++) 214 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 215 if (!g->backrefs && !(m->eflags®_BACKR)) { 216 NOTE("dissecting"); 217 dp = dissect(m, m->coldp, endp, gf, gl); 218 } else { 219 if (g->nplus > 0 && m->lastpos == NULL) 220 m->lastpos = reallocarray(NULL, 221 g->nplus+1, sizeof(char *)); 222 if (g->nplus > 0 && m->lastpos == NULL) { 223 free(m->pmatch); 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 NOTE("backref dissect"); 228 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 229 } 230 if (dp != NULL) 231 break; 232 233 /* uh-oh... we couldn't find a subexpression-level match */ 234 assert(g->backrefs); /* must be back references doing it */ 235 assert(g->nplus == 0 || m->lastpos != NULL); 236 for (;;) { 237 if (dp != NULL || endp <= m->coldp) 238 break; /* defeat */ 239 NOTE("backoff"); 240 endp = slow(m, m->coldp, endp-1, gf, gl); 241 if (endp == NULL) 242 break; /* defeat */ 243 /* try it on a shorter possibility */ 244#ifndef NDEBUG 245 for (i = 1; i <= m->g->nsub; i++) { 246 assert(m->pmatch[i].rm_so == -1); 247 assert(m->pmatch[i].rm_eo == -1); 248 } 249#endif 250 NOTE("backoff dissect"); 251 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 252 } 253 assert(dp == NULL || dp == endp); 254 if (dp != NULL) /* found a shorter one */ 255 break; 256 257 /* despite initial appearances, there is no match here */ 258 NOTE("false alarm"); 259 if (m->coldp == stop) 260 break; 261 start = m->coldp + 1; /* recycle starting later */ 262 } 263 264 /* fill in the details if requested */ 265 if (nmatch > 0) { 266 pmatch[0].rm_so = m->coldp - m->offp; 267 pmatch[0].rm_eo = endp - m->offp; 268 } 269 if (nmatch > 1) { 270 assert(m->pmatch != NULL); 271 for (i = 1; i < nmatch; i++) 272 if (i <= m->g->nsub) 273 pmatch[i] = m->pmatch[i]; 274 else { 275 pmatch[i].rm_so = -1; 276 pmatch[i].rm_eo = -1; 277 } 278 } 279 280 free(m->pmatch); 281 free(m->lastpos); 282 STATETEARDOWN(m); 283 return(0); 284} 285 286/* 287 - dissect - figure out what matched what, no back references 288 */ 289static char * /* == stop (success) always */ 290dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 291{ 292 int i; 293 sopno ss; /* start sop of current subRE */ 294 sopno es; /* end sop of current subRE */ 295 char *sp; /* start of string matched by it */ 296 char *stp; /* string matched by it cannot pass here */ 297 char *rest; /* start of rest of string */ 298 char *tail; /* string unmatched by rest of RE */ 299 sopno ssub; /* start sop of subsubRE */ 300 sopno esub; /* end sop of subsubRE */ 301 char *ssp; /* start of string matched by subsubRE */ 302 char *sep; /* end of string matched by subsubRE */ 303 char *oldssp; /* previous ssp */ 304 char *dp; 305 306 AT("diss", start, stop, startst, stopst); 307 sp = start; 308 for (ss = startst; ss < stopst; ss = es) { 309 /* identify end of subRE */ 310 es = ss; 311 switch (OP(m->g->strip[es])) { 312 case OPLUS_: 313 case OQUEST_: 314 es += OPND(m->g->strip[es]); 315 break; 316 case OCH_: 317 while (OP(m->g->strip[es]) != O_CH) 318 es += OPND(m->g->strip[es]); 319 break; 320 } 321 es++; 322 323 /* figure out what it matched */ 324 switch (OP(m->g->strip[ss])) { 325 case OEND: 326 assert(nope); 327 break; 328 case OCHAR: 329 sp++; 330 break; 331 case OBOL: 332 case OEOL: 333 case OBOW: 334 case OEOW: 335 break; 336 case OANY: 337 case OANYOF: 338 sp++; 339 break; 340 case OBACK_: 341 case O_BACK: 342 assert(nope); 343 break; 344 /* cases where length of match is hard to find */ 345 case OQUEST_: 346 stp = stop; 347 for (;;) { 348 /* how long could this one be? */ 349 rest = slow(m, sp, stp, ss, es); 350 assert(rest != NULL); /* it did match */ 351 /* could the rest match the rest? */ 352 tail = slow(m, rest, stop, es, stopst); 353 if (tail == stop) 354 break; /* yes! */ 355 /* no -- try a shorter match for this one */ 356 stp = rest - 1; 357 assert(stp >= sp); /* it did work */ 358 } 359 ssub = ss + 1; 360 esub = es - 1; 361 /* did innards match? */ 362 if (slow(m, sp, rest, ssub, esub) != NULL) { 363 dp = dissect(m, sp, rest, ssub, esub); 364 assert(dp == rest); 365 } else /* no */ 366 assert(sp == rest); 367 sp = rest; 368 break; 369 case OPLUS_: 370 stp = stop; 371 for (;;) { 372 /* how long could this one be? */ 373 rest = slow(m, sp, stp, ss, es); 374 assert(rest != NULL); /* it did match */ 375 /* could the rest match the rest? */ 376 tail = slow(m, rest, stop, es, stopst); 377 if (tail == stop) 378 break; /* yes! */ 379 /* no -- try a shorter match for this one */ 380 stp = rest - 1; 381 assert(stp >= sp); /* it did work */ 382 } 383 ssub = ss + 1; 384 esub = es - 1; 385 ssp = sp; 386 oldssp = ssp; 387 for (;;) { /* find last match of innards */ 388 sep = slow(m, ssp, rest, ssub, esub); 389 if (sep == NULL || sep == ssp) 390 break; /* failed or matched null */ 391 oldssp = ssp; /* on to next try */ 392 ssp = sep; 393 } 394 if (sep == NULL) { 395 /* last successful match */ 396 sep = ssp; 397 ssp = oldssp; 398 } 399 assert(sep == rest); /* must exhaust substring */ 400 assert(slow(m, ssp, sep, ssub, esub) == rest); 401 dp = dissect(m, ssp, sep, ssub, esub); 402 assert(dp == sep); 403 sp = rest; 404 break; 405 case OCH_: 406 stp = stop; 407 for (;;) { 408 /* how long could this one be? */ 409 rest = slow(m, sp, stp, ss, es); 410 assert(rest != NULL); /* it did match */ 411 /* could the rest match the rest? */ 412 tail = slow(m, rest, stop, es, stopst); 413 if (tail == stop) 414 break; /* yes! */ 415 /* no -- try a shorter match for this one */ 416 stp = rest - 1; 417 assert(stp >= sp); /* it did work */ 418 } 419 ssub = ss + 1; 420 esub = ss + OPND(m->g->strip[ss]) - 1; 421 assert(OP(m->g->strip[esub]) == OOR1); 422 for (;;) { /* find first matching branch */ 423 if (slow(m, sp, rest, ssub, esub) == rest) 424 break; /* it matched all of it */ 425 /* that one missed, try next one */ 426 assert(OP(m->g->strip[esub]) == OOR1); 427 esub++; 428 assert(OP(m->g->strip[esub]) == OOR2); 429 ssub = esub + 1; 430 esub += OPND(m->g->strip[esub]); 431 if (OP(m->g->strip[esub]) == OOR2) 432 esub--; 433 else 434 assert(OP(m->g->strip[esub]) == O_CH); 435 } 436 dp = dissect(m, sp, rest, ssub, esub); 437 assert(dp == rest); 438 sp = rest; 439 break; 440 case O_PLUS: 441 case O_QUEST: 442 case OOR1: 443 case OOR2: 444 case O_CH: 445 assert(nope); 446 break; 447 case OLPAREN: 448 i = OPND(m->g->strip[ss]); 449 assert(0 < i && i <= m->g->nsub); 450 m->pmatch[i].rm_so = sp - m->offp; 451 break; 452 case ORPAREN: 453 i = OPND(m->g->strip[ss]); 454 assert(0 < i && i <= m->g->nsub); 455 m->pmatch[i].rm_eo = sp - m->offp; 456 break; 457 default: /* uh oh */ 458 assert(nope); 459 break; 460 } 461 } 462 463 assert(sp == stop); 464 return(sp); 465} 466 467/* 468 - backref - figure out what matched what, figuring in back references 469 */ 470static char * /* == stop (success) or NULL (failure) */ 471backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 472 sopno lev, int rec) /* PLUS nesting level */ 473{ 474 int i; 475 sopno ss; /* start sop of current subRE */ 476 char *sp; /* start of string matched by it */ 477 sopno ssub; /* start sop of subsubRE */ 478 sopno esub; /* end sop of subsubRE */ 479 char *ssp; /* start of string matched by subsubRE */ 480 char *dp; 481 size_t len; 482 int hard; 483 sop s; 484 regoff_t offsave; 485 cset *cs; 486 487 AT("back", start, stop, startst, stopst); 488 sp = start; 489 490 /* get as far as we can with easy stuff */ 491 hard = 0; 492 for (ss = startst; !hard && ss < stopst; ss++) 493 switch (OP(s = m->g->strip[ss])) { 494 case OCHAR: 495 if (sp == stop || *sp++ != (char)OPND(s)) 496 return(NULL); 497 break; 498 case OANY: 499 if (sp == stop) 500 return(NULL); 501 sp++; 502 break; 503 case OANYOF: 504 cs = &m->g->sets[OPND(s)]; 505 if (sp == stop || !CHIN(cs, *sp++)) 506 return(NULL); 507 break; 508 case OBOL: 509 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 510 (sp > m->offp && sp < m->endp && 511 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 512 { /* yes */ } 513 else 514 return(NULL); 515 break; 516 case OEOL: 517 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 518 (sp < m->endp && *sp == '\n' && 519 (m->g->cflags®_NEWLINE)) ) 520 { /* yes */ } 521 else 522 return(NULL); 523 break; 524 case OBOW: 525 if (sp < m->endp && ISWORD(*sp) && 526 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 527 (sp > m->offp && !ISWORD(*(sp-1))))) 528 { /* yes */ } 529 else 530 return(NULL); 531 break; 532 case OEOW: 533 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 534 (sp < m->endp && *sp == '\n' && 535 (m->g->cflags®_NEWLINE)) || 536 (sp < m->endp && !ISWORD(*sp)) ) && 537 (sp > m->beginp && ISWORD(*(sp-1))) ) 538 { /* yes */ } 539 else 540 return(NULL); 541 break; 542 case O_QUEST: 543 break; 544 case OOR1: /* matches null but needs to skip */ 545 ss++; 546 s = m->g->strip[ss]; 547 do { 548 assert(OP(s) == OOR2); 549 ss += OPND(s); 550 } while (OP(s = m->g->strip[ss]) != O_CH); 551 /* note that the ss++ gets us past the O_CH */ 552 break; 553 default: /* have to make a choice */ 554 hard = 1; 555 break; 556 } 557 if (!hard) { /* that was it! */ 558 if (sp != stop) 559 return(NULL); 560 return(sp); 561 } 562 ss--; /* adjust for the for's final increment */ 563 564 /* the hard stuff */ 565 AT("hard", sp, stop, ss, stopst); 566 s = m->g->strip[ss]; 567 switch (OP(s)) { 568 case OBACK_: /* the vilest depths */ 569 i = OPND(s); 570 assert(0 < i && i <= m->g->nsub); 571 if (m->pmatch[i].rm_eo == -1) 572 return(NULL); 573 assert(m->pmatch[i].rm_so != -1); 574 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 575 if (len == 0 && rec++ > MAX_RECURSION) 576 return(NULL); 577 assert(stop - m->beginp >= len); 578 if (sp > stop - len) 579 return(NULL); /* not enough left to match */ 580 ssp = m->offp + m->pmatch[i].rm_so; 581 if (memcmp(sp, ssp, len) != 0) 582 return(NULL); 583 while (m->g->strip[ss] != SOP(O_BACK, i)) 584 ss++; 585 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 586 break; 587 case OQUEST_: /* to null or not */ 588 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 589 if (dp != NULL) 590 return(dp); /* not */ 591 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 592 break; 593 case OPLUS_: 594 assert(m->lastpos != NULL); 595 assert(lev+1 <= m->g->nplus); 596 m->lastpos[lev+1] = sp; 597 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 598 break; 599 case O_PLUS: 600 if (sp == m->lastpos[lev]) /* last pass matched null */ 601 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 602 /* try another pass */ 603 m->lastpos[lev] = sp; 604 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 605 if (dp == NULL) 606 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 607 else 608 return(dp); 609 break; 610 case OCH_: /* find the right one, if any */ 611 ssub = ss + 1; 612 esub = ss + OPND(s) - 1; 613 assert(OP(m->g->strip[esub]) == OOR1); 614 for (;;) { /* find first matching branch */ 615 dp = backref(m, sp, stop, ssub, esub, lev, rec); 616 if (dp != NULL) 617 return(dp); 618 /* that one missed, try next one */ 619 if (OP(m->g->strip[esub]) == O_CH) 620 return(NULL); /* there is none */ 621 esub++; 622 assert(OP(m->g->strip[esub]) == OOR2); 623 ssub = esub + 1; 624 esub += OPND(m->g->strip[esub]); 625 if (OP(m->g->strip[esub]) == OOR2) 626 esub--; 627 else 628 assert(OP(m->g->strip[esub]) == O_CH); 629 } 630 break; 631 case OLPAREN: /* must undo assignment if rest fails */ 632 i = OPND(s); 633 assert(0 < i && i <= m->g->nsub); 634 offsave = m->pmatch[i].rm_so; 635 m->pmatch[i].rm_so = sp - m->offp; 636 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 637 if (dp != NULL) 638 return(dp); 639 m->pmatch[i].rm_so = offsave; 640 return(NULL); 641 break; 642 case ORPAREN: /* must undo assignment if rest fails */ 643 i = OPND(s); 644 assert(0 < i && i <= m->g->nsub); 645 offsave = m->pmatch[i].rm_eo; 646 m->pmatch[i].rm_eo = sp - m->offp; 647 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 648 if (dp != NULL) 649 return(dp); 650 m->pmatch[i].rm_eo = offsave; 651 return(NULL); 652 break; 653 default: /* uh oh */ 654 assert(nope); 655 break; 656 } 657 658 /* "can't happen" */ 659 assert(nope); 660 /* NOTREACHED */ 661 return NULL; 662} 663 664/* 665 - fast - step through the string at top speed 666 */ 667static char * /* where tentative match ended, or NULL */ 668fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 669{ 670 states st = m->st; 671 states fresh = m->fresh; 672 states tmp = m->tmp; 673 char *p = start; 674 int c; 675 int lastc; /* previous c */ 676 int flagch; 677 int i; 678 char *coldp; /* last p after which no match was underway */ 679 680 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 681 c = OUT; 682 else 683 c = *(start-1); 684 685 CLEAR(st); 686 SET1(st, startst); 687 st = step(m->g, startst, stopst, st, NOTHING, st); 688 ASSIGN(fresh, st); 689 SP("start", st, *p); 690 coldp = NULL; 691 for (;;) { 692 /* next character */ 693 lastc = c; 694 c = (p == m->endp) ? OUT : *p; 695 if (EQ(st, fresh)) 696 coldp = p; 697 698 /* is there an EOL and/or BOL between lastc and c? */ 699 flagch = '\0'; 700 i = 0; 701 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 702 (lastc == OUT && !(m->eflags®_NOTBOL))) { 703 flagch = BOL; 704 i = m->g->nbol; 705 } 706 if ((c == '\n' && m->g->cflags®_NEWLINE) || 707 (c == OUT && !(m->eflags®_NOTEOL)) ) { 708 flagch = (flagch == BOL) ? BOLEOL : EOL; 709 i += m->g->neol; 710 } 711 if (i != 0) { 712 for (; i > 0; i--) 713 st = step(m->g, startst, stopst, 714 st, flagch, st); 715 SP("boleol", st, c); 716 } 717 718 /* how about a word boundary? */ 719 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 720 (c != OUT && ISWORD(c))) 721 flagch = BOW; 722 if ((lastc != OUT && ISWORD(lastc)) && 723 (flagch == EOL || (c != OUT && !ISWORD(c)))) 724 flagch = EOW; 725 if (flagch == BOW || flagch == EOW) { 726 st = step(m->g, startst, stopst, st, flagch, st); 727 SP("boweow", st, c); 728 } 729 730 /* are we done? */ 731 if (ISSET(st, stopst) || p == stop) 732 break; /* NOTE BREAK OUT */ 733 734 /* no, we must deal with this character */ 735 ASSIGN(tmp, st); 736 ASSIGN(st, fresh); 737 assert(c != OUT); 738 st = step(m->g, startst, stopst, tmp, c, st); 739 SP("aft", st, c); 740 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 741 p++; 742 } 743 744 assert(coldp != NULL); 745 m->coldp = coldp; 746 if (ISSET(st, stopst)) 747 return(p+1); 748 else 749 return(NULL); 750} 751 752/* 753 - slow - step through the string more deliberately 754 */ 755static char * /* where it ended */ 756slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 757{ 758 states st = m->st; 759 states empty = m->empty; 760 states tmp = m->tmp; 761 char *p = start; 762 int c; 763 int lastc; /* previous c */ 764 int flagch; 765 int i; 766 char *matchp; /* last p at which a match ended */ 767 768 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 769 c = OUT; 770 else 771 c = *(start-1); 772 773 AT("slow", start, stop, startst, stopst); 774 CLEAR(st); 775 SET1(st, startst); 776 SP("sstart", st, *p); 777 st = step(m->g, startst, stopst, st, NOTHING, st); 778 matchp = NULL; 779 for (;;) { 780 /* next character */ 781 lastc = c; 782 c = (p == m->endp) ? OUT : *p; 783 784 /* is there an EOL and/or BOL between lastc and c? */ 785 flagch = '\0'; 786 i = 0; 787 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 788 (lastc == OUT && !(m->eflags®_NOTBOL))) { 789 flagch = BOL; 790 i = m->g->nbol; 791 } 792 if ((c == '\n' && m->g->cflags®_NEWLINE) || 793 (c == OUT && !(m->eflags®_NOTEOL))) { 794 flagch = (flagch == BOL) ? BOLEOL : EOL; 795 i += m->g->neol; 796 } 797 if (i != 0) { 798 for (; i > 0; i--) 799 st = step(m->g, startst, stopst, 800 st, flagch, st); 801 SP("sboleol", st, c); 802 } 803 804 /* how about a word boundary? */ 805 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 806 (c != OUT && ISWORD(c))) 807 flagch = BOW; 808 if ((lastc != OUT && ISWORD(lastc)) && 809 (flagch == EOL || (c != OUT && !ISWORD(c)))) 810 flagch = EOW; 811 if (flagch == BOW || flagch == EOW) { 812 st = step(m->g, startst, stopst, st, flagch, st); 813 SP("sboweow", st, c); 814 } 815 816 /* are we done? */ 817 if (ISSET(st, stopst)) 818 matchp = p; 819 if (EQ(st, empty) || p == stop) 820 break; /* NOTE BREAK OUT */ 821 822 /* no, we must deal with this character */ 823 ASSIGN(tmp, st); 824 ASSIGN(st, empty); 825 assert(c != OUT); 826 st = step(m->g, startst, stopst, tmp, c, st); 827 SP("saft", st, c); 828 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 829 p++; 830 } 831 832 return(matchp); 833} 834 835 836/* 837 - step - map set of states reachable before char to set reachable after 838 */ 839static states 840step(struct re_guts *g, 841 sopno start, /* start state within strip */ 842 sopno stop, /* state after stop state within strip */ 843 states bef, /* states reachable before */ 844 int ch, /* character or NONCHAR code */ 845 states aft) /* states already known reachable after */ 846{ 847 cset *cs; 848 sop s; 849 sopno pc; 850 onestate here; /* note, macros know this name */ 851 sopno look; 852 int i; 853 854 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 855 s = g->strip[pc]; 856 switch (OP(s)) { 857 case OEND: 858 assert(pc == stop-1); 859 break; 860 case OCHAR: 861 /* only characters can match */ 862 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 863 if (ch == (char)OPND(s)) 864 FWD(aft, bef, 1); 865 break; 866 case OBOL: 867 if (ch == BOL || ch == BOLEOL) 868 FWD(aft, bef, 1); 869 break; 870 case OEOL: 871 if (ch == EOL || ch == BOLEOL) 872 FWD(aft, bef, 1); 873 break; 874 case OBOW: 875 if (ch == BOW) 876 FWD(aft, bef, 1); 877 break; 878 case OEOW: 879 if (ch == EOW) 880 FWD(aft, bef, 1); 881 break; 882 case OANY: 883 if (!NONCHAR(ch)) 884 FWD(aft, bef, 1); 885 break; 886 case OANYOF: 887 cs = &g->sets[OPND(s)]; 888 if (!NONCHAR(ch) && CHIN(cs, ch)) 889 FWD(aft, bef, 1); 890 break; 891 case OBACK_: /* ignored here */ 892 case O_BACK: 893 FWD(aft, aft, 1); 894 break; 895 case OPLUS_: /* forward, this is just an empty */ 896 FWD(aft, aft, 1); 897 break; 898 case O_PLUS: /* both forward and back */ 899 FWD(aft, aft, 1); 900 i = ISSETBACK(aft, OPND(s)); 901 BACK(aft, aft, OPND(s)); 902 if (!i && ISSETBACK(aft, OPND(s))) { 903 /* oho, must reconsider loop body */ 904 pc -= OPND(s) + 1; 905 INIT(here, pc); 906 } 907 break; 908 case OQUEST_: /* two branches, both forward */ 909 FWD(aft, aft, 1); 910 FWD(aft, aft, OPND(s)); 911 break; 912 case O_QUEST: /* just an empty */ 913 FWD(aft, aft, 1); 914 break; 915 case OLPAREN: /* not significant here */ 916 case ORPAREN: 917 FWD(aft, aft, 1); 918 break; 919 case OCH_: /* mark the first two branches */ 920 FWD(aft, aft, 1); 921 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 922 FWD(aft, aft, OPND(s)); 923 break; 924 case OOR1: /* done a branch, find the O_CH */ 925 if (ISSTATEIN(aft, here)) { 926 for (look = 1; 927 OP(s = g->strip[pc+look]) != O_CH; 928 look += OPND(s)) 929 assert(OP(s) == OOR2); 930 FWD(aft, aft, look); 931 } 932 break; 933 case OOR2: /* propagate OCH_'s marking */ 934 FWD(aft, aft, 1); 935 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 936 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 937 FWD(aft, aft, OPND(s)); 938 } 939 break; 940 case O_CH: /* just empty */ 941 FWD(aft, aft, 1); 942 break; 943 default: /* ooooops... */ 944 assert(nope); 945 break; 946 } 947 } 948 949 return(aft); 950} 951 952#ifdef REDEBUG 953/* 954 - print - print a set of states 955 */ 956static void 957print(struct match *m, char *caption, states st, int ch, FILE *d) 958{ 959 struct re_guts *g = m->g; 960 int i; 961 int first = 1; 962 963 if (!(m->eflags®_TRACE)) 964 return; 965 966 (void)fprintf(d, "%s", caption); 967 (void)fprintf(d, " %s", pchar(ch)); 968 for (i = 0; i < g->nstates; i++) { 969 if (ISSET(st, i)) { 970 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 971 first = 0; 972 } 973 } 974 (void)fprintf(d, "\n"); 975} 976 977/* 978 - at - print current situation 979 */ 980static void 981at(struct match *m, char *title, char *start, char *stop, sopno startst, 982 sopno stopst) 983{ 984 if (!(m->eflags®_TRACE)) 985 return; 986 987 (void)printf("%s %s-", title, pchar(*start)); 988 (void)printf("%s ", pchar(*stop)); 989 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 990} 991 992#ifndef PCHARDONE 993#define PCHARDONE /* never again */ 994static const char *nonchars[] = 995 { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; 996#define PNONCHAR(c) \ 997 ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ 998 ? nonchars[(c) - OUT] : "invalid") 999 1000/* 1001 - pchar - make a character printable 1002 * 1003 * Is this identical to regchar() over in debug.c? Well, yes. But a 1004 * duplicate here avoids having a debugging-capable regexec.o tied to 1005 * a matching debug.o, and this is convenient. It all disappears in 1006 * the non-debug compilation anyway, so it doesn't matter much. 1007 */ 1008static const char * /* -> representation */ 1009pchar(int ch) 1010{ 1011 static char pbuf[10]; 1012 1013 if (NONCHAR(ch)) { 1014 if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) 1015 return nonchars[ch - OUT]; 1016 return "invalid"; 1017 } 1018 if (isprint((unsigned char)ch) || ch == ' ') 1019 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1020 else 1021 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1022 return(pbuf); 1023} 1024#endif 1025#endif 1026 1027#undef matcher 1028#undef fast 1029#undef slow 1030#undef dissect 1031#undef backref 1032#undef step 1033#undef print 1034#undef at 1035#undef match 1036#undef nope 1037