engine.c revision 1.22
1/* $OpenBSD: engine.c,v 1.22 2016/05/25 21:01:11 schwarze Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#endif 57#ifdef LNAMES 58#define matcher lmatcher 59#define fast lfast 60#define slow lslow 61#define dissect ldissect 62#define backref lbackref 63#define step lstep 64#define print lprint 65#define at lat 66#define match lmat 67#define nope lnope 68#endif 69 70/* another structure passed up and down to avoid zillions of parameters */ 71struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85}; 86 87static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88static char *dissect(struct match *, char *, char *, sopno, sopno); 89static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90static char *fast(struct match *, char *, char *, sopno, sopno); 91static char *slow(struct match *, char *, char *, sopno, sopno); 92static states step(struct re_guts *, sopno, sopno, states, int, states); 93#define MAX_RECURSION 100 94#define BOL (OUT+1) 95#define EOL (BOL+1) 96#define BOLEOL (BOL+2) 97#define NOTHING (BOL+3) 98#define BOW (BOL+4) 99#define EOW (BOL+5) 100/* update nonchars[] array below when adding fake chars here */ 101#define CODEMAX (BOL+5) /* highest code used */ 102#define NONCHAR(c) ((c) > CHAR_MAX) 103#define NNONCHAR (CODEMAX-CHAR_MAX) 104#ifdef REDEBUG 105static void print(struct match *, char *, states, int, FILE *); 106#endif 107#ifdef REDEBUG 108static void at(struct match *, char *, char *, char *, sopno, sopno); 109#endif 110#ifdef REDEBUG 111static const char *pchar(int); 112#endif 113 114#ifdef REDEBUG 115#define SP(t, s, c) print(m, t, s, c, stdout) 116#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 117#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 118static int nope = 0; 119#else 120#define SP(t, s, c) /* nothing */ 121#define AT(t, p1, p2, s1, s2) /* nothing */ 122#define NOTE(s) /* nothing */ 123#endif 124 125/* 126 - matcher - the actual matching engine 127 */ 128static int /* 0 success, REG_NOMATCH failure */ 129matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 130 int eflags) 131{ 132 char *endp; 133 int i; 134 struct match mv; 135 struct match *m = &mv; 136 char *dp; 137 const sopno gf = g->firststate+1; /* +1 for OEND */ 138 const sopno gl = g->laststate; 139 char *start; 140 char *stop; 141 142 /* simplify the situation where possible */ 143 if (g->cflags®_NOSUB) 144 nmatch = 0; 145 if (eflags®_STARTEND) { 146 start = string + pmatch[0].rm_so; 147 stop = string + pmatch[0].rm_eo; 148 } else { 149 start = string; 150 stop = start + strlen(start); 151 } 152 if (stop < start) 153 return(REG_INVARG); 154 155 /* prescreening; this does wonders for this rather slow code */ 156 if (g->must != NULL) { 157 for (dp = start; dp < stop; dp++) 158 if (*dp == g->must[0] && stop - dp >= g->mlen && 159 memcmp(dp, g->must, (size_t)g->mlen) == 0) 160 break; 161 if (dp == stop) /* we didn't find g->must */ 162 return(REG_NOMATCH); 163 } 164 165 /* match struct setup */ 166 m->g = g; 167 m->eflags = eflags; 168 m->pmatch = NULL; 169 m->lastpos = NULL; 170 m->offp = string; 171 m->beginp = start; 172 m->endp = stop; 173 STATESETUP(m, 4); 174 SETUP(m->st); 175 SETUP(m->fresh); 176 SETUP(m->tmp); 177 SETUP(m->empty); 178 CLEAR(m->empty); 179 180 /* this loop does only one repetition except for backrefs */ 181 for (;;) { 182 endp = fast(m, start, stop, gf, gl); 183 if (endp == NULL) { /* a miss */ 184 free(m->pmatch); 185 free(m->lastpos); 186 STATETEARDOWN(m); 187 return(REG_NOMATCH); 188 } 189 if (nmatch == 0 && !g->backrefs) 190 break; /* no further info needed */ 191 192 /* where? */ 193 assert(m->coldp != NULL); 194 for (;;) { 195 NOTE("finding start"); 196 endp = slow(m, m->coldp, stop, gf, gl); 197 if (endp != NULL) 198 break; 199 assert(m->coldp < m->endp); 200 m->coldp++; 201 } 202 if (nmatch == 1 && !g->backrefs) 203 break; /* no further info needed */ 204 205 /* oh my, he wants the subexpressions... */ 206 if (m->pmatch == NULL) 207 m->pmatch = reallocarray(NULL, m->g->nsub + 1, 208 sizeof(regmatch_t)); 209 if (m->pmatch == NULL) { 210 STATETEARDOWN(m); 211 return(REG_ESPACE); 212 } 213 for (i = 1; i <= m->g->nsub; i++) 214 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 215 if (!g->backrefs && !(m->eflags®_BACKR)) { 216 NOTE("dissecting"); 217 dp = dissect(m, m->coldp, endp, gf, gl); 218 } else { 219 if (g->nplus > 0 && m->lastpos == NULL) 220 m->lastpos = reallocarray(NULL, 221 g->nplus+1, sizeof(char *)); 222 if (g->nplus > 0 && m->lastpos == NULL) { 223 free(m->pmatch); 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 NOTE("backref dissect"); 228 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 229 } 230 if (dp != NULL) 231 break; 232 233 /* uh-oh... we couldn't find a subexpression-level match */ 234 assert(g->backrefs); /* must be back references doing it */ 235 assert(g->nplus == 0 || m->lastpos != NULL); 236 for (;;) { 237 if (dp != NULL || endp <= m->coldp) 238 break; /* defeat */ 239 NOTE("backoff"); 240 endp = slow(m, m->coldp, endp-1, gf, gl); 241 if (endp == NULL) 242 break; /* defeat */ 243 /* try it on a shorter possibility */ 244#ifndef NDEBUG 245 for (i = 1; i <= m->g->nsub; i++) { 246 assert(m->pmatch[i].rm_so == -1); 247 assert(m->pmatch[i].rm_eo == -1); 248 } 249#endif 250 NOTE("backoff dissect"); 251 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 252 } 253 assert(dp == NULL || dp == endp); 254 if (dp != NULL) /* found a shorter one */ 255 break; 256 257 /* despite initial appearances, there is no match here */ 258 NOTE("false alarm"); 259 if (m->coldp == stop) 260 break; 261 start = m->coldp + 1; /* recycle starting later */ 262 } 263 264 /* fill in the details if requested */ 265 if (nmatch > 0) { 266 pmatch[0].rm_so = m->coldp - m->offp; 267 pmatch[0].rm_eo = endp - m->offp; 268 } 269 if (nmatch > 1) { 270 assert(m->pmatch != NULL); 271 for (i = 1; i < nmatch; i++) 272 if (i <= m->g->nsub) 273 pmatch[i] = m->pmatch[i]; 274 else { 275 pmatch[i].rm_so = -1; 276 pmatch[i].rm_eo = -1; 277 } 278 } 279 280 free(m->pmatch); 281 free(m->lastpos); 282 STATETEARDOWN(m); 283 return(0); 284} 285 286/* 287 - dissect - figure out what matched what, no back references 288 */ 289static char * /* == stop (success) always */ 290dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 291{ 292 int i; 293 sopno ss; /* start sop of current subRE */ 294 sopno es; /* end sop of current subRE */ 295 char *sp; /* start of string matched by it */ 296 char *stp; /* string matched by it cannot pass here */ 297 char *rest; /* start of rest of string */ 298 char *tail; /* string unmatched by rest of RE */ 299 sopno ssub; /* start sop of subsubRE */ 300 sopno esub; /* end sop of subsubRE */ 301 char *ssp; /* start of string matched by subsubRE */ 302 char *sep; /* end of string matched by subsubRE */ 303 char *oldssp; /* previous ssp */ 304 char *dp; 305 306 AT("diss", start, stop, startst, stopst); 307 sp = start; 308 for (ss = startst; ss < stopst; ss = es) { 309 /* identify end of subRE */ 310 es = ss; 311 switch (OP(m->g->strip[es])) { 312 case OPLUS_: 313 case OQUEST_: 314 es += OPND(m->g->strip[es]); 315 break; 316 case OCH_: 317 while (OP(m->g->strip[es]) != O_CH) 318 es += OPND(m->g->strip[es]); 319 break; 320 } 321 es++; 322 323 /* figure out what it matched */ 324 switch (OP(m->g->strip[ss])) { 325 case OEND: 326 assert(nope); 327 break; 328 case OCHAR: 329 sp++; 330 break; 331 case OBOL: 332 case OEOL: 333 case OBOW: 334 case OEOW: 335 break; 336 case OANY: 337 case OANYOF: 338 sp++; 339 break; 340 case OBACK_: 341 case O_BACK: 342 assert(nope); 343 break; 344 /* cases where length of match is hard to find */ 345 case OQUEST_: 346 stp = stop; 347 for (;;) { 348 /* how long could this one be? */ 349 rest = slow(m, sp, stp, ss, es); 350 assert(rest != NULL); /* it did match */ 351 /* could the rest match the rest? */ 352 tail = slow(m, rest, stop, es, stopst); 353 if (tail == stop) 354 break; /* yes! */ 355 /* no -- try a shorter match for this one */ 356 stp = rest - 1; 357 assert(stp >= sp); /* it did work */ 358 } 359 ssub = ss + 1; 360 esub = es - 1; 361 /* did innards match? */ 362 if (slow(m, sp, rest, ssub, esub) != NULL) { 363 dp = dissect(m, sp, rest, ssub, esub); 364 assert(dp == rest); 365 } else /* no */ 366 assert(sp == rest); 367 sp = rest; 368 break; 369 case OPLUS_: 370 stp = stop; 371 for (;;) { 372 /* how long could this one be? */ 373 rest = slow(m, sp, stp, ss, es); 374 assert(rest != NULL); /* it did match */ 375 /* could the rest match the rest? */ 376 tail = slow(m, rest, stop, es, stopst); 377 if (tail == stop) 378 break; /* yes! */ 379 /* no -- try a shorter match for this one */ 380 stp = rest - 1; 381 assert(stp >= sp); /* it did work */ 382 } 383 ssub = ss + 1; 384 esub = es - 1; 385 ssp = sp; 386 oldssp = ssp; 387 for (;;) { /* find last match of innards */ 388 sep = slow(m, ssp, rest, ssub, esub); 389 if (sep == NULL || sep == ssp) 390 break; /* failed or matched null */ 391 oldssp = ssp; /* on to next try */ 392 ssp = sep; 393 } 394 if (sep == NULL) { 395 /* last successful match */ 396 sep = ssp; 397 ssp = oldssp; 398 } 399 assert(sep == rest); /* must exhaust substring */ 400 assert(slow(m, ssp, sep, ssub, esub) == rest); 401 dp = dissect(m, ssp, sep, ssub, esub); 402 assert(dp == sep); 403 sp = rest; 404 break; 405 case OCH_: 406 stp = stop; 407 for (;;) { 408 /* how long could this one be? */ 409 rest = slow(m, sp, stp, ss, es); 410 assert(rest != NULL); /* it did match */ 411 /* could the rest match the rest? */ 412 tail = slow(m, rest, stop, es, stopst); 413 if (tail == stop) 414 break; /* yes! */ 415 /* no -- try a shorter match for this one */ 416 stp = rest - 1; 417 assert(stp >= sp); /* it did work */ 418 } 419 ssub = ss + 1; 420 esub = ss + OPND(m->g->strip[ss]) - 1; 421 assert(OP(m->g->strip[esub]) == OOR1); 422 for (;;) { /* find first matching branch */ 423 if (slow(m, sp, rest, ssub, esub) == rest) 424 break; /* it matched all of it */ 425 /* that one missed, try next one */ 426 assert(OP(m->g->strip[esub]) == OOR1); 427 esub++; 428 assert(OP(m->g->strip[esub]) == OOR2); 429 ssub = esub + 1; 430 esub += OPND(m->g->strip[esub]); 431 if (OP(m->g->strip[esub]) == OOR2) 432 esub--; 433 else 434 assert(OP(m->g->strip[esub]) == O_CH); 435 } 436 dp = dissect(m, sp, rest, ssub, esub); 437 assert(dp == rest); 438 sp = rest; 439 break; 440 case O_PLUS: 441 case O_QUEST: 442 case OOR1: 443 case OOR2: 444 case O_CH: 445 assert(nope); 446 break; 447 case OLPAREN: 448 i = OPND(m->g->strip[ss]); 449 assert(0 < i && i <= m->g->nsub); 450 m->pmatch[i].rm_so = sp - m->offp; 451 break; 452 case ORPAREN: 453 i = OPND(m->g->strip[ss]); 454 assert(0 < i && i <= m->g->nsub); 455 m->pmatch[i].rm_eo = sp - m->offp; 456 break; 457 default: /* uh oh */ 458 assert(nope); 459 break; 460 } 461 } 462 463 assert(sp == stop); 464 return(sp); 465} 466 467/* 468 - backref - figure out what matched what, figuring in back references 469 */ 470static char * /* == stop (success) or NULL (failure) */ 471backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 472 sopno lev, int rec) /* PLUS nesting level */ 473{ 474 int i; 475 sopno ss; /* start sop of current subRE */ 476 char *sp; /* start of string matched by it */ 477 sopno ssub; /* start sop of subsubRE */ 478 sopno esub; /* end sop of subsubRE */ 479 char *ssp; /* start of string matched by subsubRE */ 480 char *dp; 481 size_t len; 482 int hard; 483 sop s; 484 regoff_t offsave; 485 cset *cs; 486 487 AT("back", start, stop, startst, stopst); 488 sp = start; 489 490 /* get as far as we can with easy stuff */ 491 hard = 0; 492 for (ss = startst; !hard && ss < stopst; ss++) 493 switch (OP(s = m->g->strip[ss])) { 494 case OCHAR: 495 if (sp == stop || *sp++ != (char)OPND(s)) 496 return(NULL); 497 break; 498 case OANY: 499 if (sp == stop) 500 return(NULL); 501 sp++; 502 break; 503 case OANYOF: 504 cs = &m->g->sets[OPND(s)]; 505 if (sp == stop || !CHIN(cs, *sp++)) 506 return(NULL); 507 break; 508 case OBOL: 509 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 510 (sp > m->offp && sp < m->endp && 511 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 512 { /* yes */ } 513 else 514 return(NULL); 515 break; 516 case OEOL: 517 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 518 (sp < m->endp && *sp == '\n' && 519 (m->g->cflags®_NEWLINE)) ) 520 { /* yes */ } 521 else 522 return(NULL); 523 break; 524 case OBOW: 525 if (sp < m->endp && ISWORD(*sp) && 526 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 527 (sp > m->offp && !ISWORD(*(sp-1))))) 528 { /* yes */ } 529 else 530 return(NULL); 531 break; 532 case OEOW: 533 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 534 (sp < m->endp && *sp == '\n' && 535 (m->g->cflags®_NEWLINE)) || 536 (sp < m->endp && !ISWORD(*sp)) ) && 537 (sp > m->beginp && ISWORD(*(sp-1))) ) 538 { /* yes */ } 539 else 540 return(NULL); 541 break; 542 case O_QUEST: 543 break; 544 case OOR1: /* matches null but needs to skip */ 545 ss++; 546 s = m->g->strip[ss]; 547 do { 548 assert(OP(s) == OOR2); 549 ss += OPND(s); 550 } while (OP(s = m->g->strip[ss]) != O_CH); 551 /* note that the ss++ gets us past the O_CH */ 552 break; 553 default: /* have to make a choice */ 554 hard = 1; 555 break; 556 } 557 if (!hard) { /* that was it! */ 558 if (sp != stop) 559 return(NULL); 560 return(sp); 561 } 562 ss--; /* adjust for the for's final increment */ 563 564 /* the hard stuff */ 565 AT("hard", sp, stop, ss, stopst); 566 s = m->g->strip[ss]; 567 switch (OP(s)) { 568 case OBACK_: /* the vilest depths */ 569 i = OPND(s); 570 assert(0 < i && i <= m->g->nsub); 571 if (m->pmatch[i].rm_eo == -1) 572 return(NULL); 573 assert(m->pmatch[i].rm_so != -1); 574 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 575 if (len == 0 && rec++ > MAX_RECURSION) 576 return(NULL); 577 assert(stop - m->beginp >= len); 578 if (sp > stop - len) 579 return(NULL); /* not enough left to match */ 580 ssp = m->offp + m->pmatch[i].rm_so; 581 if (memcmp(sp, ssp, len) != 0) 582 return(NULL); 583 while (m->g->strip[ss] != SOP(O_BACK, i)) 584 ss++; 585 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 586 break; 587 case OQUEST_: /* to null or not */ 588 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 589 if (dp != NULL) 590 return(dp); /* not */ 591 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 592 break; 593 case OPLUS_: 594 assert(m->lastpos != NULL); 595 assert(lev+1 <= m->g->nplus); 596 m->lastpos[lev+1] = sp; 597 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 598 break; 599 case O_PLUS: 600 if (sp == m->lastpos[lev]) /* last pass matched null */ 601 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 602 /* try another pass */ 603 m->lastpos[lev] = sp; 604 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 605 if (dp == NULL) 606 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 607 else 608 return(dp); 609 break; 610 case OCH_: /* find the right one, if any */ 611 ssub = ss + 1; 612 esub = ss + OPND(s) - 1; 613 assert(OP(m->g->strip[esub]) == OOR1); 614 for (;;) { /* find first matching branch */ 615 dp = backref(m, sp, stop, ssub, esub, lev, rec); 616 if (dp != NULL) 617 return(dp); 618 /* that one missed, try next one */ 619 if (OP(m->g->strip[esub]) == O_CH) 620 return(NULL); /* there is none */ 621 esub++; 622 assert(OP(m->g->strip[esub]) == OOR2); 623 ssub = esub + 1; 624 esub += OPND(m->g->strip[esub]); 625 if (OP(m->g->strip[esub]) == OOR2) 626 esub--; 627 else 628 assert(OP(m->g->strip[esub]) == O_CH); 629 } 630 break; 631 case OLPAREN: /* must undo assignment if rest fails */ 632 i = OPND(s); 633 assert(0 < i && i <= m->g->nsub); 634 offsave = m->pmatch[i].rm_so; 635 m->pmatch[i].rm_so = sp - m->offp; 636 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 637 if (dp != NULL) 638 return(dp); 639 m->pmatch[i].rm_so = offsave; 640 return(NULL); 641 break; 642 case ORPAREN: /* must undo assignment if rest fails */ 643 i = OPND(s); 644 assert(0 < i && i <= m->g->nsub); 645 offsave = m->pmatch[i].rm_eo; 646 m->pmatch[i].rm_eo = sp - m->offp; 647 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 648 if (dp != NULL) 649 return(dp); 650 m->pmatch[i].rm_eo = offsave; 651 return(NULL); 652 break; 653 default: /* uh oh */ 654 assert(nope); 655 break; 656 } 657 658 /* "can't happen" */ 659 assert(nope); 660 /* NOTREACHED */ 661 return NULL; 662} 663 664/* 665 - fast - step through the string at top speed 666 */ 667static char * /* where tentative match ended, or NULL */ 668fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 669{ 670 states st = m->st; 671 states fresh = m->fresh; 672 states tmp = m->tmp; 673 char *p = start; 674 int c = (start == m->beginp) ? OUT : *(start-1); 675 int lastc; /* previous c */ 676 int flagch; 677 int i; 678 char *coldp; /* last p after which no match was underway */ 679 680 CLEAR(st); 681 SET1(st, startst); 682 st = step(m->g, startst, stopst, st, NOTHING, st); 683 ASSIGN(fresh, st); 684 SP("start", st, *p); 685 coldp = NULL; 686 for (;;) { 687 /* next character */ 688 lastc = c; 689 c = (p == m->endp) ? OUT : *p; 690 if (EQ(st, fresh)) 691 coldp = p; 692 693 /* is there an EOL and/or BOL between lastc and c? */ 694 flagch = '\0'; 695 i = 0; 696 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 697 (lastc == OUT && !(m->eflags®_NOTBOL))) { 698 flagch = BOL; 699 i = m->g->nbol; 700 } 701 if ((c == '\n' && m->g->cflags®_NEWLINE) || 702 (c == OUT && !(m->eflags®_NOTEOL)) ) { 703 flagch = (flagch == BOL) ? BOLEOL : EOL; 704 i += m->g->neol; 705 } 706 if (i != 0) { 707 for (; i > 0; i--) 708 st = step(m->g, startst, stopst, 709 st, flagch, st); 710 SP("boleol", st, c); 711 } 712 713 /* how about a word boundary? */ 714 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 715 (c != OUT && ISWORD(c))) 716 flagch = BOW; 717 if ((lastc != OUT && ISWORD(lastc)) && 718 (flagch == EOL || (c != OUT && !ISWORD(c)))) 719 flagch = EOW; 720 if (flagch == BOW || flagch == EOW) { 721 st = step(m->g, startst, stopst, st, flagch, st); 722 SP("boweow", st, c); 723 } 724 725 /* are we done? */ 726 if (ISSET(st, stopst) || p == stop) 727 break; /* NOTE BREAK OUT */ 728 729 /* no, we must deal with this character */ 730 ASSIGN(tmp, st); 731 ASSIGN(st, fresh); 732 assert(c != OUT); 733 st = step(m->g, startst, stopst, tmp, c, st); 734 SP("aft", st, c); 735 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 736 p++; 737 } 738 739 assert(coldp != NULL); 740 m->coldp = coldp; 741 if (ISSET(st, stopst)) 742 return(p+1); 743 else 744 return(NULL); 745} 746 747/* 748 - slow - step through the string more deliberately 749 */ 750static char * /* where it ended */ 751slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 752{ 753 states st = m->st; 754 states empty = m->empty; 755 states tmp = m->tmp; 756 char *p = start; 757 int c = (start == m->beginp) ? OUT : *(start-1); 758 int lastc; /* previous c */ 759 int flagch; 760 int i; 761 char *matchp; /* last p at which a match ended */ 762 763 AT("slow", start, stop, startst, stopst); 764 CLEAR(st); 765 SET1(st, startst); 766 SP("sstart", st, *p); 767 st = step(m->g, startst, stopst, st, NOTHING, st); 768 matchp = NULL; 769 for (;;) { 770 /* next character */ 771 lastc = c; 772 c = (p == m->endp) ? OUT : *p; 773 774 /* is there an EOL and/or BOL between lastc and c? */ 775 flagch = '\0'; 776 i = 0; 777 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 778 (lastc == OUT && !(m->eflags®_NOTBOL))) { 779 flagch = BOL; 780 i = m->g->nbol; 781 } 782 if ((c == '\n' && m->g->cflags®_NEWLINE) || 783 (c == OUT && !(m->eflags®_NOTEOL))) { 784 flagch = (flagch == BOL) ? BOLEOL : EOL; 785 i += m->g->neol; 786 } 787 if (i != 0) { 788 for (; i > 0; i--) 789 st = step(m->g, startst, stopst, 790 st, flagch, st); 791 SP("sboleol", st, c); 792 } 793 794 /* how about a word boundary? */ 795 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 796 (c != OUT && ISWORD(c))) 797 flagch = BOW; 798 if ((lastc != OUT && ISWORD(lastc)) && 799 (flagch == EOL || (c != OUT && !ISWORD(c)))) 800 flagch = EOW; 801 if (flagch == BOW || flagch == EOW) { 802 st = step(m->g, startst, stopst, st, flagch, st); 803 SP("sboweow", st, c); 804 } 805 806 /* are we done? */ 807 if (ISSET(st, stopst)) 808 matchp = p; 809 if (EQ(st, empty) || p == stop) 810 break; /* NOTE BREAK OUT */ 811 812 /* no, we must deal with this character */ 813 ASSIGN(tmp, st); 814 ASSIGN(st, empty); 815 assert(c != OUT); 816 st = step(m->g, startst, stopst, tmp, c, st); 817 SP("saft", st, c); 818 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 819 p++; 820 } 821 822 return(matchp); 823} 824 825 826/* 827 - step - map set of states reachable before char to set reachable after 828 */ 829static states 830step(struct re_guts *g, 831 sopno start, /* start state within strip */ 832 sopno stop, /* state after stop state within strip */ 833 states bef, /* states reachable before */ 834 int ch, /* character or NONCHAR code */ 835 states aft) /* states already known reachable after */ 836{ 837 cset *cs; 838 sop s; 839 sopno pc; 840 onestate here; /* note, macros know this name */ 841 sopno look; 842 int i; 843 844 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 845 s = g->strip[pc]; 846 switch (OP(s)) { 847 case OEND: 848 assert(pc == stop-1); 849 break; 850 case OCHAR: 851 /* only characters can match */ 852 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 853 if (ch == (char)OPND(s)) 854 FWD(aft, bef, 1); 855 break; 856 case OBOL: 857 if (ch == BOL || ch == BOLEOL) 858 FWD(aft, bef, 1); 859 break; 860 case OEOL: 861 if (ch == EOL || ch == BOLEOL) 862 FWD(aft, bef, 1); 863 break; 864 case OBOW: 865 if (ch == BOW) 866 FWD(aft, bef, 1); 867 break; 868 case OEOW: 869 if (ch == EOW) 870 FWD(aft, bef, 1); 871 break; 872 case OANY: 873 if (!NONCHAR(ch)) 874 FWD(aft, bef, 1); 875 break; 876 case OANYOF: 877 cs = &g->sets[OPND(s)]; 878 if (!NONCHAR(ch) && CHIN(cs, ch)) 879 FWD(aft, bef, 1); 880 break; 881 case OBACK_: /* ignored here */ 882 case O_BACK: 883 FWD(aft, aft, 1); 884 break; 885 case OPLUS_: /* forward, this is just an empty */ 886 FWD(aft, aft, 1); 887 break; 888 case O_PLUS: /* both forward and back */ 889 FWD(aft, aft, 1); 890 i = ISSETBACK(aft, OPND(s)); 891 BACK(aft, aft, OPND(s)); 892 if (!i && ISSETBACK(aft, OPND(s))) { 893 /* oho, must reconsider loop body */ 894 pc -= OPND(s) + 1; 895 INIT(here, pc); 896 } 897 break; 898 case OQUEST_: /* two branches, both forward */ 899 FWD(aft, aft, 1); 900 FWD(aft, aft, OPND(s)); 901 break; 902 case O_QUEST: /* just an empty */ 903 FWD(aft, aft, 1); 904 break; 905 case OLPAREN: /* not significant here */ 906 case ORPAREN: 907 FWD(aft, aft, 1); 908 break; 909 case OCH_: /* mark the first two branches */ 910 FWD(aft, aft, 1); 911 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 912 FWD(aft, aft, OPND(s)); 913 break; 914 case OOR1: /* done a branch, find the O_CH */ 915 if (ISSTATEIN(aft, here)) { 916 for (look = 1; 917 OP(s = g->strip[pc+look]) != O_CH; 918 look += OPND(s)) 919 assert(OP(s) == OOR2); 920 FWD(aft, aft, look); 921 } 922 break; 923 case OOR2: /* propagate OCH_'s marking */ 924 FWD(aft, aft, 1); 925 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 926 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 927 FWD(aft, aft, OPND(s)); 928 } 929 break; 930 case O_CH: /* just empty */ 931 FWD(aft, aft, 1); 932 break; 933 default: /* ooooops... */ 934 assert(nope); 935 break; 936 } 937 } 938 939 return(aft); 940} 941 942#ifdef REDEBUG 943/* 944 - print - print a set of states 945 */ 946static void 947print(struct match *m, char *caption, states st, int ch, FILE *d) 948{ 949 struct re_guts *g = m->g; 950 int i; 951 int first = 1; 952 953 if (!(m->eflags®_TRACE)) 954 return; 955 956 (void)fprintf(d, "%s", caption); 957 (void)fprintf(d, " %s", pchar(ch)); 958 for (i = 0; i < g->nstates; i++) { 959 if (ISSET(st, i)) { 960 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 961 first = 0; 962 } 963 } 964 (void)fprintf(d, "\n"); 965} 966 967/* 968 - at - print current situation 969 */ 970static void 971at(struct match *m, char *title, char *start, char *stop, sopno startst, 972 sopno stopst) 973{ 974 if (!(m->eflags®_TRACE)) 975 return; 976 977 (void)printf("%s %s-", title, pchar(*start)); 978 (void)printf("%s ", pchar(*stop)); 979 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 980} 981 982#ifndef PCHARDONE 983#define PCHARDONE /* never again */ 984static const char *nonchars[] = 985 { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; 986#define PNONCHAR(c) \ 987 ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ 988 ? nonchars[(c) - OUT] : "invalid") 989 990/* 991 - pchar - make a character printable 992 * 993 * Is this identical to regchar() over in debug.c? Well, yes. But a 994 * duplicate here avoids having a debugging-capable regexec.o tied to 995 * a matching debug.o, and this is convenient. It all disappears in 996 * the non-debug compilation anyway, so it doesn't matter much. 997 */ 998static const char * /* -> representation */ 999pchar(int ch) 1000{ 1001 static char pbuf[10]; 1002 1003 if (NONCHAR(ch)) { 1004 if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) 1005 return nonchars[ch - OUT]; 1006 return "invalid"; 1007 } 1008 if (isprint((unsigned char)ch) || ch == ' ') 1009 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1010 else 1011 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1012 return(pbuf); 1013} 1014#endif 1015#endif 1016 1017#undef matcher 1018#undef fast 1019#undef slow 1020#undef dissect 1021#undef backref 1022#undef step 1023#undef print 1024#undef at 1025#undef match 1026#undef nope 1027