engine.c revision 1.20
1/* $OpenBSD: engine.c,v 1.20 2016/05/17 22:03:18 schwarze Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#endif 57#ifdef LNAMES 58#define matcher lmatcher 59#define fast lfast 60#define slow lslow 61#define dissect ldissect 62#define backref lbackref 63#define step lstep 64#define print lprint 65#define at lat 66#define match lmat 67#define nope lnope 68#endif 69 70/* another structure passed up and down to avoid zillions of parameters */ 71struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85}; 86 87static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88static char *dissect(struct match *, char *, char *, sopno, sopno); 89static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90static char *fast(struct match *, char *, char *, sopno, sopno); 91static char *slow(struct match *, char *, char *, sopno, sopno); 92static states step(struct re_guts *, sopno, sopno, states, int, states); 93#define MAX_RECURSION 100 94#define BOL (OUT+1) 95#define EOL (BOL+1) 96#define BOLEOL (BOL+2) 97#define NOTHING (BOL+3) 98#define BOW (BOL+4) 99#define EOW (BOL+5) 100/* update nonchars[] array below when adding fake chars here */ 101#define CODEMAX (BOL+5) /* highest code used */ 102#define NONCHAR(c) ((c) > CHAR_MAX) 103#define NNONCHAR (CODEMAX-CHAR_MAX) 104#ifdef REDEBUG 105static void print(struct match *, char *, states, int, FILE *); 106#endif 107#ifdef REDEBUG 108static void at(struct match *, char *, char *, char *, sopno, sopno); 109#endif 110#ifdef REDEBUG 111static const char *pchar(int); 112#endif 113 114#ifdef REDEBUG 115#define SP(t, s, c) print(m, t, s, c, stdout) 116#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 117#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 118static int nope = 0; 119#else 120#define SP(t, s, c) /* nothing */ 121#define AT(t, p1, p2, s1, s2) /* nothing */ 122#define NOTE(s) /* nothing */ 123#endif 124 125/* 126 - matcher - the actual matching engine 127 */ 128static int /* 0 success, REG_NOMATCH failure */ 129matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 130 int eflags) 131{ 132 char *endp; 133 int i; 134 struct match mv; 135 struct match *m = &mv; 136 char *dp; 137 const sopno gf = g->firststate+1; /* +1 for OEND */ 138 const sopno gl = g->laststate; 139 char *start; 140 char *stop; 141 142 /* simplify the situation where possible */ 143 if (g->cflags®_NOSUB) 144 nmatch = 0; 145 if (eflags®_STARTEND) { 146 start = string + pmatch[0].rm_so; 147 stop = string + pmatch[0].rm_eo; 148 } else { 149 start = string; 150 stop = start + strlen(start); 151 } 152 if (stop < start) 153 return(REG_INVARG); 154 155 /* prescreening; this does wonders for this rather slow code */ 156 if (g->must != NULL) { 157 for (dp = start; dp < stop; dp++) 158 if (*dp == g->must[0] && stop - dp >= g->mlen && 159 memcmp(dp, g->must, (size_t)g->mlen) == 0) 160 break; 161 if (dp == stop) /* we didn't find g->must */ 162 return(REG_NOMATCH); 163 } 164 165 /* match struct setup */ 166 m->g = g; 167 m->eflags = eflags; 168 m->pmatch = NULL; 169 m->lastpos = NULL; 170 m->offp = string; 171 m->beginp = start; 172 m->endp = stop; 173 STATESETUP(m, 4); 174 SETUP(m->st); 175 SETUP(m->fresh); 176 SETUP(m->tmp); 177 SETUP(m->empty); 178 CLEAR(m->empty); 179 180 /* this loop does only one repetition except for backrefs */ 181 for (;;) { 182 endp = fast(m, start, stop, gf, gl); 183 if (endp == NULL) { /* a miss */ 184 free(m->pmatch); 185 free(m->lastpos); 186 STATETEARDOWN(m); 187 return(REG_NOMATCH); 188 } 189 if (nmatch == 0 && !g->backrefs) 190 break; /* no further info needed */ 191 192 /* where? */ 193 assert(m->coldp != NULL); 194 for (;;) { 195 NOTE("finding start"); 196 endp = slow(m, m->coldp, stop, gf, gl); 197 if (endp != NULL) 198 break; 199 assert(m->coldp < m->endp); 200 m->coldp++; 201 } 202 if (nmatch == 1 && !g->backrefs) 203 break; /* no further info needed */ 204 205 /* oh my, he wants the subexpressions... */ 206 if (m->pmatch == NULL) 207 m->pmatch = reallocarray(NULL, m->g->nsub + 1, 208 sizeof(regmatch_t)); 209 if (m->pmatch == NULL) { 210 STATETEARDOWN(m); 211 return(REG_ESPACE); 212 } 213 for (i = 1; i <= m->g->nsub; i++) 214 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 215 if (!g->backrefs && !(m->eflags®_BACKR)) { 216 NOTE("dissecting"); 217 dp = dissect(m, m->coldp, endp, gf, gl); 218 } else { 219 if (g->nplus > 0 && m->lastpos == NULL) 220 m->lastpos = reallocarray(NULL, 221 g->nplus+1, sizeof(char *)); 222 if (g->nplus > 0 && m->lastpos == NULL) { 223 free(m->pmatch); 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 NOTE("backref dissect"); 228 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 229 } 230 if (dp != NULL) 231 break; 232 233 /* uh-oh... we couldn't find a subexpression-level match */ 234 assert(g->backrefs); /* must be back references doing it */ 235 assert(g->nplus == 0 || m->lastpos != NULL); 236 for (;;) { 237 if (dp != NULL || endp <= m->coldp) 238 break; /* defeat */ 239 NOTE("backoff"); 240 endp = slow(m, m->coldp, endp-1, gf, gl); 241 if (endp == NULL) 242 break; /* defeat */ 243 /* try it on a shorter possibility */ 244#ifndef NDEBUG 245 for (i = 1; i <= m->g->nsub; i++) { 246 assert(m->pmatch[i].rm_so == -1); 247 assert(m->pmatch[i].rm_eo == -1); 248 } 249#endif 250 NOTE("backoff dissect"); 251 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 252 } 253 assert(dp == NULL || dp == endp); 254 if (dp != NULL) /* found a shorter one */ 255 break; 256 257 /* despite initial appearances, there is no match here */ 258 NOTE("false alarm"); 259 if (m->coldp == stop) 260 break; 261 start = m->coldp + 1; /* recycle starting later */ 262 } 263 264 /* fill in the details if requested */ 265 if (nmatch > 0) { 266 pmatch[0].rm_so = m->coldp - m->offp; 267 pmatch[0].rm_eo = endp - m->offp; 268 } 269 if (nmatch > 1) { 270 assert(m->pmatch != NULL); 271 for (i = 1; i < nmatch; i++) 272 if (i <= m->g->nsub) 273 pmatch[i] = m->pmatch[i]; 274 else { 275 pmatch[i].rm_so = -1; 276 pmatch[i].rm_eo = -1; 277 } 278 } 279 280 free(m->pmatch); 281 free(m->lastpos); 282 STATETEARDOWN(m); 283 return(0); 284} 285 286/* 287 - dissect - figure out what matched what, no back references 288 */ 289static char * /* == stop (success) always */ 290dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 291{ 292 int i; 293 sopno ss; /* start sop of current subRE */ 294 sopno es; /* end sop of current subRE */ 295 char *sp; /* start of string matched by it */ 296 char *stp; /* string matched by it cannot pass here */ 297 char *rest; /* start of rest of string */ 298 char *tail; /* string unmatched by rest of RE */ 299 sopno ssub; /* start sop of subsubRE */ 300 sopno esub; /* end sop of subsubRE */ 301 char *ssp; /* start of string matched by subsubRE */ 302 char *sep; /* end of string matched by subsubRE */ 303 char *oldssp; /* previous ssp */ 304 char *dp; 305 306 AT("diss", start, stop, startst, stopst); 307 sp = start; 308 for (ss = startst; ss < stopst; ss = es) { 309 /* identify end of subRE */ 310 es = ss; 311 switch (OP(m->g->strip[es])) { 312 case OPLUS_: 313 case OQUEST_: 314 es += OPND(m->g->strip[es]); 315 break; 316 case OCH_: 317 while (OP(m->g->strip[es]) != O_CH) 318 es += OPND(m->g->strip[es]); 319 break; 320 } 321 es++; 322 323 /* figure out what it matched */ 324 switch (OP(m->g->strip[ss])) { 325 case OEND: 326 assert(nope); 327 break; 328 case OCHAR: 329 sp++; 330 break; 331 case OBOL: 332 case OEOL: 333 case OBOW: 334 case OEOW: 335 break; 336 case OANY: 337 case OANYOF: 338 sp++; 339 break; 340 case OBACK_: 341 case O_BACK: 342 assert(nope); 343 break; 344 /* cases where length of match is hard to find */ 345 case OQUEST_: 346 stp = stop; 347 for (;;) { 348 /* how long could this one be? */ 349 rest = slow(m, sp, stp, ss, es); 350 assert(rest != NULL); /* it did match */ 351 /* could the rest match the rest? */ 352 tail = slow(m, rest, stop, es, stopst); 353 if (tail == stop) 354 break; /* yes! */ 355 /* no -- try a shorter match for this one */ 356 stp = rest - 1; 357 assert(stp >= sp); /* it did work */ 358 } 359 ssub = ss + 1; 360 esub = es - 1; 361 /* did innards match? */ 362 if (slow(m, sp, rest, ssub, esub) != NULL) { 363 dp = dissect(m, sp, rest, ssub, esub); 364 assert(dp == rest); 365 } else /* no */ 366 assert(sp == rest); 367 sp = rest; 368 break; 369 case OPLUS_: 370 stp = stop; 371 for (;;) { 372 /* how long could this one be? */ 373 rest = slow(m, sp, stp, ss, es); 374 assert(rest != NULL); /* it did match */ 375 /* could the rest match the rest? */ 376 tail = slow(m, rest, stop, es, stopst); 377 if (tail == stop) 378 break; /* yes! */ 379 /* no -- try a shorter match for this one */ 380 stp = rest - 1; 381 assert(stp >= sp); /* it did work */ 382 } 383 ssub = ss + 1; 384 esub = es - 1; 385 ssp = sp; 386 oldssp = ssp; 387 for (;;) { /* find last match of innards */ 388 sep = slow(m, ssp, rest, ssub, esub); 389 if (sep == NULL || sep == ssp) 390 break; /* failed or matched null */ 391 oldssp = ssp; /* on to next try */ 392 ssp = sep; 393 } 394 if (sep == NULL) { 395 /* last successful match */ 396 sep = ssp; 397 ssp = oldssp; 398 } 399 assert(sep == rest); /* must exhaust substring */ 400 assert(slow(m, ssp, sep, ssub, esub) == rest); 401 dp = dissect(m, ssp, sep, ssub, esub); 402 assert(dp == sep); 403 sp = rest; 404 break; 405 case OCH_: 406 stp = stop; 407 for (;;) { 408 /* how long could this one be? */ 409 rest = slow(m, sp, stp, ss, es); 410 assert(rest != NULL); /* it did match */ 411 /* could the rest match the rest? */ 412 tail = slow(m, rest, stop, es, stopst); 413 if (tail == stop) 414 break; /* yes! */ 415 /* no -- try a shorter match for this one */ 416 stp = rest - 1; 417 assert(stp >= sp); /* it did work */ 418 } 419 ssub = ss + 1; 420 esub = ss + OPND(m->g->strip[ss]) - 1; 421 assert(OP(m->g->strip[esub]) == OOR1); 422 for (;;) { /* find first matching branch */ 423 if (slow(m, sp, rest, ssub, esub) == rest) 424 break; /* it matched all of it */ 425 /* that one missed, try next one */ 426 assert(OP(m->g->strip[esub]) == OOR1); 427 esub++; 428 assert(OP(m->g->strip[esub]) == OOR2); 429 ssub = esub + 1; 430 esub += OPND(m->g->strip[esub]); 431 if (OP(m->g->strip[esub]) == OOR2) 432 esub--; 433 else 434 assert(OP(m->g->strip[esub]) == O_CH); 435 } 436 dp = dissect(m, sp, rest, ssub, esub); 437 assert(dp == rest); 438 sp = rest; 439 break; 440 case O_PLUS: 441 case O_QUEST: 442 case OOR1: 443 case OOR2: 444 case O_CH: 445 assert(nope); 446 break; 447 case OLPAREN: 448 i = OPND(m->g->strip[ss]); 449 assert(0 < i && i <= m->g->nsub); 450 m->pmatch[i].rm_so = sp - m->offp; 451 break; 452 case ORPAREN: 453 i = OPND(m->g->strip[ss]); 454 assert(0 < i && i <= m->g->nsub); 455 m->pmatch[i].rm_eo = sp - m->offp; 456 break; 457 default: /* uh oh */ 458 assert(nope); 459 break; 460 } 461 } 462 463 assert(sp == stop); 464 return(sp); 465} 466 467/* 468 - backref - figure out what matched what, figuring in back references 469 */ 470static char * /* == stop (success) or NULL (failure) */ 471backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 472 sopno lev, int rec) /* PLUS nesting level */ 473{ 474 int i; 475 sopno ss; /* start sop of current subRE */ 476 char *sp; /* start of string matched by it */ 477 sopno ssub; /* start sop of subsubRE */ 478 sopno esub; /* end sop of subsubRE */ 479 char *ssp; /* start of string matched by subsubRE */ 480 char *dp; 481 size_t len; 482 int hard; 483 sop s; 484 regoff_t offsave; 485 cset *cs; 486 487 AT("back", start, stop, startst, stopst); 488 sp = start; 489 490 /* get as far as we can with easy stuff */ 491 hard = 0; 492 for (ss = startst; !hard && ss < stopst; ss++) 493 switch (OP(s = m->g->strip[ss])) { 494 case OCHAR: 495 if (sp == stop || *sp++ != (char)OPND(s)) 496 return(NULL); 497 break; 498 case OANY: 499 if (sp == stop) 500 return(NULL); 501 sp++; 502 break; 503 case OANYOF: 504 cs = &m->g->sets[OPND(s)]; 505 if (sp == stop || !CHIN(cs, *sp++)) 506 return(NULL); 507 break; 508 case OBOL: 509 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 510 (sp > m->offp && sp < m->endp && 511 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 512 { /* yes */ } 513 else 514 return(NULL); 515 break; 516 case OEOL: 517 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 518 (sp < m->endp && *sp == '\n' && 519 (m->g->cflags®_NEWLINE)) ) 520 { /* yes */ } 521 else 522 return(NULL); 523 break; 524 case OBOW: 525 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 526 (sp < m->endp && *(sp-1) == '\n' && 527 (m->g->cflags®_NEWLINE)) || 528 (sp > m->beginp && 529 !ISWORD(*(sp-1))) ) && 530 (sp < m->endp && ISWORD(*sp)) ) 531 { /* yes */ } 532 else 533 return(NULL); 534 break; 535 case OEOW: 536 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 537 (sp < m->endp && *sp == '\n' && 538 (m->g->cflags®_NEWLINE)) || 539 (sp < m->endp && !ISWORD(*sp)) ) && 540 (sp > m->beginp && ISWORD(*(sp-1))) ) 541 { /* yes */ } 542 else 543 return(NULL); 544 break; 545 case O_QUEST: 546 break; 547 case OOR1: /* matches null but needs to skip */ 548 ss++; 549 s = m->g->strip[ss]; 550 do { 551 assert(OP(s) == OOR2); 552 ss += OPND(s); 553 } while (OP(s = m->g->strip[ss]) != O_CH); 554 /* note that the ss++ gets us past the O_CH */ 555 break; 556 default: /* have to make a choice */ 557 hard = 1; 558 break; 559 } 560 if (!hard) { /* that was it! */ 561 if (sp != stop) 562 return(NULL); 563 return(sp); 564 } 565 ss--; /* adjust for the for's final increment */ 566 567 /* the hard stuff */ 568 AT("hard", sp, stop, ss, stopst); 569 s = m->g->strip[ss]; 570 switch (OP(s)) { 571 case OBACK_: /* the vilest depths */ 572 i = OPND(s); 573 assert(0 < i && i <= m->g->nsub); 574 if (m->pmatch[i].rm_eo == -1) 575 return(NULL); 576 assert(m->pmatch[i].rm_so != -1); 577 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 578 if (len == 0 && rec++ > MAX_RECURSION) 579 return(NULL); 580 assert(stop - m->beginp >= len); 581 if (sp > stop - len) 582 return(NULL); /* not enough left to match */ 583 ssp = m->offp + m->pmatch[i].rm_so; 584 if (memcmp(sp, ssp, len) != 0) 585 return(NULL); 586 while (m->g->strip[ss] != SOP(O_BACK, i)) 587 ss++; 588 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 589 break; 590 case OQUEST_: /* to null or not */ 591 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 592 if (dp != NULL) 593 return(dp); /* not */ 594 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 595 break; 596 case OPLUS_: 597 assert(m->lastpos != NULL); 598 assert(lev+1 <= m->g->nplus); 599 m->lastpos[lev+1] = sp; 600 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 601 break; 602 case O_PLUS: 603 if (sp == m->lastpos[lev]) /* last pass matched null */ 604 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 605 /* try another pass */ 606 m->lastpos[lev] = sp; 607 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 608 if (dp == NULL) 609 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 610 else 611 return(dp); 612 break; 613 case OCH_: /* find the right one, if any */ 614 ssub = ss + 1; 615 esub = ss + OPND(s) - 1; 616 assert(OP(m->g->strip[esub]) == OOR1); 617 for (;;) { /* find first matching branch */ 618 dp = backref(m, sp, stop, ssub, esub, lev, rec); 619 if (dp != NULL) 620 return(dp); 621 /* that one missed, try next one */ 622 if (OP(m->g->strip[esub]) == O_CH) 623 return(NULL); /* there is none */ 624 esub++; 625 assert(OP(m->g->strip[esub]) == OOR2); 626 ssub = esub + 1; 627 esub += OPND(m->g->strip[esub]); 628 if (OP(m->g->strip[esub]) == OOR2) 629 esub--; 630 else 631 assert(OP(m->g->strip[esub]) == O_CH); 632 } 633 break; 634 case OLPAREN: /* must undo assignment if rest fails */ 635 i = OPND(s); 636 assert(0 < i && i <= m->g->nsub); 637 offsave = m->pmatch[i].rm_so; 638 m->pmatch[i].rm_so = sp - m->offp; 639 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 640 if (dp != NULL) 641 return(dp); 642 m->pmatch[i].rm_so = offsave; 643 return(NULL); 644 break; 645 case ORPAREN: /* must undo assignment if rest fails */ 646 i = OPND(s); 647 assert(0 < i && i <= m->g->nsub); 648 offsave = m->pmatch[i].rm_eo; 649 m->pmatch[i].rm_eo = sp - m->offp; 650 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 651 if (dp != NULL) 652 return(dp); 653 m->pmatch[i].rm_eo = offsave; 654 return(NULL); 655 break; 656 default: /* uh oh */ 657 assert(nope); 658 break; 659 } 660 661 /* "can't happen" */ 662 assert(nope); 663 /* NOTREACHED */ 664 return NULL; 665} 666 667/* 668 - fast - step through the string at top speed 669 */ 670static char * /* where tentative match ended, or NULL */ 671fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 672{ 673 states st = m->st; 674 states fresh = m->fresh; 675 states tmp = m->tmp; 676 char *p = start; 677 int c = (start == m->beginp) ? OUT : *(start-1); 678 int lastc; /* previous c */ 679 int flagch; 680 int i; 681 char *coldp; /* last p after which no match was underway */ 682 683 CLEAR(st); 684 SET1(st, startst); 685 st = step(m->g, startst, stopst, st, NOTHING, st); 686 ASSIGN(fresh, st); 687 SP("start", st, *p); 688 coldp = NULL; 689 for (;;) { 690 /* next character */ 691 lastc = c; 692 c = (p == m->endp) ? OUT : *p; 693 if (EQ(st, fresh)) 694 coldp = p; 695 696 /* is there an EOL and/or BOL between lastc and c? */ 697 flagch = '\0'; 698 i = 0; 699 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 700 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 701 flagch = BOL; 702 i = m->g->nbol; 703 } 704 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 705 (c == OUT && !(m->eflags®_NOTEOL)) ) { 706 flagch = (flagch == BOL) ? BOLEOL : EOL; 707 i += m->g->neol; 708 } 709 if (i != 0) { 710 for (; i > 0; i--) 711 st = step(m->g, startst, stopst, st, flagch, st); 712 SP("boleol", st, c); 713 } 714 715 /* how about a word boundary? */ 716 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 717 (c != OUT && ISWORD(c)) ) { 718 flagch = BOW; 719 } 720 if ( (lastc != OUT && ISWORD(lastc)) && 721 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 722 flagch = EOW; 723 } 724 if (flagch == BOW || flagch == EOW) { 725 st = step(m->g, startst, stopst, st, flagch, st); 726 SP("boweow", st, c); 727 } 728 729 /* are we done? */ 730 if (ISSET(st, stopst) || p == stop) 731 break; /* NOTE BREAK OUT */ 732 733 /* no, we must deal with this character */ 734 ASSIGN(tmp, st); 735 ASSIGN(st, fresh); 736 assert(c != OUT); 737 st = step(m->g, startst, stopst, tmp, c, st); 738 SP("aft", st, c); 739 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 740 p++; 741 } 742 743 assert(coldp != NULL); 744 m->coldp = coldp; 745 if (ISSET(st, stopst)) 746 return(p+1); 747 else 748 return(NULL); 749} 750 751/* 752 - slow - step through the string more deliberately 753 */ 754static char * /* where it ended */ 755slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 756{ 757 states st = m->st; 758 states empty = m->empty; 759 states tmp = m->tmp; 760 char *p = start; 761 int c = (start == m->beginp) ? OUT : *(start-1); 762 int lastc; /* previous c */ 763 int flagch; 764 int i; 765 char *matchp; /* last p at which a match ended */ 766 767 AT("slow", start, stop, startst, stopst); 768 CLEAR(st); 769 SET1(st, startst); 770 SP("sstart", st, *p); 771 st = step(m->g, startst, stopst, st, NOTHING, st); 772 matchp = NULL; 773 for (;;) { 774 /* next character */ 775 lastc = c; 776 c = (p == m->endp) ? OUT : *p; 777 778 /* is there an EOL and/or BOL between lastc and c? */ 779 flagch = '\0'; 780 i = 0; 781 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 782 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 783 flagch = BOL; 784 i = m->g->nbol; 785 } 786 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 787 (c == OUT && !(m->eflags®_NOTEOL)) ) { 788 flagch = (flagch == BOL) ? BOLEOL : EOL; 789 i += m->g->neol; 790 } 791 if (i != 0) { 792 for (; i > 0; i--) 793 st = step(m->g, startst, stopst, st, flagch, st); 794 SP("sboleol", st, c); 795 } 796 797 /* how about a word boundary? */ 798 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 799 (c != OUT && ISWORD(c)) ) { 800 flagch = BOW; 801 } 802 if ( (lastc != OUT && ISWORD(lastc)) && 803 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 804 flagch = EOW; 805 } 806 if (flagch == BOW || flagch == EOW) { 807 st = step(m->g, startst, stopst, st, flagch, st); 808 SP("sboweow", st, c); 809 } 810 811 /* are we done? */ 812 if (ISSET(st, stopst)) 813 matchp = p; 814 if (EQ(st, empty) || p == stop) 815 break; /* NOTE BREAK OUT */ 816 817 /* no, we must deal with this character */ 818 ASSIGN(tmp, st); 819 ASSIGN(st, empty); 820 assert(c != OUT); 821 st = step(m->g, startst, stopst, tmp, c, st); 822 SP("saft", st, c); 823 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 824 p++; 825 } 826 827 return(matchp); 828} 829 830 831/* 832 - step - map set of states reachable before char to set reachable after 833 */ 834static states 835step(struct re_guts *g, 836 sopno start, /* start state within strip */ 837 sopno stop, /* state after stop state within strip */ 838 states bef, /* states reachable before */ 839 int ch, /* character or NONCHAR code */ 840 states aft) /* states already known reachable after */ 841{ 842 cset *cs; 843 sop s; 844 sopno pc; 845 onestate here; /* note, macros know this name */ 846 sopno look; 847 int i; 848 849 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 850 s = g->strip[pc]; 851 switch (OP(s)) { 852 case OEND: 853 assert(pc == stop-1); 854 break; 855 case OCHAR: 856 /* only characters can match */ 857 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 858 if (ch == (char)OPND(s)) 859 FWD(aft, bef, 1); 860 break; 861 case OBOL: 862 if (ch == BOL || ch == BOLEOL) 863 FWD(aft, bef, 1); 864 break; 865 case OEOL: 866 if (ch == EOL || ch == BOLEOL) 867 FWD(aft, bef, 1); 868 break; 869 case OBOW: 870 if (ch == BOW) 871 FWD(aft, bef, 1); 872 break; 873 case OEOW: 874 if (ch == EOW) 875 FWD(aft, bef, 1); 876 break; 877 case OANY: 878 if (!NONCHAR(ch)) 879 FWD(aft, bef, 1); 880 break; 881 case OANYOF: 882 cs = &g->sets[OPND(s)]; 883 if (!NONCHAR(ch) && CHIN(cs, ch)) 884 FWD(aft, bef, 1); 885 break; 886 case OBACK_: /* ignored here */ 887 case O_BACK: 888 FWD(aft, aft, 1); 889 break; 890 case OPLUS_: /* forward, this is just an empty */ 891 FWD(aft, aft, 1); 892 break; 893 case O_PLUS: /* both forward and back */ 894 FWD(aft, aft, 1); 895 i = ISSETBACK(aft, OPND(s)); 896 BACK(aft, aft, OPND(s)); 897 if (!i && ISSETBACK(aft, OPND(s))) { 898 /* oho, must reconsider loop body */ 899 pc -= OPND(s) + 1; 900 INIT(here, pc); 901 } 902 break; 903 case OQUEST_: /* two branches, both forward */ 904 FWD(aft, aft, 1); 905 FWD(aft, aft, OPND(s)); 906 break; 907 case O_QUEST: /* just an empty */ 908 FWD(aft, aft, 1); 909 break; 910 case OLPAREN: /* not significant here */ 911 case ORPAREN: 912 FWD(aft, aft, 1); 913 break; 914 case OCH_: /* mark the first two branches */ 915 FWD(aft, aft, 1); 916 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 917 FWD(aft, aft, OPND(s)); 918 break; 919 case OOR1: /* done a branch, find the O_CH */ 920 if (ISSTATEIN(aft, here)) { 921 for (look = 1; 922 OP(s = g->strip[pc+look]) != O_CH; 923 look += OPND(s)) 924 assert(OP(s) == OOR2); 925 FWD(aft, aft, look); 926 } 927 break; 928 case OOR2: /* propagate OCH_'s marking */ 929 FWD(aft, aft, 1); 930 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 931 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 932 FWD(aft, aft, OPND(s)); 933 } 934 break; 935 case O_CH: /* just empty */ 936 FWD(aft, aft, 1); 937 break; 938 default: /* ooooops... */ 939 assert(nope); 940 break; 941 } 942 } 943 944 return(aft); 945} 946 947#ifdef REDEBUG 948/* 949 - print - print a set of states 950 */ 951static void 952print(struct match *m, char *caption, states st, int ch, FILE *d) 953{ 954 struct re_guts *g = m->g; 955 int i; 956 int first = 1; 957 958 if (!(m->eflags®_TRACE)) 959 return; 960 961 (void)fprintf(d, "%s", caption); 962 (void)fprintf(d, " %s", pchar(ch)); 963 for (i = 0; i < g->nstates; i++) 964 if (ISSET(st, i)) { 965 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 966 first = 0; 967 } 968 (void)fprintf(d, "\n"); 969} 970 971/* 972 - at - print current situation 973 */ 974static void 975at(struct match *m, char *title, char *start, char *stop, sopno startst, 976 sopno stopst) 977{ 978 if (!(m->eflags®_TRACE)) 979 return; 980 981 (void)printf("%s %s-", title, pchar(*start)); 982 (void)printf("%s ", pchar(*stop)); 983 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 984} 985 986#ifndef PCHARDONE 987#define PCHARDONE /* never again */ 988static const char *nonchars[] = 989 { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; 990#define PNONCHAR(c) \ 991 ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ 992 ? nonchars[(c) - OUT] : "invalid") 993 994/* 995 - pchar - make a character printable 996 * 997 * Is this identical to regchar() over in debug.c? Well, yes. But a 998 * duplicate here avoids having a debugging-capable regexec.o tied to 999 * a matching debug.o, and this is convenient. It all disappears in 1000 * the non-debug compilation anyway, so it doesn't matter much. 1001 */ 1002static const char * /* -> representation */ 1003pchar(int ch) 1004{ 1005 static char pbuf[10]; 1006 1007 if (NONCHAR(ch)) { 1008 if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) 1009 return nonchars[ch - OUT]; 1010 return "invalid"; 1011 } 1012 if (isprint((unsigned char)ch) || ch == ' ') 1013 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1014 else 1015 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1016 return(pbuf); 1017} 1018#endif 1019#endif 1020 1021#undef matcher 1022#undef fast 1023#undef slow 1024#undef dissect 1025#undef backref 1026#undef step 1027#undef print 1028#undef at 1029#undef match 1030#undef nope 1031