engine.c revision 1.17
1/* $OpenBSD: engine.c,v 1.17 2013/11/28 05:09:45 guenther Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#endif 57#ifdef LNAMES 58#define matcher lmatcher 59#define fast lfast 60#define slow lslow 61#define dissect ldissect 62#define backref lbackref 63#define step lstep 64#define print lprint 65#define at lat 66#define match lmat 67#define nope lnope 68#endif 69 70/* another structure passed up and down to avoid zillions of parameters */ 71struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85}; 86 87static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88static char *dissect(struct match *, char *, char *, sopno, sopno); 89static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90static char *fast(struct match *, char *, char *, sopno, sopno); 91static char *slow(struct match *, char *, char *, sopno, sopno); 92static states step(struct re_guts *, sopno, sopno, states, int, states); 93#define MAX_RECURSION 100 94#define BOL (OUT+1) 95#define EOL (BOL+1) 96#define BOLEOL (BOL+2) 97#define NOTHING (BOL+3) 98#define BOW (BOL+4) 99#define EOW (BOL+5) 100/* update nonchars[] array below when adding fake chars here */ 101#define CODEMAX (BOL+5) /* highest code used */ 102#define NONCHAR(c) ((c) > CHAR_MAX) 103#define NNONCHAR (CODEMAX-CHAR_MAX) 104#ifdef REDEBUG 105static void print(struct match *, char *, states, int, FILE *); 106#endif 107#ifdef REDEBUG 108static void at(struct match *, char *, char *, char *, sopno, sopno); 109#endif 110#ifdef REDEBUG 111static const char *pchar(int); 112#endif 113 114#ifdef REDEBUG 115#define SP(t, s, c) print(m, t, s, c, stdout) 116#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 117#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 118static int nope = 0; 119#else 120#define SP(t, s, c) /* nothing */ 121#define AT(t, p1, p2, s1, s2) /* nothing */ 122#define NOTE(s) /* nothing */ 123#endif 124 125/* 126 - matcher - the actual matching engine 127 */ 128static int /* 0 success, REG_NOMATCH failure */ 129matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 130 int eflags) 131{ 132 char *endp; 133 int i; 134 struct match mv; 135 struct match *m = &mv; 136 char *dp; 137 const sopno gf = g->firststate+1; /* +1 for OEND */ 138 const sopno gl = g->laststate; 139 char *start; 140 char *stop; 141 142 /* simplify the situation where possible */ 143 if (g->cflags®_NOSUB) 144 nmatch = 0; 145 if (eflags®_STARTEND) { 146 start = string + pmatch[0].rm_so; 147 stop = string + pmatch[0].rm_eo; 148 } else { 149 start = string; 150 stop = start + strlen(start); 151 } 152 if (stop < start) 153 return(REG_INVARG); 154 155 /* prescreening; this does wonders for this rather slow code */ 156 if (g->must != NULL) { 157 for (dp = start; dp < stop; dp++) 158 if (*dp == g->must[0] && stop - dp >= g->mlen && 159 memcmp(dp, g->must, (size_t)g->mlen) == 0) 160 break; 161 if (dp == stop) /* we didn't find g->must */ 162 return(REG_NOMATCH); 163 } 164 165 /* match struct setup */ 166 m->g = g; 167 m->eflags = eflags; 168 m->pmatch = NULL; 169 m->lastpos = NULL; 170 m->offp = string; 171 m->beginp = start; 172 m->endp = stop; 173 STATESETUP(m, 4); 174 SETUP(m->st); 175 SETUP(m->fresh); 176 SETUP(m->tmp); 177 SETUP(m->empty); 178 CLEAR(m->empty); 179 180 /* this loop does only one repetition except for backrefs */ 181 for (;;) { 182 endp = fast(m, start, stop, gf, gl); 183 if (endp == NULL) { /* a miss */ 184 free(m->pmatch); 185 free(m->lastpos); 186 STATETEARDOWN(m); 187 return(REG_NOMATCH); 188 } 189 if (nmatch == 0 && !g->backrefs) 190 break; /* no further info needed */ 191 192 /* where? */ 193 assert(m->coldp != NULL); 194 for (;;) { 195 NOTE("finding start"); 196 endp = slow(m, m->coldp, stop, gf, gl); 197 if (endp != NULL) 198 break; 199 assert(m->coldp < m->endp); 200 m->coldp++; 201 } 202 if (nmatch == 1 && !g->backrefs) 203 break; /* no further info needed */ 204 205 /* oh my, he wants the subexpressions... */ 206 if (m->pmatch == NULL) 207 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 208 sizeof(regmatch_t)); 209 if (m->pmatch == NULL) { 210 STATETEARDOWN(m); 211 return(REG_ESPACE); 212 } 213 for (i = 1; i <= m->g->nsub; i++) 214 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 215 if (!g->backrefs && !(m->eflags®_BACKR)) { 216 NOTE("dissecting"); 217 dp = dissect(m, m->coldp, endp, gf, gl); 218 } else { 219 if (g->nplus > 0 && m->lastpos == NULL) 220 m->lastpos = (char **)malloc((g->nplus+1) * 221 sizeof(char *)); 222 if (g->nplus > 0 && m->lastpos == NULL) { 223 free(m->pmatch); 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 NOTE("backref dissect"); 228 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 229 } 230 if (dp != NULL) 231 break; 232 233 /* uh-oh... we couldn't find a subexpression-level match */ 234 assert(g->backrefs); /* must be back references doing it */ 235 assert(g->nplus == 0 || m->lastpos != NULL); 236 for (;;) { 237 if (dp != NULL || endp <= m->coldp) 238 break; /* defeat */ 239 NOTE("backoff"); 240 endp = slow(m, m->coldp, endp-1, gf, gl); 241 if (endp == NULL) 242 break; /* defeat */ 243 /* try it on a shorter possibility */ 244#ifndef NDEBUG 245 for (i = 1; i <= m->g->nsub; i++) { 246 assert(m->pmatch[i].rm_so == -1); 247 assert(m->pmatch[i].rm_eo == -1); 248 } 249#endif 250 NOTE("backoff dissect"); 251 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 252 } 253 assert(dp == NULL || dp == endp); 254 if (dp != NULL) /* found a shorter one */ 255 break; 256 257 /* despite initial appearances, there is no match here */ 258 NOTE("false alarm"); 259 if (m->coldp == stop) 260 break; 261 start = m->coldp + 1; /* recycle starting later */ 262 } 263 264 /* fill in the details if requested */ 265 if (nmatch > 0) { 266 pmatch[0].rm_so = m->coldp - m->offp; 267 pmatch[0].rm_eo = endp - m->offp; 268 } 269 if (nmatch > 1) { 270 assert(m->pmatch != NULL); 271 for (i = 1; i < nmatch; i++) 272 if (i <= m->g->nsub) 273 pmatch[i] = m->pmatch[i]; 274 else { 275 pmatch[i].rm_so = -1; 276 pmatch[i].rm_eo = -1; 277 } 278 } 279 280 if (m->pmatch != NULL) 281 free((char *)m->pmatch); 282 if (m->lastpos != NULL) 283 free((char *)m->lastpos); 284 STATETEARDOWN(m); 285 return(0); 286} 287 288/* 289 - dissect - figure out what matched what, no back references 290 */ 291static char * /* == stop (success) always */ 292dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 293{ 294 int i; 295 sopno ss; /* start sop of current subRE */ 296 sopno es; /* end sop of current subRE */ 297 char *sp; /* start of string matched by it */ 298 char *stp; /* string matched by it cannot pass here */ 299 char *rest; /* start of rest of string */ 300 char *tail; /* string unmatched by rest of RE */ 301 sopno ssub; /* start sop of subsubRE */ 302 sopno esub; /* end sop of subsubRE */ 303 char *ssp; /* start of string matched by subsubRE */ 304 char *sep; /* end of string matched by subsubRE */ 305 char *oldssp; /* previous ssp */ 306 char *dp; 307 308 AT("diss", start, stop, startst, stopst); 309 sp = start; 310 for (ss = startst; ss < stopst; ss = es) { 311 /* identify end of subRE */ 312 es = ss; 313 switch (OP(m->g->strip[es])) { 314 case OPLUS_: 315 case OQUEST_: 316 es += OPND(m->g->strip[es]); 317 break; 318 case OCH_: 319 while (OP(m->g->strip[es]) != O_CH) 320 es += OPND(m->g->strip[es]); 321 break; 322 } 323 es++; 324 325 /* figure out what it matched */ 326 switch (OP(m->g->strip[ss])) { 327 case OEND: 328 assert(nope); 329 break; 330 case OCHAR: 331 sp++; 332 break; 333 case OBOL: 334 case OEOL: 335 case OBOW: 336 case OEOW: 337 break; 338 case OANY: 339 case OANYOF: 340 sp++; 341 break; 342 case OBACK_: 343 case O_BACK: 344 assert(nope); 345 break; 346 /* cases where length of match is hard to find */ 347 case OQUEST_: 348 stp = stop; 349 for (;;) { 350 /* how long could this one be? */ 351 rest = slow(m, sp, stp, ss, es); 352 assert(rest != NULL); /* it did match */ 353 /* could the rest match the rest? */ 354 tail = slow(m, rest, stop, es, stopst); 355 if (tail == stop) 356 break; /* yes! */ 357 /* no -- try a shorter match for this one */ 358 stp = rest - 1; 359 assert(stp >= sp); /* it did work */ 360 } 361 ssub = ss + 1; 362 esub = es - 1; 363 /* did innards match? */ 364 if (slow(m, sp, rest, ssub, esub) != NULL) { 365 dp = dissect(m, sp, rest, ssub, esub); 366 assert(dp == rest); 367 } else /* no */ 368 assert(sp == rest); 369 sp = rest; 370 break; 371 case OPLUS_: 372 stp = stop; 373 for (;;) { 374 /* how long could this one be? */ 375 rest = slow(m, sp, stp, ss, es); 376 assert(rest != NULL); /* it did match */ 377 /* could the rest match the rest? */ 378 tail = slow(m, rest, stop, es, stopst); 379 if (tail == stop) 380 break; /* yes! */ 381 /* no -- try a shorter match for this one */ 382 stp = rest - 1; 383 assert(stp >= sp); /* it did work */ 384 } 385 ssub = ss + 1; 386 esub = es - 1; 387 ssp = sp; 388 oldssp = ssp; 389 for (;;) { /* find last match of innards */ 390 sep = slow(m, ssp, rest, ssub, esub); 391 if (sep == NULL || sep == ssp) 392 break; /* failed or matched null */ 393 oldssp = ssp; /* on to next try */ 394 ssp = sep; 395 } 396 if (sep == NULL) { 397 /* last successful match */ 398 sep = ssp; 399 ssp = oldssp; 400 } 401 assert(sep == rest); /* must exhaust substring */ 402 assert(slow(m, ssp, sep, ssub, esub) == rest); 403 dp = dissect(m, ssp, sep, ssub, esub); 404 assert(dp == sep); 405 sp = rest; 406 break; 407 case OCH_: 408 stp = stop; 409 for (;;) { 410 /* how long could this one be? */ 411 rest = slow(m, sp, stp, ss, es); 412 assert(rest != NULL); /* it did match */ 413 /* could the rest match the rest? */ 414 tail = slow(m, rest, stop, es, stopst); 415 if (tail == stop) 416 break; /* yes! */ 417 /* no -- try a shorter match for this one */ 418 stp = rest - 1; 419 assert(stp >= sp); /* it did work */ 420 } 421 ssub = ss + 1; 422 esub = ss + OPND(m->g->strip[ss]) - 1; 423 assert(OP(m->g->strip[esub]) == OOR1); 424 for (;;) { /* find first matching branch */ 425 if (slow(m, sp, rest, ssub, esub) == rest) 426 break; /* it matched all of it */ 427 /* that one missed, try next one */ 428 assert(OP(m->g->strip[esub]) == OOR1); 429 esub++; 430 assert(OP(m->g->strip[esub]) == OOR2); 431 ssub = esub + 1; 432 esub += OPND(m->g->strip[esub]); 433 if (OP(m->g->strip[esub]) == OOR2) 434 esub--; 435 else 436 assert(OP(m->g->strip[esub]) == O_CH); 437 } 438 dp = dissect(m, sp, rest, ssub, esub); 439 assert(dp == rest); 440 sp = rest; 441 break; 442 case O_PLUS: 443 case O_QUEST: 444 case OOR1: 445 case OOR2: 446 case O_CH: 447 assert(nope); 448 break; 449 case OLPAREN: 450 i = OPND(m->g->strip[ss]); 451 assert(0 < i && i <= m->g->nsub); 452 m->pmatch[i].rm_so = sp - m->offp; 453 break; 454 case ORPAREN: 455 i = OPND(m->g->strip[ss]); 456 assert(0 < i && i <= m->g->nsub); 457 m->pmatch[i].rm_eo = sp - m->offp; 458 break; 459 default: /* uh oh */ 460 assert(nope); 461 break; 462 } 463 } 464 465 assert(sp == stop); 466 return(sp); 467} 468 469/* 470 - backref - figure out what matched what, figuring in back references 471 */ 472static char * /* == stop (success) or NULL (failure) */ 473backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 474 sopno lev, int rec) /* PLUS nesting level */ 475{ 476 int i; 477 sopno ss; /* start sop of current subRE */ 478 char *sp; /* start of string matched by it */ 479 sopno ssub; /* start sop of subsubRE */ 480 sopno esub; /* end sop of subsubRE */ 481 char *ssp; /* start of string matched by subsubRE */ 482 char *dp; 483 size_t len; 484 int hard; 485 sop s; 486 regoff_t offsave; 487 cset *cs; 488 489 AT("back", start, stop, startst, stopst); 490 sp = start; 491 492 /* get as far as we can with easy stuff */ 493 hard = 0; 494 for (ss = startst; !hard && ss < stopst; ss++) 495 switch (OP(s = m->g->strip[ss])) { 496 case OCHAR: 497 if (sp == stop || *sp++ != (char)OPND(s)) 498 return(NULL); 499 break; 500 case OANY: 501 if (sp == stop) 502 return(NULL); 503 sp++; 504 break; 505 case OANYOF: 506 cs = &m->g->sets[OPND(s)]; 507 if (sp == stop || !CHIN(cs, *sp++)) 508 return(NULL); 509 break; 510 case OBOL: 511 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 512 (sp < m->endp && *(sp-1) == '\n' && 513 (m->g->cflags®_NEWLINE)) ) 514 { /* yes */ } 515 else 516 return(NULL); 517 break; 518 case OEOL: 519 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 520 (sp < m->endp && *sp == '\n' && 521 (m->g->cflags®_NEWLINE)) ) 522 { /* yes */ } 523 else 524 return(NULL); 525 break; 526 case OBOW: 527 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 528 (sp < m->endp && *(sp-1) == '\n' && 529 (m->g->cflags®_NEWLINE)) || 530 (sp > m->beginp && 531 !ISWORD(*(sp-1))) ) && 532 (sp < m->endp && ISWORD(*sp)) ) 533 { /* yes */ } 534 else 535 return(NULL); 536 break; 537 case OEOW: 538 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 539 (sp < m->endp && *sp == '\n' && 540 (m->g->cflags®_NEWLINE)) || 541 (sp < m->endp && !ISWORD(*sp)) ) && 542 (sp > m->beginp && ISWORD(*(sp-1))) ) 543 { /* yes */ } 544 else 545 return(NULL); 546 break; 547 case O_QUEST: 548 break; 549 case OOR1: /* matches null but needs to skip */ 550 ss++; 551 s = m->g->strip[ss]; 552 do { 553 assert(OP(s) == OOR2); 554 ss += OPND(s); 555 } while (OP(s = m->g->strip[ss]) != O_CH); 556 /* note that the ss++ gets us past the O_CH */ 557 break; 558 default: /* have to make a choice */ 559 hard = 1; 560 break; 561 } 562 if (!hard) { /* that was it! */ 563 if (sp != stop) 564 return(NULL); 565 return(sp); 566 } 567 ss--; /* adjust for the for's final increment */ 568 569 /* the hard stuff */ 570 AT("hard", sp, stop, ss, stopst); 571 s = m->g->strip[ss]; 572 switch (OP(s)) { 573 case OBACK_: /* the vilest depths */ 574 i = OPND(s); 575 assert(0 < i && i <= m->g->nsub); 576 if (m->pmatch[i].rm_eo == -1) 577 return(NULL); 578 assert(m->pmatch[i].rm_so != -1); 579 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 580 if (len == 0 && rec++ > MAX_RECURSION) 581 return(NULL); 582 assert(stop - m->beginp >= len); 583 if (sp > stop - len) 584 return(NULL); /* not enough left to match */ 585 ssp = m->offp + m->pmatch[i].rm_so; 586 if (memcmp(sp, ssp, len) != 0) 587 return(NULL); 588 while (m->g->strip[ss] != SOP(O_BACK, i)) 589 ss++; 590 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 591 break; 592 case OQUEST_: /* to null or not */ 593 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 594 if (dp != NULL) 595 return(dp); /* not */ 596 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 597 break; 598 case OPLUS_: 599 assert(m->lastpos != NULL); 600 assert(lev+1 <= m->g->nplus); 601 m->lastpos[lev+1] = sp; 602 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 603 break; 604 case O_PLUS: 605 if (sp == m->lastpos[lev]) /* last pass matched null */ 606 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 607 /* try another pass */ 608 m->lastpos[lev] = sp; 609 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 610 if (dp == NULL) 611 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 612 else 613 return(dp); 614 break; 615 case OCH_: /* find the right one, if any */ 616 ssub = ss + 1; 617 esub = ss + OPND(s) - 1; 618 assert(OP(m->g->strip[esub]) == OOR1); 619 for (;;) { /* find first matching branch */ 620 dp = backref(m, sp, stop, ssub, esub, lev, rec); 621 if (dp != NULL) 622 return(dp); 623 /* that one missed, try next one */ 624 if (OP(m->g->strip[esub]) == O_CH) 625 return(NULL); /* there is none */ 626 esub++; 627 assert(OP(m->g->strip[esub]) == OOR2); 628 ssub = esub + 1; 629 esub += OPND(m->g->strip[esub]); 630 if (OP(m->g->strip[esub]) == OOR2) 631 esub--; 632 else 633 assert(OP(m->g->strip[esub]) == O_CH); 634 } 635 break; 636 case OLPAREN: /* must undo assignment if rest fails */ 637 i = OPND(s); 638 assert(0 < i && i <= m->g->nsub); 639 offsave = m->pmatch[i].rm_so; 640 m->pmatch[i].rm_so = sp - m->offp; 641 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 642 if (dp != NULL) 643 return(dp); 644 m->pmatch[i].rm_so = offsave; 645 return(NULL); 646 break; 647 case ORPAREN: /* must undo assignment if rest fails */ 648 i = OPND(s); 649 assert(0 < i && i <= m->g->nsub); 650 offsave = m->pmatch[i].rm_eo; 651 m->pmatch[i].rm_eo = sp - m->offp; 652 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 653 if (dp != NULL) 654 return(dp); 655 m->pmatch[i].rm_eo = offsave; 656 return(NULL); 657 break; 658 default: /* uh oh */ 659 assert(nope); 660 break; 661 } 662 663 /* "can't happen" */ 664 assert(nope); 665 /* NOTREACHED */ 666 return NULL; 667} 668 669/* 670 - fast - step through the string at top speed 671 */ 672static char * /* where tentative match ended, or NULL */ 673fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 674{ 675 states st = m->st; 676 states fresh = m->fresh; 677 states tmp = m->tmp; 678 char *p = start; 679 int c = (start == m->beginp) ? OUT : *(start-1); 680 int lastc; /* previous c */ 681 int flagch; 682 int i; 683 char *coldp; /* last p after which no match was underway */ 684 685 CLEAR(st); 686 SET1(st, startst); 687 st = step(m->g, startst, stopst, st, NOTHING, st); 688 ASSIGN(fresh, st); 689 SP("start", st, *p); 690 coldp = NULL; 691 for (;;) { 692 /* next character */ 693 lastc = c; 694 c = (p == m->endp) ? OUT : *p; 695 if (EQ(st, fresh)) 696 coldp = p; 697 698 /* is there an EOL and/or BOL between lastc and c? */ 699 flagch = '\0'; 700 i = 0; 701 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 702 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 703 flagch = BOL; 704 i = m->g->nbol; 705 } 706 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 707 (c == OUT && !(m->eflags®_NOTEOL)) ) { 708 flagch = (flagch == BOL) ? BOLEOL : EOL; 709 i += m->g->neol; 710 } 711 if (i != 0) { 712 for (; i > 0; i--) 713 st = step(m->g, startst, stopst, st, flagch, st); 714 SP("boleol", st, c); 715 } 716 717 /* how about a word boundary? */ 718 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 719 (c != OUT && ISWORD(c)) ) { 720 flagch = BOW; 721 } 722 if ( (lastc != OUT && ISWORD(lastc)) && 723 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 724 flagch = EOW; 725 } 726 if (flagch == BOW || flagch == EOW) { 727 st = step(m->g, startst, stopst, st, flagch, st); 728 SP("boweow", st, c); 729 } 730 731 /* are we done? */ 732 if (ISSET(st, stopst) || p == stop) 733 break; /* NOTE BREAK OUT */ 734 735 /* no, we must deal with this character */ 736 ASSIGN(tmp, st); 737 ASSIGN(st, fresh); 738 assert(c != OUT); 739 st = step(m->g, startst, stopst, tmp, c, st); 740 SP("aft", st, c); 741 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 742 p++; 743 } 744 745 assert(coldp != NULL); 746 m->coldp = coldp; 747 if (ISSET(st, stopst)) 748 return(p+1); 749 else 750 return(NULL); 751} 752 753/* 754 - slow - step through the string more deliberately 755 */ 756static char * /* where it ended */ 757slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 758{ 759 states st = m->st; 760 states empty = m->empty; 761 states tmp = m->tmp; 762 char *p = start; 763 int c = (start == m->beginp) ? OUT : *(start-1); 764 int lastc; /* previous c */ 765 int flagch; 766 int i; 767 char *matchp; /* last p at which a match ended */ 768 769 AT("slow", start, stop, startst, stopst); 770 CLEAR(st); 771 SET1(st, startst); 772 SP("sstart", st, *p); 773 st = step(m->g, startst, stopst, st, NOTHING, st); 774 matchp = NULL; 775 for (;;) { 776 /* next character */ 777 lastc = c; 778 c = (p == m->endp) ? OUT : *p; 779 780 /* is there an EOL and/or BOL between lastc and c? */ 781 flagch = '\0'; 782 i = 0; 783 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 784 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 785 flagch = BOL; 786 i = m->g->nbol; 787 } 788 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 789 (c == OUT && !(m->eflags®_NOTEOL)) ) { 790 flagch = (flagch == BOL) ? BOLEOL : EOL; 791 i += m->g->neol; 792 } 793 if (i != 0) { 794 for (; i > 0; i--) 795 st = step(m->g, startst, stopst, st, flagch, st); 796 SP("sboleol", st, c); 797 } 798 799 /* how about a word boundary? */ 800 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 801 (c != OUT && ISWORD(c)) ) { 802 flagch = BOW; 803 } 804 if ( (lastc != OUT && ISWORD(lastc)) && 805 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 806 flagch = EOW; 807 } 808 if (flagch == BOW || flagch == EOW) { 809 st = step(m->g, startst, stopst, st, flagch, st); 810 SP("sboweow", st, c); 811 } 812 813 /* are we done? */ 814 if (ISSET(st, stopst)) 815 matchp = p; 816 if (EQ(st, empty) || p == stop) 817 break; /* NOTE BREAK OUT */ 818 819 /* no, we must deal with this character */ 820 ASSIGN(tmp, st); 821 ASSIGN(st, empty); 822 assert(c != OUT); 823 st = step(m->g, startst, stopst, tmp, c, st); 824 SP("saft", st, c); 825 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 826 p++; 827 } 828 829 return(matchp); 830} 831 832 833/* 834 - step - map set of states reachable before char to set reachable after 835 */ 836static states 837step(struct re_guts *g, 838 sopno start, /* start state within strip */ 839 sopno stop, /* state after stop state within strip */ 840 states bef, /* states reachable before */ 841 int ch, /* character or NONCHAR code */ 842 states aft) /* states already known reachable after */ 843{ 844 cset *cs; 845 sop s; 846 sopno pc; 847 onestate here; /* note, macros know this name */ 848 sopno look; 849 int i; 850 851 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 852 s = g->strip[pc]; 853 switch (OP(s)) { 854 case OEND: 855 assert(pc == stop-1); 856 break; 857 case OCHAR: 858 /* only characters can match */ 859 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 860 if (ch == (char)OPND(s)) 861 FWD(aft, bef, 1); 862 break; 863 case OBOL: 864 if (ch == BOL || ch == BOLEOL) 865 FWD(aft, bef, 1); 866 break; 867 case OEOL: 868 if (ch == EOL || ch == BOLEOL) 869 FWD(aft, bef, 1); 870 break; 871 case OBOW: 872 if (ch == BOW) 873 FWD(aft, bef, 1); 874 break; 875 case OEOW: 876 if (ch == EOW) 877 FWD(aft, bef, 1); 878 break; 879 case OANY: 880 if (!NONCHAR(ch)) 881 FWD(aft, bef, 1); 882 break; 883 case OANYOF: 884 cs = &g->sets[OPND(s)]; 885 if (!NONCHAR(ch) && CHIN(cs, ch)) 886 FWD(aft, bef, 1); 887 break; 888 case OBACK_: /* ignored here */ 889 case O_BACK: 890 FWD(aft, aft, 1); 891 break; 892 case OPLUS_: /* forward, this is just an empty */ 893 FWD(aft, aft, 1); 894 break; 895 case O_PLUS: /* both forward and back */ 896 FWD(aft, aft, 1); 897 i = ISSETBACK(aft, OPND(s)); 898 BACK(aft, aft, OPND(s)); 899 if (!i && ISSETBACK(aft, OPND(s))) { 900 /* oho, must reconsider loop body */ 901 pc -= OPND(s) + 1; 902 INIT(here, pc); 903 } 904 break; 905 case OQUEST_: /* two branches, both forward */ 906 FWD(aft, aft, 1); 907 FWD(aft, aft, OPND(s)); 908 break; 909 case O_QUEST: /* just an empty */ 910 FWD(aft, aft, 1); 911 break; 912 case OLPAREN: /* not significant here */ 913 case ORPAREN: 914 FWD(aft, aft, 1); 915 break; 916 case OCH_: /* mark the first two branches */ 917 FWD(aft, aft, 1); 918 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 919 FWD(aft, aft, OPND(s)); 920 break; 921 case OOR1: /* done a branch, find the O_CH */ 922 if (ISSTATEIN(aft, here)) { 923 for (look = 1; 924 OP(s = g->strip[pc+look]) != O_CH; 925 look += OPND(s)) 926 assert(OP(s) == OOR2); 927 FWD(aft, aft, look); 928 } 929 break; 930 case OOR2: /* propagate OCH_'s marking */ 931 FWD(aft, aft, 1); 932 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 933 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 934 FWD(aft, aft, OPND(s)); 935 } 936 break; 937 case O_CH: /* just empty */ 938 FWD(aft, aft, 1); 939 break; 940 default: /* ooooops... */ 941 assert(nope); 942 break; 943 } 944 } 945 946 return(aft); 947} 948 949#ifdef REDEBUG 950/* 951 - print - print a set of states 952 */ 953static void 954print(struct match *m, char *caption, states st, int ch, FILE *d) 955{ 956 struct re_guts *g = m->g; 957 int i; 958 int first = 1; 959 960 if (!(m->eflags®_TRACE)) 961 return; 962 963 (void)fprintf(d, "%s", caption); 964 (void)fprintf(d, " %s", pchar(ch)); 965 for (i = 0; i < g->nstates; i++) 966 if (ISSET(st, i)) { 967 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 968 first = 0; 969 } 970 (void)fprintf(d, "\n"); 971} 972 973/* 974 - at - print current situation 975 */ 976static void 977at(struct match *m, char *title, char *start, char *stop, sopno startst, 978 sopno stopst) 979{ 980 if (!(m->eflags®_TRACE)) 981 return; 982 983 (void)printf("%s %s-", title, pchar(*start)); 984 (void)printf("%s ", pchar(*stop)); 985 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 986} 987 988#ifndef PCHARDONE 989#define PCHARDONE /* never again */ 990static const char *nonchars[] = 991 { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; 992#define PNONCHAR(c) \ 993 ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ 994 ? nonchars[(c) - OUT] : "invalid") 995 996/* 997 - pchar - make a character printable 998 * 999 * Is this identical to regchar() over in debug.c? Well, yes. But a 1000 * duplicate here avoids having a debugging-capable regexec.o tied to 1001 * a matching debug.o, and this is convenient. It all disappears in 1002 * the non-debug compilation anyway, so it doesn't matter much. 1003 */ 1004static const char * /* -> representation */ 1005pchar(int ch) 1006{ 1007 static char pbuf[10]; 1008 1009 if (NONCHAR(ch)) { 1010 if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) 1011 return nonchars[ch - OUT]; 1012 return "invalid"; 1013 } 1014 if (isprint((unsigned char)ch) || ch == ' ') 1015 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1016 else 1017 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1018 return(pbuf); 1019} 1020#endif 1021#endif 1022 1023#undef matcher 1024#undef fast 1025#undef slow 1026#undef dissect 1027#undef backref 1028#undef step 1029#undef print 1030#undef at 1031#undef match 1032#undef nope 1033