engine.c revision 1.15
1/* $OpenBSD: engine.c,v 1.15 2005/08/05 13:03:00 espie Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#endif 57#ifdef LNAMES 58#define matcher lmatcher 59#define fast lfast 60#define slow lslow 61#define dissect ldissect 62#define backref lbackref 63#define step lstep 64#define print lprint 65#define at lat 66#define match lmat 67#define nope lnope 68#endif 69 70/* another structure passed up and down to avoid zillions of parameters */ 71struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85}; 86 87static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88static char *dissect(struct match *, char *, char *, sopno, sopno); 89static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90static char *fast(struct match *, char *, char *, sopno, sopno); 91static char *slow(struct match *, char *, char *, sopno, sopno); 92static states step(struct re_guts *, sopno, sopno, states, int, states); 93#define MAX_RECURSION 100 94#define BOL (OUT+1) 95#define EOL (BOL+1) 96#define BOLEOL (BOL+2) 97#define NOTHING (BOL+3) 98#define BOW (BOL+4) 99#define EOW (BOL+5) 100#define CODEMAX (BOL+5) /* highest code used */ 101#define NONCHAR(c) ((c) > CHAR_MAX) 102#define NNONCHAR (CODEMAX-CHAR_MAX) 103#ifdef REDEBUG 104static void print(struct match *, char *, states, int, FILE *); 105#endif 106#ifdef REDEBUG 107static void at(struct match *, char *, char *, char *, sopno, sopno); 108#endif 109#ifdef REDEBUG 110static char *pchar(int); 111#endif 112 113#ifdef REDEBUG 114#define SP(t, s, c) print(m, t, s, c, stdout) 115#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 116#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 117static int nope = 0; 118#else 119#define SP(t, s, c) /* nothing */ 120#define AT(t, p1, p2, s1, s2) /* nothing */ 121#define NOTE(s) /* nothing */ 122#endif 123 124/* 125 - matcher - the actual matching engine 126 */ 127static int /* 0 success, REG_NOMATCH failure */ 128matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 129 int eflags) 130{ 131 char *endp; 132 int i; 133 struct match mv; 134 struct match *m = &mv; 135 char *dp; 136 const sopno gf = g->firststate+1; /* +1 for OEND */ 137 const sopno gl = g->laststate; 138 char *start; 139 char *stop; 140 141 /* simplify the situation where possible */ 142 if (g->cflags®_NOSUB) 143 nmatch = 0; 144 if (eflags®_STARTEND) { 145 start = string + pmatch[0].rm_so; 146 stop = string + pmatch[0].rm_eo; 147 } else { 148 start = string; 149 stop = start + strlen(start); 150 } 151 if (stop < start) 152 return(REG_INVARG); 153 154 /* prescreening; this does wonders for this rather slow code */ 155 if (g->must != NULL) { 156 for (dp = start; dp < stop; dp++) 157 if (*dp == g->must[0] && stop - dp >= g->mlen && 158 memcmp(dp, g->must, (size_t)g->mlen) == 0) 159 break; 160 if (dp == stop) /* we didn't find g->must */ 161 return(REG_NOMATCH); 162 } 163 164 /* match struct setup */ 165 m->g = g; 166 m->eflags = eflags; 167 m->pmatch = NULL; 168 m->lastpos = NULL; 169 m->offp = string; 170 m->beginp = start; 171 m->endp = stop; 172 STATESETUP(m, 4); 173 SETUP(m->st); 174 SETUP(m->fresh); 175 SETUP(m->tmp); 176 SETUP(m->empty); 177 CLEAR(m->empty); 178 179 /* this loop does only one repetition except for backrefs */ 180 for (;;) { 181 endp = fast(m, start, stop, gf, gl); 182 if (endp == NULL) { /* a miss */ 183 free(m->pmatch); 184 free(m->lastpos); 185 STATETEARDOWN(m); 186 return(REG_NOMATCH); 187 } 188 if (nmatch == 0 && !g->backrefs) 189 break; /* no further info needed */ 190 191 /* where? */ 192 assert(m->coldp != NULL); 193 for (;;) { 194 NOTE("finding start"); 195 endp = slow(m, m->coldp, stop, gf, gl); 196 if (endp != NULL) 197 break; 198 assert(m->coldp < m->endp); 199 m->coldp++; 200 } 201 if (nmatch == 1 && !g->backrefs) 202 break; /* no further info needed */ 203 204 /* oh my, he wants the subexpressions... */ 205 if (m->pmatch == NULL) 206 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 207 sizeof(regmatch_t)); 208 if (m->pmatch == NULL) { 209 STATETEARDOWN(m); 210 return(REG_ESPACE); 211 } 212 for (i = 1; i <= m->g->nsub; i++) 213 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 214 if (!g->backrefs && !(m->eflags®_BACKR)) { 215 NOTE("dissecting"); 216 dp = dissect(m, m->coldp, endp, gf, gl); 217 } else { 218 if (g->nplus > 0 && m->lastpos == NULL) 219 m->lastpos = (char **)malloc((g->nplus+1) * 220 sizeof(char *)); 221 if (g->nplus > 0 && m->lastpos == NULL) { 222 free(m->pmatch); 223 STATETEARDOWN(m); 224 return(REG_ESPACE); 225 } 226 NOTE("backref dissect"); 227 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 228 } 229 if (dp != NULL) 230 break; 231 232 /* uh-oh... we couldn't find a subexpression-level match */ 233 assert(g->backrefs); /* must be back references doing it */ 234 assert(g->nplus == 0 || m->lastpos != NULL); 235 for (;;) { 236 if (dp != NULL || endp <= m->coldp) 237 break; /* defeat */ 238 NOTE("backoff"); 239 endp = slow(m, m->coldp, endp-1, gf, gl); 240 if (endp == NULL) 241 break; /* defeat */ 242 /* try it on a shorter possibility */ 243#ifndef NDEBUG 244 for (i = 1; i <= m->g->nsub; i++) { 245 assert(m->pmatch[i].rm_so == -1); 246 assert(m->pmatch[i].rm_eo == -1); 247 } 248#endif 249 NOTE("backoff dissect"); 250 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 251 } 252 assert(dp == NULL || dp == endp); 253 if (dp != NULL) /* found a shorter one */ 254 break; 255 256 /* despite initial appearances, there is no match here */ 257 NOTE("false alarm"); 258 if (m->coldp == stop) 259 break; 260 start = m->coldp + 1; /* recycle starting later */ 261 } 262 263 /* fill in the details if requested */ 264 if (nmatch > 0) { 265 pmatch[0].rm_so = m->coldp - m->offp; 266 pmatch[0].rm_eo = endp - m->offp; 267 } 268 if (nmatch > 1) { 269 assert(m->pmatch != NULL); 270 for (i = 1; i < nmatch; i++) 271 if (i <= m->g->nsub) 272 pmatch[i] = m->pmatch[i]; 273 else { 274 pmatch[i].rm_so = -1; 275 pmatch[i].rm_eo = -1; 276 } 277 } 278 279 if (m->pmatch != NULL) 280 free((char *)m->pmatch); 281 if (m->lastpos != NULL) 282 free((char *)m->lastpos); 283 STATETEARDOWN(m); 284 return(0); 285} 286 287/* 288 - dissect - figure out what matched what, no back references 289 */ 290static char * /* == stop (success) always */ 291dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 292{ 293 int i; 294 sopno ss; /* start sop of current subRE */ 295 sopno es; /* end sop of current subRE */ 296 char *sp; /* start of string matched by it */ 297 char *stp; /* string matched by it cannot pass here */ 298 char *rest; /* start of rest of string */ 299 char *tail; /* string unmatched by rest of RE */ 300 sopno ssub; /* start sop of subsubRE */ 301 sopno esub; /* end sop of subsubRE */ 302 char *ssp; /* start of string matched by subsubRE */ 303 char *sep; /* end of string matched by subsubRE */ 304 char *oldssp; /* previous ssp */ 305 char *dp; 306 307 AT("diss", start, stop, startst, stopst); 308 sp = start; 309 for (ss = startst; ss < stopst; ss = es) { 310 /* identify end of subRE */ 311 es = ss; 312 switch (OP(m->g->strip[es])) { 313 case OPLUS_: 314 case OQUEST_: 315 es += OPND(m->g->strip[es]); 316 break; 317 case OCH_: 318 while (OP(m->g->strip[es]) != O_CH) 319 es += OPND(m->g->strip[es]); 320 break; 321 } 322 es++; 323 324 /* figure out what it matched */ 325 switch (OP(m->g->strip[ss])) { 326 case OEND: 327 assert(nope); 328 break; 329 case OCHAR: 330 sp++; 331 break; 332 case OBOL: 333 case OEOL: 334 case OBOW: 335 case OEOW: 336 break; 337 case OANY: 338 case OANYOF: 339 sp++; 340 break; 341 case OBACK_: 342 case O_BACK: 343 assert(nope); 344 break; 345 /* cases where length of match is hard to find */ 346 case OQUEST_: 347 stp = stop; 348 for (;;) { 349 /* how long could this one be? */ 350 rest = slow(m, sp, stp, ss, es); 351 assert(rest != NULL); /* it did match */ 352 /* could the rest match the rest? */ 353 tail = slow(m, rest, stop, es, stopst); 354 if (tail == stop) 355 break; /* yes! */ 356 /* no -- try a shorter match for this one */ 357 stp = rest - 1; 358 assert(stp >= sp); /* it did work */ 359 } 360 ssub = ss + 1; 361 esub = es - 1; 362 /* did innards match? */ 363 if (slow(m, sp, rest, ssub, esub) != NULL) { 364 dp = dissect(m, sp, rest, ssub, esub); 365 assert(dp == rest); 366 } else /* no */ 367 assert(sp == rest); 368 sp = rest; 369 break; 370 case OPLUS_: 371 stp = stop; 372 for (;;) { 373 /* how long could this one be? */ 374 rest = slow(m, sp, stp, ss, es); 375 assert(rest != NULL); /* it did match */ 376 /* could the rest match the rest? */ 377 tail = slow(m, rest, stop, es, stopst); 378 if (tail == stop) 379 break; /* yes! */ 380 /* no -- try a shorter match for this one */ 381 stp = rest - 1; 382 assert(stp >= sp); /* it did work */ 383 } 384 ssub = ss + 1; 385 esub = es - 1; 386 ssp = sp; 387 oldssp = ssp; 388 for (;;) { /* find last match of innards */ 389 sep = slow(m, ssp, rest, ssub, esub); 390 if (sep == NULL || sep == ssp) 391 break; /* failed or matched null */ 392 oldssp = ssp; /* on to next try */ 393 ssp = sep; 394 } 395 if (sep == NULL) { 396 /* last successful match */ 397 sep = ssp; 398 ssp = oldssp; 399 } 400 assert(sep == rest); /* must exhaust substring */ 401 assert(slow(m, ssp, sep, ssub, esub) == rest); 402 dp = dissect(m, ssp, sep, ssub, esub); 403 assert(dp == sep); 404 sp = rest; 405 break; 406 case OCH_: 407 stp = stop; 408 for (;;) { 409 /* how long could this one be? */ 410 rest = slow(m, sp, stp, ss, es); 411 assert(rest != NULL); /* it did match */ 412 /* could the rest match the rest? */ 413 tail = slow(m, rest, stop, es, stopst); 414 if (tail == stop) 415 break; /* yes! */ 416 /* no -- try a shorter match for this one */ 417 stp = rest - 1; 418 assert(stp >= sp); /* it did work */ 419 } 420 ssub = ss + 1; 421 esub = ss + OPND(m->g->strip[ss]) - 1; 422 assert(OP(m->g->strip[esub]) == OOR1); 423 for (;;) { /* find first matching branch */ 424 if (slow(m, sp, rest, ssub, esub) == rest) 425 break; /* it matched all of it */ 426 /* that one missed, try next one */ 427 assert(OP(m->g->strip[esub]) == OOR1); 428 esub++; 429 assert(OP(m->g->strip[esub]) == OOR2); 430 ssub = esub + 1; 431 esub += OPND(m->g->strip[esub]); 432 if (OP(m->g->strip[esub]) == OOR2) 433 esub--; 434 else 435 assert(OP(m->g->strip[esub]) == O_CH); 436 } 437 dp = dissect(m, sp, rest, ssub, esub); 438 assert(dp == rest); 439 sp = rest; 440 break; 441 case O_PLUS: 442 case O_QUEST: 443 case OOR1: 444 case OOR2: 445 case O_CH: 446 assert(nope); 447 break; 448 case OLPAREN: 449 i = OPND(m->g->strip[ss]); 450 assert(0 < i && i <= m->g->nsub); 451 m->pmatch[i].rm_so = sp - m->offp; 452 break; 453 case ORPAREN: 454 i = OPND(m->g->strip[ss]); 455 assert(0 < i && i <= m->g->nsub); 456 m->pmatch[i].rm_eo = sp - m->offp; 457 break; 458 default: /* uh oh */ 459 assert(nope); 460 break; 461 } 462 } 463 464 assert(sp == stop); 465 return(sp); 466} 467 468/* 469 - backref - figure out what matched what, figuring in back references 470 */ 471static char * /* == stop (success) or NULL (failure) */ 472backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 473 sopno lev, int rec) /* PLUS nesting level */ 474{ 475 int i; 476 sopno ss; /* start sop of current subRE */ 477 char *sp; /* start of string matched by it */ 478 sopno ssub; /* start sop of subsubRE */ 479 sopno esub; /* end sop of subsubRE */ 480 char *ssp; /* start of string matched by subsubRE */ 481 char *dp; 482 size_t len; 483 int hard; 484 sop s; 485 regoff_t offsave; 486 cset *cs; 487 488 AT("back", start, stop, startst, stopst); 489 sp = start; 490 491 /* get as far as we can with easy stuff */ 492 hard = 0; 493 for (ss = startst; !hard && ss < stopst; ss++) 494 switch (OP(s = m->g->strip[ss])) { 495 case OCHAR: 496 if (sp == stop || *sp++ != (char)OPND(s)) 497 return(NULL); 498 break; 499 case OANY: 500 if (sp == stop) 501 return(NULL); 502 sp++; 503 break; 504 case OANYOF: 505 cs = &m->g->sets[OPND(s)]; 506 if (sp == stop || !CHIN(cs, *sp++)) 507 return(NULL); 508 break; 509 case OBOL: 510 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 511 (sp < m->endp && *(sp-1) == '\n' && 512 (m->g->cflags®_NEWLINE)) ) 513 { /* yes */ } 514 else 515 return(NULL); 516 break; 517 case OEOL: 518 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 519 (sp < m->endp && *sp == '\n' && 520 (m->g->cflags®_NEWLINE)) ) 521 { /* yes */ } 522 else 523 return(NULL); 524 break; 525 case OBOW: 526 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 527 (sp < m->endp && *(sp-1) == '\n' && 528 (m->g->cflags®_NEWLINE)) || 529 (sp > m->beginp && 530 !ISWORD(*(sp-1))) ) && 531 (sp < m->endp && ISWORD(*sp)) ) 532 { /* yes */ } 533 else 534 return(NULL); 535 break; 536 case OEOW: 537 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 538 (sp < m->endp && *sp == '\n' && 539 (m->g->cflags®_NEWLINE)) || 540 (sp < m->endp && !ISWORD(*sp)) ) && 541 (sp > m->beginp && ISWORD(*(sp-1))) ) 542 { /* yes */ } 543 else 544 return(NULL); 545 break; 546 case O_QUEST: 547 break; 548 case OOR1: /* matches null but needs to skip */ 549 ss++; 550 s = m->g->strip[ss]; 551 do { 552 assert(OP(s) == OOR2); 553 ss += OPND(s); 554 } while (OP(s = m->g->strip[ss]) != O_CH); 555 /* note that the ss++ gets us past the O_CH */ 556 break; 557 default: /* have to make a choice */ 558 hard = 1; 559 break; 560 } 561 if (!hard) { /* that was it! */ 562 if (sp != stop) 563 return(NULL); 564 return(sp); 565 } 566 ss--; /* adjust for the for's final increment */ 567 568 /* the hard stuff */ 569 AT("hard", sp, stop, ss, stopst); 570 s = m->g->strip[ss]; 571 switch (OP(s)) { 572 case OBACK_: /* the vilest depths */ 573 i = OPND(s); 574 assert(0 < i && i <= m->g->nsub); 575 if (m->pmatch[i].rm_eo == -1) 576 return(NULL); 577 assert(m->pmatch[i].rm_so != -1); 578 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 579 if (len == 0 && rec++ > MAX_RECURSION) 580 return(NULL); 581 assert(stop - m->beginp >= len); 582 if (sp > stop - len) 583 return(NULL); /* not enough left to match */ 584 ssp = m->offp + m->pmatch[i].rm_so; 585 if (memcmp(sp, ssp, len) != 0) 586 return(NULL); 587 while (m->g->strip[ss] != SOP(O_BACK, i)) 588 ss++; 589 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 590 break; 591 case OQUEST_: /* to null or not */ 592 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 593 if (dp != NULL) 594 return(dp); /* not */ 595 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 596 break; 597 case OPLUS_: 598 assert(m->lastpos != NULL); 599 assert(lev+1 <= m->g->nplus); 600 m->lastpos[lev+1] = sp; 601 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 602 break; 603 case O_PLUS: 604 if (sp == m->lastpos[lev]) /* last pass matched null */ 605 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 606 /* try another pass */ 607 m->lastpos[lev] = sp; 608 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 609 if (dp == NULL) 610 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 611 else 612 return(dp); 613 break; 614 case OCH_: /* find the right one, if any */ 615 ssub = ss + 1; 616 esub = ss + OPND(s) - 1; 617 assert(OP(m->g->strip[esub]) == OOR1); 618 for (;;) { /* find first matching branch */ 619 dp = backref(m, sp, stop, ssub, esub, lev, rec); 620 if (dp != NULL) 621 return(dp); 622 /* that one missed, try next one */ 623 if (OP(m->g->strip[esub]) == O_CH) 624 return(NULL); /* there is none */ 625 esub++; 626 assert(OP(m->g->strip[esub]) == OOR2); 627 ssub = esub + 1; 628 esub += OPND(m->g->strip[esub]); 629 if (OP(m->g->strip[esub]) == OOR2) 630 esub--; 631 else 632 assert(OP(m->g->strip[esub]) == O_CH); 633 } 634 break; 635 case OLPAREN: /* must undo assignment if rest fails */ 636 i = OPND(s); 637 assert(0 < i && i <= m->g->nsub); 638 offsave = m->pmatch[i].rm_so; 639 m->pmatch[i].rm_so = sp - m->offp; 640 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 641 if (dp != NULL) 642 return(dp); 643 m->pmatch[i].rm_so = offsave; 644 return(NULL); 645 break; 646 case ORPAREN: /* must undo assignment if rest fails */ 647 i = OPND(s); 648 assert(0 < i && i <= m->g->nsub); 649 offsave = m->pmatch[i].rm_eo; 650 m->pmatch[i].rm_eo = sp - m->offp; 651 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 652 if (dp != NULL) 653 return(dp); 654 m->pmatch[i].rm_eo = offsave; 655 return(NULL); 656 break; 657 default: /* uh oh */ 658 assert(nope); 659 break; 660 } 661 662 /* "can't happen" */ 663 assert(nope); 664 /* NOTREACHED */ 665} 666 667/* 668 - fast - step through the string at top speed 669 */ 670static char * /* where tentative match ended, or NULL */ 671fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 672{ 673 states st = m->st; 674 states fresh = m->fresh; 675 states tmp = m->tmp; 676 char *p = start; 677 int c = (start == m->beginp) ? OUT : *(start-1); 678 int lastc; /* previous c */ 679 int flagch; 680 int i; 681 char *coldp; /* last p after which no match was underway */ 682 683 CLEAR(st); 684 SET1(st, startst); 685 st = step(m->g, startst, stopst, st, NOTHING, st); 686 ASSIGN(fresh, st); 687 SP("start", st, *p); 688 coldp = NULL; 689 for (;;) { 690 /* next character */ 691 lastc = c; 692 c = (p == m->endp) ? OUT : *p; 693 if (EQ(st, fresh)) 694 coldp = p; 695 696 /* is there an EOL and/or BOL between lastc and c? */ 697 flagch = '\0'; 698 i = 0; 699 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 700 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 701 flagch = BOL; 702 i = m->g->nbol; 703 } 704 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 705 (c == OUT && !(m->eflags®_NOTEOL)) ) { 706 flagch = (flagch == BOL) ? BOLEOL : EOL; 707 i += m->g->neol; 708 } 709 if (i != 0) { 710 for (; i > 0; i--) 711 st = step(m->g, startst, stopst, st, flagch, st); 712 SP("boleol", st, c); 713 } 714 715 /* how about a word boundary? */ 716 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 717 (c != OUT && ISWORD(c)) ) { 718 flagch = BOW; 719 } 720 if ( (lastc != OUT && ISWORD(lastc)) && 721 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 722 flagch = EOW; 723 } 724 if (flagch == BOW || flagch == EOW) { 725 st = step(m->g, startst, stopst, st, flagch, st); 726 SP("boweow", st, c); 727 } 728 729 /* are we done? */ 730 if (ISSET(st, stopst) || p == stop) 731 break; /* NOTE BREAK OUT */ 732 733 /* no, we must deal with this character */ 734 ASSIGN(tmp, st); 735 ASSIGN(st, fresh); 736 assert(c != OUT); 737 st = step(m->g, startst, stopst, tmp, c, st); 738 SP("aft", st, c); 739 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 740 p++; 741 } 742 743 assert(coldp != NULL); 744 m->coldp = coldp; 745 if (ISSET(st, stopst)) 746 return(p+1); 747 else 748 return(NULL); 749} 750 751/* 752 - slow - step through the string more deliberately 753 */ 754static char * /* where it ended */ 755slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 756{ 757 states st = m->st; 758 states empty = m->empty; 759 states tmp = m->tmp; 760 char *p = start; 761 int c = (start == m->beginp) ? OUT : *(start-1); 762 int lastc; /* previous c */ 763 int flagch; 764 int i; 765 char *matchp; /* last p at which a match ended */ 766 767 AT("slow", start, stop, startst, stopst); 768 CLEAR(st); 769 SET1(st, startst); 770 SP("sstart", st, *p); 771 st = step(m->g, startst, stopst, st, NOTHING, st); 772 matchp = NULL; 773 for (;;) { 774 /* next character */ 775 lastc = c; 776 c = (p == m->endp) ? OUT : *p; 777 778 /* is there an EOL and/or BOL between lastc and c? */ 779 flagch = '\0'; 780 i = 0; 781 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 782 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 783 flagch = BOL; 784 i = m->g->nbol; 785 } 786 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 787 (c == OUT && !(m->eflags®_NOTEOL)) ) { 788 flagch = (flagch == BOL) ? BOLEOL : EOL; 789 i += m->g->neol; 790 } 791 if (i != 0) { 792 for (; i > 0; i--) 793 st = step(m->g, startst, stopst, st, flagch, st); 794 SP("sboleol", st, c); 795 } 796 797 /* how about a word boundary? */ 798 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 799 (c != OUT && ISWORD(c)) ) { 800 flagch = BOW; 801 } 802 if ( (lastc != OUT && ISWORD(lastc)) && 803 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 804 flagch = EOW; 805 } 806 if (flagch == BOW || flagch == EOW) { 807 st = step(m->g, startst, stopst, st, flagch, st); 808 SP("sboweow", st, c); 809 } 810 811 /* are we done? */ 812 if (ISSET(st, stopst)) 813 matchp = p; 814 if (EQ(st, empty) || p == stop) 815 break; /* NOTE BREAK OUT */ 816 817 /* no, we must deal with this character */ 818 ASSIGN(tmp, st); 819 ASSIGN(st, empty); 820 assert(c != OUT); 821 st = step(m->g, startst, stopst, tmp, c, st); 822 SP("saft", st, c); 823 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 824 p++; 825 } 826 827 return(matchp); 828} 829 830 831/* 832 - step - map set of states reachable before char to set reachable after 833 */ 834static states 835step(struct re_guts *g, 836 sopno start, /* start state within strip */ 837 sopno stop, /* state after stop state within strip */ 838 states bef, /* states reachable before */ 839 int ch, /* character or NONCHAR code */ 840 states aft) /* states already known reachable after */ 841{ 842 cset *cs; 843 sop s; 844 sopno pc; 845 onestate here; /* note, macros know this name */ 846 sopno look; 847 int i; 848 849 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 850 s = g->strip[pc]; 851 switch (OP(s)) { 852 case OEND: 853 assert(pc == stop-1); 854 break; 855 case OCHAR: 856 /* only characters can match */ 857 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 858 if (ch == (char)OPND(s)) 859 FWD(aft, bef, 1); 860 break; 861 case OBOL: 862 if (ch == BOL || ch == BOLEOL) 863 FWD(aft, bef, 1); 864 break; 865 case OEOL: 866 if (ch == EOL || ch == BOLEOL) 867 FWD(aft, bef, 1); 868 break; 869 case OBOW: 870 if (ch == BOW) 871 FWD(aft, bef, 1); 872 break; 873 case OEOW: 874 if (ch == EOW) 875 FWD(aft, bef, 1); 876 break; 877 case OANY: 878 if (!NONCHAR(ch)) 879 FWD(aft, bef, 1); 880 break; 881 case OANYOF: 882 cs = &g->sets[OPND(s)]; 883 if (!NONCHAR(ch) && CHIN(cs, ch)) 884 FWD(aft, bef, 1); 885 break; 886 case OBACK_: /* ignored here */ 887 case O_BACK: 888 FWD(aft, aft, 1); 889 break; 890 case OPLUS_: /* forward, this is just an empty */ 891 FWD(aft, aft, 1); 892 break; 893 case O_PLUS: /* both forward and back */ 894 FWD(aft, aft, 1); 895 i = ISSETBACK(aft, OPND(s)); 896 BACK(aft, aft, OPND(s)); 897 if (!i && ISSETBACK(aft, OPND(s))) { 898 /* oho, must reconsider loop body */ 899 pc -= OPND(s) + 1; 900 INIT(here, pc); 901 } 902 break; 903 case OQUEST_: /* two branches, both forward */ 904 FWD(aft, aft, 1); 905 FWD(aft, aft, OPND(s)); 906 break; 907 case O_QUEST: /* just an empty */ 908 FWD(aft, aft, 1); 909 break; 910 case OLPAREN: /* not significant here */ 911 case ORPAREN: 912 FWD(aft, aft, 1); 913 break; 914 case OCH_: /* mark the first two branches */ 915 FWD(aft, aft, 1); 916 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 917 FWD(aft, aft, OPND(s)); 918 break; 919 case OOR1: /* done a branch, find the O_CH */ 920 if (ISSTATEIN(aft, here)) { 921 for (look = 1; 922 OP(s = g->strip[pc+look]) != O_CH; 923 look += OPND(s)) 924 assert(OP(s) == OOR2); 925 FWD(aft, aft, look); 926 } 927 break; 928 case OOR2: /* propagate OCH_'s marking */ 929 FWD(aft, aft, 1); 930 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 931 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 932 FWD(aft, aft, OPND(s)); 933 } 934 break; 935 case O_CH: /* just empty */ 936 FWD(aft, aft, 1); 937 break; 938 default: /* ooooops... */ 939 assert(nope); 940 break; 941 } 942 } 943 944 return(aft); 945} 946 947#ifdef REDEBUG 948/* 949 - print - print a set of states 950 */ 951static void 952print(struct match *m, char *caption, states st, int ch, FILE *d) 953{ 954 struct re_guts *g = m->g; 955 int i; 956 int first = 1; 957 958 if (!(m->eflags®_TRACE)) 959 return; 960 961 (void)fprintf(d, "%s", caption); 962 if (ch != '\0') 963 (void)fprintf(d, " %s", pchar(ch)); 964 for (i = 0; i < g->nstates; i++) 965 if (ISSET(st, i)) { 966 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 967 first = 0; 968 } 969 (void)fprintf(d, "\n"); 970} 971 972/* 973 - at - print current situation 974 */ 975static void 976at(struct match *m, char *title, char *start, char *stop, sopno startst, 977 sopno stopst) 978{ 979 if (!(m->eflags®_TRACE)) 980 return; 981 982 (void)printf("%s %s-", title, pchar(*start)); 983 (void)printf("%s ", pchar(*stop)); 984 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 985} 986 987#ifndef PCHARDONE 988#define PCHARDONE /* never again */ 989/* 990 - pchar - make a character printable 991 * 992 * Is this identical to regchar() over in debug.c? Well, yes. But a 993 * duplicate here avoids having a debugging-capable regexec.o tied to 994 * a matching debug.o, and this is convenient. It all disappears in 995 * the non-debug compilation anyway, so it doesn't matter much. 996 */ 997static char * /* -> representation */ 998pchar(int ch) 999{ 1000 static char pbuf[10]; 1001 1002 if (isprint(ch) || ch == ' ') 1003 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1004 else 1005 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1006 return(pbuf); 1007} 1008#endif 1009#endif 1010 1011#undef matcher 1012#undef fast 1013#undef slow 1014#undef dissect 1015#undef backref 1016#undef step 1017#undef print 1018#undef at 1019#undef match 1020#undef nope 1021