engine.c revision 1.14
1/* $OpenBSD: engine.c,v 1.14 2005/01/03 19:51:14 otto Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38#if defined(SNAMES) && defined(LIBC_SCCS) && !defined(lint) 39static char enginercsid[] = "$OpenBSD: engine.c,v 1.14 2005/01/03 19:51:14 otto Exp $"; 40#endif /* SNAMES and LIBC_SCCS and not lint */ 41 42/* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49#ifdef SNAMES 50#define matcher smatcher 51#define fast sfast 52#define slow sslow 53#define dissect sdissect 54#define backref sbackref 55#define step sstep 56#define print sprint 57#define at sat 58#define match smat 59#define nope snope 60#endif 61#ifdef LNAMES 62#define matcher lmatcher 63#define fast lfast 64#define slow lslow 65#define dissect ldissect 66#define backref lbackref 67#define step lstep 68#define print lprint 69#define at lat 70#define match lmat 71#define nope lnope 72#endif 73 74/* another structure passed up and down to avoid zillions of parameters */ 75struct match { 76 struct re_guts *g; 77 int eflags; 78 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 79 char *offp; /* offsets work from here */ 80 char *beginp; /* start of string -- virtual NUL precedes */ 81 char *endp; /* end of string -- virtual NUL here */ 82 char *coldp; /* can be no match starting before here */ 83 char **lastpos; /* [nplus+1] */ 84 STATEVARS; 85 states st; /* current states */ 86 states fresh; /* states for a fresh start */ 87 states tmp; /* temporary */ 88 states empty; /* empty set of states */ 89}; 90 91static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 92static char *dissect(struct match *, char *, char *, sopno, sopno); 93static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 94static char *fast(struct match *, char *, char *, sopno, sopno); 95static char *slow(struct match *, char *, char *, sopno, sopno); 96static states step(struct re_guts *, sopno, sopno, states, int, states); 97#define MAX_RECURSION 100 98#define BOL (OUT+1) 99#define EOL (BOL+1) 100#define BOLEOL (BOL+2) 101#define NOTHING (BOL+3) 102#define BOW (BOL+4) 103#define EOW (BOL+5) 104#define CODEMAX (BOL+5) /* highest code used */ 105#define NONCHAR(c) ((c) > CHAR_MAX) 106#define NNONCHAR (CODEMAX-CHAR_MAX) 107#ifdef REDEBUG 108static void print(struct match *, char *, states, int, FILE *); 109#endif 110#ifdef REDEBUG 111static void at(struct match *, char *, char *, char *, sopno, sopno); 112#endif 113#ifdef REDEBUG 114static char *pchar(int); 115#endif 116 117#ifdef REDEBUG 118#define SP(t, s, c) print(m, t, s, c, stdout) 119#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 120#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 121static int nope = 0; 122#else 123#define SP(t, s, c) /* nothing */ 124#define AT(t, p1, p2, s1, s2) /* nothing */ 125#define NOTE(s) /* nothing */ 126#endif 127 128/* 129 - matcher - the actual matching engine 130 */ 131static int /* 0 success, REG_NOMATCH failure */ 132matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 133 int eflags) 134{ 135 char *endp; 136 int i; 137 struct match mv; 138 struct match *m = &mv; 139 char *dp; 140 const sopno gf = g->firststate+1; /* +1 for OEND */ 141 const sopno gl = g->laststate; 142 char *start; 143 char *stop; 144 145 /* simplify the situation where possible */ 146 if (g->cflags®_NOSUB) 147 nmatch = 0; 148 if (eflags®_STARTEND) { 149 start = string + pmatch[0].rm_so; 150 stop = string + pmatch[0].rm_eo; 151 } else { 152 start = string; 153 stop = start + strlen(start); 154 } 155 if (stop < start) 156 return(REG_INVARG); 157 158 /* prescreening; this does wonders for this rather slow code */ 159 if (g->must != NULL) { 160 for (dp = start; dp < stop; dp++) 161 if (*dp == g->must[0] && stop - dp >= g->mlen && 162 memcmp(dp, g->must, (size_t)g->mlen) == 0) 163 break; 164 if (dp == stop) /* we didn't find g->must */ 165 return(REG_NOMATCH); 166 } 167 168 /* match struct setup */ 169 m->g = g; 170 m->eflags = eflags; 171 m->pmatch = NULL; 172 m->lastpos = NULL; 173 m->offp = string; 174 m->beginp = start; 175 m->endp = stop; 176 STATESETUP(m, 4); 177 SETUP(m->st); 178 SETUP(m->fresh); 179 SETUP(m->tmp); 180 SETUP(m->empty); 181 CLEAR(m->empty); 182 183 /* this loop does only one repetition except for backrefs */ 184 for (;;) { 185 endp = fast(m, start, stop, gf, gl); 186 if (endp == NULL) { /* a miss */ 187 free(m->pmatch); 188 free(m->lastpos); 189 STATETEARDOWN(m); 190 return(REG_NOMATCH); 191 } 192 if (nmatch == 0 && !g->backrefs) 193 break; /* no further info needed */ 194 195 /* where? */ 196 assert(m->coldp != NULL); 197 for (;;) { 198 NOTE("finding start"); 199 endp = slow(m, m->coldp, stop, gf, gl); 200 if (endp != NULL) 201 break; 202 assert(m->coldp < m->endp); 203 m->coldp++; 204 } 205 if (nmatch == 1 && !g->backrefs) 206 break; /* no further info needed */ 207 208 /* oh my, he wants the subexpressions... */ 209 if (m->pmatch == NULL) 210 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 211 sizeof(regmatch_t)); 212 if (m->pmatch == NULL) { 213 STATETEARDOWN(m); 214 return(REG_ESPACE); 215 } 216 for (i = 1; i <= m->g->nsub; i++) 217 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 218 if (!g->backrefs && !(m->eflags®_BACKR)) { 219 NOTE("dissecting"); 220 dp = dissect(m, m->coldp, endp, gf, gl); 221 } else { 222 if (g->nplus > 0 && m->lastpos == NULL) 223 m->lastpos = (char **)malloc((g->nplus+1) * 224 sizeof(char *)); 225 if (g->nplus > 0 && m->lastpos == NULL) { 226 free(m->pmatch); 227 STATETEARDOWN(m); 228 return(REG_ESPACE); 229 } 230 NOTE("backref dissect"); 231 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 232 } 233 if (dp != NULL) 234 break; 235 236 /* uh-oh... we couldn't find a subexpression-level match */ 237 assert(g->backrefs); /* must be back references doing it */ 238 assert(g->nplus == 0 || m->lastpos != NULL); 239 for (;;) { 240 if (dp != NULL || endp <= m->coldp) 241 break; /* defeat */ 242 NOTE("backoff"); 243 endp = slow(m, m->coldp, endp-1, gf, gl); 244 if (endp == NULL) 245 break; /* defeat */ 246 /* try it on a shorter possibility */ 247#ifndef NDEBUG 248 for (i = 1; i <= m->g->nsub; i++) { 249 assert(m->pmatch[i].rm_so == -1); 250 assert(m->pmatch[i].rm_eo == -1); 251 } 252#endif 253 NOTE("backoff dissect"); 254 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 255 } 256 assert(dp == NULL || dp == endp); 257 if (dp != NULL) /* found a shorter one */ 258 break; 259 260 /* despite initial appearances, there is no match here */ 261 NOTE("false alarm"); 262 if (m->coldp == stop) 263 break; 264 start = m->coldp + 1; /* recycle starting later */ 265 } 266 267 /* fill in the details if requested */ 268 if (nmatch > 0) { 269 pmatch[0].rm_so = m->coldp - m->offp; 270 pmatch[0].rm_eo = endp - m->offp; 271 } 272 if (nmatch > 1) { 273 assert(m->pmatch != NULL); 274 for (i = 1; i < nmatch; i++) 275 if (i <= m->g->nsub) 276 pmatch[i] = m->pmatch[i]; 277 else { 278 pmatch[i].rm_so = -1; 279 pmatch[i].rm_eo = -1; 280 } 281 } 282 283 if (m->pmatch != NULL) 284 free((char *)m->pmatch); 285 if (m->lastpos != NULL) 286 free((char *)m->lastpos); 287 STATETEARDOWN(m); 288 return(0); 289} 290 291/* 292 - dissect - figure out what matched what, no back references 293 */ 294static char * /* == stop (success) always */ 295dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 296{ 297 int i; 298 sopno ss; /* start sop of current subRE */ 299 sopno es; /* end sop of current subRE */ 300 char *sp; /* start of string matched by it */ 301 char *stp; /* string matched by it cannot pass here */ 302 char *rest; /* start of rest of string */ 303 char *tail; /* string unmatched by rest of RE */ 304 sopno ssub; /* start sop of subsubRE */ 305 sopno esub; /* end sop of subsubRE */ 306 char *ssp; /* start of string matched by subsubRE */ 307 char *sep; /* end of string matched by subsubRE */ 308 char *oldssp; /* previous ssp */ 309 char *dp; 310 311 AT("diss", start, stop, startst, stopst); 312 sp = start; 313 for (ss = startst; ss < stopst; ss = es) { 314 /* identify end of subRE */ 315 es = ss; 316 switch (OP(m->g->strip[es])) { 317 case OPLUS_: 318 case OQUEST_: 319 es += OPND(m->g->strip[es]); 320 break; 321 case OCH_: 322 while (OP(m->g->strip[es]) != O_CH) 323 es += OPND(m->g->strip[es]); 324 break; 325 } 326 es++; 327 328 /* figure out what it matched */ 329 switch (OP(m->g->strip[ss])) { 330 case OEND: 331 assert(nope); 332 break; 333 case OCHAR: 334 sp++; 335 break; 336 case OBOL: 337 case OEOL: 338 case OBOW: 339 case OEOW: 340 break; 341 case OANY: 342 case OANYOF: 343 sp++; 344 break; 345 case OBACK_: 346 case O_BACK: 347 assert(nope); 348 break; 349 /* cases where length of match is hard to find */ 350 case OQUEST_: 351 stp = stop; 352 for (;;) { 353 /* how long could this one be? */ 354 rest = slow(m, sp, stp, ss, es); 355 assert(rest != NULL); /* it did match */ 356 /* could the rest match the rest? */ 357 tail = slow(m, rest, stop, es, stopst); 358 if (tail == stop) 359 break; /* yes! */ 360 /* no -- try a shorter match for this one */ 361 stp = rest - 1; 362 assert(stp >= sp); /* it did work */ 363 } 364 ssub = ss + 1; 365 esub = es - 1; 366 /* did innards match? */ 367 if (slow(m, sp, rest, ssub, esub) != NULL) { 368 dp = dissect(m, sp, rest, ssub, esub); 369 assert(dp == rest); 370 } else /* no */ 371 assert(sp == rest); 372 sp = rest; 373 break; 374 case OPLUS_: 375 stp = stop; 376 for (;;) { 377 /* how long could this one be? */ 378 rest = slow(m, sp, stp, ss, es); 379 assert(rest != NULL); /* it did match */ 380 /* could the rest match the rest? */ 381 tail = slow(m, rest, stop, es, stopst); 382 if (tail == stop) 383 break; /* yes! */ 384 /* no -- try a shorter match for this one */ 385 stp = rest - 1; 386 assert(stp >= sp); /* it did work */ 387 } 388 ssub = ss + 1; 389 esub = es - 1; 390 ssp = sp; 391 oldssp = ssp; 392 for (;;) { /* find last match of innards */ 393 sep = slow(m, ssp, rest, ssub, esub); 394 if (sep == NULL || sep == ssp) 395 break; /* failed or matched null */ 396 oldssp = ssp; /* on to next try */ 397 ssp = sep; 398 } 399 if (sep == NULL) { 400 /* last successful match */ 401 sep = ssp; 402 ssp = oldssp; 403 } 404 assert(sep == rest); /* must exhaust substring */ 405 assert(slow(m, ssp, sep, ssub, esub) == rest); 406 dp = dissect(m, ssp, sep, ssub, esub); 407 assert(dp == sep); 408 sp = rest; 409 break; 410 case OCH_: 411 stp = stop; 412 for (;;) { 413 /* how long could this one be? */ 414 rest = slow(m, sp, stp, ss, es); 415 assert(rest != NULL); /* it did match */ 416 /* could the rest match the rest? */ 417 tail = slow(m, rest, stop, es, stopst); 418 if (tail == stop) 419 break; /* yes! */ 420 /* no -- try a shorter match for this one */ 421 stp = rest - 1; 422 assert(stp >= sp); /* it did work */ 423 } 424 ssub = ss + 1; 425 esub = ss + OPND(m->g->strip[ss]) - 1; 426 assert(OP(m->g->strip[esub]) == OOR1); 427 for (;;) { /* find first matching branch */ 428 if (slow(m, sp, rest, ssub, esub) == rest) 429 break; /* it matched all of it */ 430 /* that one missed, try next one */ 431 assert(OP(m->g->strip[esub]) == OOR1); 432 esub++; 433 assert(OP(m->g->strip[esub]) == OOR2); 434 ssub = esub + 1; 435 esub += OPND(m->g->strip[esub]); 436 if (OP(m->g->strip[esub]) == OOR2) 437 esub--; 438 else 439 assert(OP(m->g->strip[esub]) == O_CH); 440 } 441 dp = dissect(m, sp, rest, ssub, esub); 442 assert(dp == rest); 443 sp = rest; 444 break; 445 case O_PLUS: 446 case O_QUEST: 447 case OOR1: 448 case OOR2: 449 case O_CH: 450 assert(nope); 451 break; 452 case OLPAREN: 453 i = OPND(m->g->strip[ss]); 454 assert(0 < i && i <= m->g->nsub); 455 m->pmatch[i].rm_so = sp - m->offp; 456 break; 457 case ORPAREN: 458 i = OPND(m->g->strip[ss]); 459 assert(0 < i && i <= m->g->nsub); 460 m->pmatch[i].rm_eo = sp - m->offp; 461 break; 462 default: /* uh oh */ 463 assert(nope); 464 break; 465 } 466 } 467 468 assert(sp == stop); 469 return(sp); 470} 471 472/* 473 - backref - figure out what matched what, figuring in back references 474 */ 475static char * /* == stop (success) or NULL (failure) */ 476backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 477 sopno lev, int rec) /* PLUS nesting level */ 478{ 479 int i; 480 sopno ss; /* start sop of current subRE */ 481 char *sp; /* start of string matched by it */ 482 sopno ssub; /* start sop of subsubRE */ 483 sopno esub; /* end sop of subsubRE */ 484 char *ssp; /* start of string matched by subsubRE */ 485 char *dp; 486 size_t len; 487 int hard; 488 sop s; 489 regoff_t offsave; 490 cset *cs; 491 492 AT("back", start, stop, startst, stopst); 493 sp = start; 494 495 /* get as far as we can with easy stuff */ 496 hard = 0; 497 for (ss = startst; !hard && ss < stopst; ss++) 498 switch (OP(s = m->g->strip[ss])) { 499 case OCHAR: 500 if (sp == stop || *sp++ != (char)OPND(s)) 501 return(NULL); 502 break; 503 case OANY: 504 if (sp == stop) 505 return(NULL); 506 sp++; 507 break; 508 case OANYOF: 509 cs = &m->g->sets[OPND(s)]; 510 if (sp == stop || !CHIN(cs, *sp++)) 511 return(NULL); 512 break; 513 case OBOL: 514 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 515 (sp < m->endp && *(sp-1) == '\n' && 516 (m->g->cflags®_NEWLINE)) ) 517 { /* yes */ } 518 else 519 return(NULL); 520 break; 521 case OEOL: 522 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 523 (sp < m->endp && *sp == '\n' && 524 (m->g->cflags®_NEWLINE)) ) 525 { /* yes */ } 526 else 527 return(NULL); 528 break; 529 case OBOW: 530 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 531 (sp < m->endp && *(sp-1) == '\n' && 532 (m->g->cflags®_NEWLINE)) || 533 (sp > m->beginp && 534 !ISWORD(*(sp-1))) ) && 535 (sp < m->endp && ISWORD(*sp)) ) 536 { /* yes */ } 537 else 538 return(NULL); 539 break; 540 case OEOW: 541 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 542 (sp < m->endp && *sp == '\n' && 543 (m->g->cflags®_NEWLINE)) || 544 (sp < m->endp && !ISWORD(*sp)) ) && 545 (sp > m->beginp && ISWORD(*(sp-1))) ) 546 { /* yes */ } 547 else 548 return(NULL); 549 break; 550 case O_QUEST: 551 break; 552 case OOR1: /* matches null but needs to skip */ 553 ss++; 554 s = m->g->strip[ss]; 555 do { 556 assert(OP(s) == OOR2); 557 ss += OPND(s); 558 } while (OP(s = m->g->strip[ss]) != O_CH); 559 /* note that the ss++ gets us past the O_CH */ 560 break; 561 default: /* have to make a choice */ 562 hard = 1; 563 break; 564 } 565 if (!hard) { /* that was it! */ 566 if (sp != stop) 567 return(NULL); 568 return(sp); 569 } 570 ss--; /* adjust for the for's final increment */ 571 572 /* the hard stuff */ 573 AT("hard", sp, stop, ss, stopst); 574 s = m->g->strip[ss]; 575 switch (OP(s)) { 576 case OBACK_: /* the vilest depths */ 577 i = OPND(s); 578 assert(0 < i && i <= m->g->nsub); 579 if (m->pmatch[i].rm_eo == -1) 580 return(NULL); 581 assert(m->pmatch[i].rm_so != -1); 582 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 583 if (len == 0 && rec++ > MAX_RECURSION) 584 return(NULL); 585 assert(stop - m->beginp >= len); 586 if (sp > stop - len) 587 return(NULL); /* not enough left to match */ 588 ssp = m->offp + m->pmatch[i].rm_so; 589 if (memcmp(sp, ssp, len) != 0) 590 return(NULL); 591 while (m->g->strip[ss] != SOP(O_BACK, i)) 592 ss++; 593 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 594 break; 595 case OQUEST_: /* to null or not */ 596 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 597 if (dp != NULL) 598 return(dp); /* not */ 599 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 600 break; 601 case OPLUS_: 602 assert(m->lastpos != NULL); 603 assert(lev+1 <= m->g->nplus); 604 m->lastpos[lev+1] = sp; 605 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 606 break; 607 case O_PLUS: 608 if (sp == m->lastpos[lev]) /* last pass matched null */ 609 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 610 /* try another pass */ 611 m->lastpos[lev] = sp; 612 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 613 if (dp == NULL) 614 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 615 else 616 return(dp); 617 break; 618 case OCH_: /* find the right one, if any */ 619 ssub = ss + 1; 620 esub = ss + OPND(s) - 1; 621 assert(OP(m->g->strip[esub]) == OOR1); 622 for (;;) { /* find first matching branch */ 623 dp = backref(m, sp, stop, ssub, esub, lev, rec); 624 if (dp != NULL) 625 return(dp); 626 /* that one missed, try next one */ 627 if (OP(m->g->strip[esub]) == O_CH) 628 return(NULL); /* there is none */ 629 esub++; 630 assert(OP(m->g->strip[esub]) == OOR2); 631 ssub = esub + 1; 632 esub += OPND(m->g->strip[esub]); 633 if (OP(m->g->strip[esub]) == OOR2) 634 esub--; 635 else 636 assert(OP(m->g->strip[esub]) == O_CH); 637 } 638 break; 639 case OLPAREN: /* must undo assignment if rest fails */ 640 i = OPND(s); 641 assert(0 < i && i <= m->g->nsub); 642 offsave = m->pmatch[i].rm_so; 643 m->pmatch[i].rm_so = sp - m->offp; 644 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 645 if (dp != NULL) 646 return(dp); 647 m->pmatch[i].rm_so = offsave; 648 return(NULL); 649 break; 650 case ORPAREN: /* must undo assignment if rest fails */ 651 i = OPND(s); 652 assert(0 < i && i <= m->g->nsub); 653 offsave = m->pmatch[i].rm_eo; 654 m->pmatch[i].rm_eo = sp - m->offp; 655 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 656 if (dp != NULL) 657 return(dp); 658 m->pmatch[i].rm_eo = offsave; 659 return(NULL); 660 break; 661 default: /* uh oh */ 662 assert(nope); 663 break; 664 } 665 666 /* "can't happen" */ 667 assert(nope); 668 /* NOTREACHED */ 669} 670 671/* 672 - fast - step through the string at top speed 673 */ 674static char * /* where tentative match ended, or NULL */ 675fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 676{ 677 states st = m->st; 678 states fresh = m->fresh; 679 states tmp = m->tmp; 680 char *p = start; 681 int c = (start == m->beginp) ? OUT : *(start-1); 682 int lastc; /* previous c */ 683 int flagch; 684 int i; 685 char *coldp; /* last p after which no match was underway */ 686 687 CLEAR(st); 688 SET1(st, startst); 689 st = step(m->g, startst, stopst, st, NOTHING, st); 690 ASSIGN(fresh, st); 691 SP("start", st, *p); 692 coldp = NULL; 693 for (;;) { 694 /* next character */ 695 lastc = c; 696 c = (p == m->endp) ? OUT : *p; 697 if (EQ(st, fresh)) 698 coldp = p; 699 700 /* is there an EOL and/or BOL between lastc and c? */ 701 flagch = '\0'; 702 i = 0; 703 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 704 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 705 flagch = BOL; 706 i = m->g->nbol; 707 } 708 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 709 (c == OUT && !(m->eflags®_NOTEOL)) ) { 710 flagch = (flagch == BOL) ? BOLEOL : EOL; 711 i += m->g->neol; 712 } 713 if (i != 0) { 714 for (; i > 0; i--) 715 st = step(m->g, startst, stopst, st, flagch, st); 716 SP("boleol", st, c); 717 } 718 719 /* how about a word boundary? */ 720 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 721 (c != OUT && ISWORD(c)) ) { 722 flagch = BOW; 723 } 724 if ( (lastc != OUT && ISWORD(lastc)) && 725 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 726 flagch = EOW; 727 } 728 if (flagch == BOW || flagch == EOW) { 729 st = step(m->g, startst, stopst, st, flagch, st); 730 SP("boweow", st, c); 731 } 732 733 /* are we done? */ 734 if (ISSET(st, stopst) || p == stop) 735 break; /* NOTE BREAK OUT */ 736 737 /* no, we must deal with this character */ 738 ASSIGN(tmp, st); 739 ASSIGN(st, fresh); 740 assert(c != OUT); 741 st = step(m->g, startst, stopst, tmp, c, st); 742 SP("aft", st, c); 743 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 744 p++; 745 } 746 747 assert(coldp != NULL); 748 m->coldp = coldp; 749 if (ISSET(st, stopst)) 750 return(p+1); 751 else 752 return(NULL); 753} 754 755/* 756 - slow - step through the string more deliberately 757 */ 758static char * /* where it ended */ 759slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 760{ 761 states st = m->st; 762 states empty = m->empty; 763 states tmp = m->tmp; 764 char *p = start; 765 int c = (start == m->beginp) ? OUT : *(start-1); 766 int lastc; /* previous c */ 767 int flagch; 768 int i; 769 char *matchp; /* last p at which a match ended */ 770 771 AT("slow", start, stop, startst, stopst); 772 CLEAR(st); 773 SET1(st, startst); 774 SP("sstart", st, *p); 775 st = step(m->g, startst, stopst, st, NOTHING, st); 776 matchp = NULL; 777 for (;;) { 778 /* next character */ 779 lastc = c; 780 c = (p == m->endp) ? OUT : *p; 781 782 /* is there an EOL and/or BOL between lastc and c? */ 783 flagch = '\0'; 784 i = 0; 785 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 786 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 787 flagch = BOL; 788 i = m->g->nbol; 789 } 790 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 791 (c == OUT && !(m->eflags®_NOTEOL)) ) { 792 flagch = (flagch == BOL) ? BOLEOL : EOL; 793 i += m->g->neol; 794 } 795 if (i != 0) { 796 for (; i > 0; i--) 797 st = step(m->g, startst, stopst, st, flagch, st); 798 SP("sboleol", st, c); 799 } 800 801 /* how about a word boundary? */ 802 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 803 (c != OUT && ISWORD(c)) ) { 804 flagch = BOW; 805 } 806 if ( (lastc != OUT && ISWORD(lastc)) && 807 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 808 flagch = EOW; 809 } 810 if (flagch == BOW || flagch == EOW) { 811 st = step(m->g, startst, stopst, st, flagch, st); 812 SP("sboweow", st, c); 813 } 814 815 /* are we done? */ 816 if (ISSET(st, stopst)) 817 matchp = p; 818 if (EQ(st, empty) || p == stop) 819 break; /* NOTE BREAK OUT */ 820 821 /* no, we must deal with this character */ 822 ASSIGN(tmp, st); 823 ASSIGN(st, empty); 824 assert(c != OUT); 825 st = step(m->g, startst, stopst, tmp, c, st); 826 SP("saft", st, c); 827 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 828 p++; 829 } 830 831 return(matchp); 832} 833 834 835/* 836 - step - map set of states reachable before char to set reachable after 837 */ 838static states 839step(struct re_guts *g, 840 sopno start, /* start state within strip */ 841 sopno stop, /* state after stop state within strip */ 842 states bef, /* states reachable before */ 843 int ch, /* character or NONCHAR code */ 844 states aft) /* states already known reachable after */ 845{ 846 cset *cs; 847 sop s; 848 sopno pc; 849 onestate here; /* note, macros know this name */ 850 sopno look; 851 int i; 852 853 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 854 s = g->strip[pc]; 855 switch (OP(s)) { 856 case OEND: 857 assert(pc == stop-1); 858 break; 859 case OCHAR: 860 /* only characters can match */ 861 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 862 if (ch == (char)OPND(s)) 863 FWD(aft, bef, 1); 864 break; 865 case OBOL: 866 if (ch == BOL || ch == BOLEOL) 867 FWD(aft, bef, 1); 868 break; 869 case OEOL: 870 if (ch == EOL || ch == BOLEOL) 871 FWD(aft, bef, 1); 872 break; 873 case OBOW: 874 if (ch == BOW) 875 FWD(aft, bef, 1); 876 break; 877 case OEOW: 878 if (ch == EOW) 879 FWD(aft, bef, 1); 880 break; 881 case OANY: 882 if (!NONCHAR(ch)) 883 FWD(aft, bef, 1); 884 break; 885 case OANYOF: 886 cs = &g->sets[OPND(s)]; 887 if (!NONCHAR(ch) && CHIN(cs, ch)) 888 FWD(aft, bef, 1); 889 break; 890 case OBACK_: /* ignored here */ 891 case O_BACK: 892 FWD(aft, aft, 1); 893 break; 894 case OPLUS_: /* forward, this is just an empty */ 895 FWD(aft, aft, 1); 896 break; 897 case O_PLUS: /* both forward and back */ 898 FWD(aft, aft, 1); 899 i = ISSETBACK(aft, OPND(s)); 900 BACK(aft, aft, OPND(s)); 901 if (!i && ISSETBACK(aft, OPND(s))) { 902 /* oho, must reconsider loop body */ 903 pc -= OPND(s) + 1; 904 INIT(here, pc); 905 } 906 break; 907 case OQUEST_: /* two branches, both forward */ 908 FWD(aft, aft, 1); 909 FWD(aft, aft, OPND(s)); 910 break; 911 case O_QUEST: /* just an empty */ 912 FWD(aft, aft, 1); 913 break; 914 case OLPAREN: /* not significant here */ 915 case ORPAREN: 916 FWD(aft, aft, 1); 917 break; 918 case OCH_: /* mark the first two branches */ 919 FWD(aft, aft, 1); 920 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 921 FWD(aft, aft, OPND(s)); 922 break; 923 case OOR1: /* done a branch, find the O_CH */ 924 if (ISSTATEIN(aft, here)) { 925 for (look = 1; 926 OP(s = g->strip[pc+look]) != O_CH; 927 look += OPND(s)) 928 assert(OP(s) == OOR2); 929 FWD(aft, aft, look); 930 } 931 break; 932 case OOR2: /* propagate OCH_'s marking */ 933 FWD(aft, aft, 1); 934 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 935 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 936 FWD(aft, aft, OPND(s)); 937 } 938 break; 939 case O_CH: /* just empty */ 940 FWD(aft, aft, 1); 941 break; 942 default: /* ooooops... */ 943 assert(nope); 944 break; 945 } 946 } 947 948 return(aft); 949} 950 951#ifdef REDEBUG 952/* 953 - print - print a set of states 954 */ 955static void 956print(struct match *m, char *caption, states st, int ch, FILE *d) 957{ 958 struct re_guts *g = m->g; 959 int i; 960 int first = 1; 961 962 if (!(m->eflags®_TRACE)) 963 return; 964 965 (void)fprintf(d, "%s", caption); 966 if (ch != '\0') 967 (void)fprintf(d, " %s", pchar(ch)); 968 for (i = 0; i < g->nstates; i++) 969 if (ISSET(st, i)) { 970 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 971 first = 0; 972 } 973 (void)fprintf(d, "\n"); 974} 975 976/* 977 - at - print current situation 978 */ 979static void 980at(struct match *m, char *title, char *start, char *stop, sopno startst, 981 sopno stopst) 982{ 983 if (!(m->eflags®_TRACE)) 984 return; 985 986 (void)printf("%s %s-", title, pchar(*start)); 987 (void)printf("%s ", pchar(*stop)); 988 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 989} 990 991#ifndef PCHARDONE 992#define PCHARDONE /* never again */ 993/* 994 - pchar - make a character printable 995 * 996 * Is this identical to regchar() over in debug.c? Well, yes. But a 997 * duplicate here avoids having a debugging-capable regexec.o tied to 998 * a matching debug.o, and this is convenient. It all disappears in 999 * the non-debug compilation anyway, so it doesn't matter much. 1000 */ 1001static char * /* -> representation */ 1002pchar(int ch) 1003{ 1004 static char pbuf[10]; 1005 1006 if (isprint(ch) || ch == ' ') 1007 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1008 else 1009 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1010 return(pbuf); 1011} 1012#endif 1013#endif 1014 1015#undef matcher 1016#undef fast 1017#undef slow 1018#undef dissect 1019#undef backref 1020#undef step 1021#undef print 1022#undef at 1023#undef match 1024#undef nope 1025