engine.c revision 1.10
1/* $OpenBSD: engine.c,v 1.10 2004/08/13 14:52:49 millert Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38#if defined(SNAMES) && defined(LIBC_SCCS) && !defined(lint) 39static char enginercsid[] = "$OpenBSD: engine.c,v 1.10 2004/08/13 14:52:49 millert Exp $"; 40#endif /* SNAMES and LIBC_SCCS and not lint */ 41 42/* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49#ifdef SNAMES 50#define matcher smatcher 51#define fast sfast 52#define slow sslow 53#define dissect sdissect 54#define backref sbackref 55#define step sstep 56#define print sprint 57#define at sat 58#define match smat 59#define nope snope 60#endif 61#ifdef LNAMES 62#define matcher lmatcher 63#define fast lfast 64#define slow lslow 65#define dissect ldissect 66#define backref lbackref 67#define step lstep 68#define print lprint 69#define at lat 70#define match lmat 71#define nope lnope 72#endif 73 74/* another structure passed up and down to avoid zillions of parameters */ 75struct match { 76 struct re_guts *g; 77 int eflags; 78 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 79 char *offp; /* offsets work from here */ 80 char *beginp; /* start of string -- virtual NUL precedes */ 81 char *endp; /* end of string -- virtual NUL here */ 82 char *coldp; /* can be no match starting before here */ 83 char **lastpos; /* [nplus+1] */ 84 STATEVARS; 85 states st; /* current states */ 86 states fresh; /* states for a fresh start */ 87 states tmp; /* temporary */ 88 states empty; /* empty set of states */ 89}; 90 91/* ========= begin header generated by ./mkh ========= */ 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* === engine.c === */ 97static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 98static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 99static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); 100static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 101static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 102static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); 103#define BOL (OUT+1) 104#define EOL (BOL+1) 105#define BOLEOL (BOL+2) 106#define NOTHING (BOL+3) 107#define BOW (BOL+4) 108#define EOW (BOL+5) 109#define CODEMAX (BOL+5) /* highest code used */ 110#define NONCHAR(c) ((c) > CHAR_MAX) 111#define NNONCHAR (CODEMAX-CHAR_MAX) 112#ifdef REDEBUG 113static void print(struct match *m, char *caption, states st, int ch, FILE *d); 114#endif 115#ifdef REDEBUG 116static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 117#endif 118#ifdef REDEBUG 119static char *pchar(int ch); 120#endif 121 122#ifdef __cplusplus 123} 124#endif 125/* ========= end header generated by ./mkh ========= */ 126 127#ifdef REDEBUG 128#define SP(t, s, c) print(m, t, s, c, stdout) 129#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 130#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 131static int nope = 0; 132#else 133#define SP(t, s, c) /* nothing */ 134#define AT(t, p1, p2, s1, s2) /* nothing */ 135#define NOTE(s) /* nothing */ 136#endif 137 138/* 139 - matcher - the actual matching engine 140 == static int matcher(register struct re_guts *g, char *string, \ 141 == size_t nmatch, regmatch_t pmatch[], int eflags); 142 */ 143static int /* 0 success, REG_NOMATCH failure */ 144matcher(g, string, nmatch, pmatch, eflags) 145register struct re_guts *g; 146char *string; 147size_t nmatch; 148regmatch_t pmatch[]; 149int eflags; 150{ 151 register char *endp; 152 register int i; 153 struct match mv; 154 register struct match *m = &mv; 155 register char *dp; 156 const register sopno gf = g->firststate+1; /* +1 for OEND */ 157 const register sopno gl = g->laststate; 158 char *start; 159 char *stop; 160 161 /* simplify the situation where possible */ 162 if (g->cflags®_NOSUB) 163 nmatch = 0; 164 if (eflags®_STARTEND) { 165 start = string + pmatch[0].rm_so; 166 stop = string + pmatch[0].rm_eo; 167 } else { 168 start = string; 169 stop = start + strlen(start); 170 } 171 if (stop < start) 172 return(REG_INVARG); 173 174 /* prescreening; this does wonders for this rather slow code */ 175 if (g->must != NULL) { 176 for (dp = start; dp < stop; dp++) 177 if (*dp == g->must[0] && stop - dp >= g->mlen && 178 memcmp(dp, g->must, (size_t)g->mlen) == 0) 179 break; 180 if (dp == stop) /* we didn't find g->must */ 181 return(REG_NOMATCH); 182 } 183 184 /* match struct setup */ 185 m->g = g; 186 m->eflags = eflags; 187 m->pmatch = NULL; 188 m->lastpos = NULL; 189 m->offp = string; 190 m->beginp = start; 191 m->endp = stop; 192 STATESETUP(m, 4); 193 SETUP(m->st); 194 SETUP(m->fresh); 195 SETUP(m->tmp); 196 SETUP(m->empty); 197 CLEAR(m->empty); 198 199 /* this loop does only one repetition except for backrefs */ 200 for (;;) { 201 endp = fast(m, start, stop, gf, gl); 202 if (endp == NULL) { /* a miss */ 203 STATETEARDOWN(m); 204 return(REG_NOMATCH); 205 } 206 if (nmatch == 0 && !g->backrefs) 207 break; /* no further info needed */ 208 209 /* where? */ 210 assert(m->coldp != NULL); 211 for (;;) { 212 NOTE("finding start"); 213 endp = slow(m, m->coldp, stop, gf, gl); 214 if (endp != NULL) 215 break; 216 assert(m->coldp < m->endp); 217 m->coldp++; 218 } 219 if (nmatch == 1 && !g->backrefs) 220 break; /* no further info needed */ 221 222 /* oh my, he wants the subexpressions... */ 223 if (m->pmatch == NULL) 224 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 225 sizeof(regmatch_t)); 226 if (m->pmatch == NULL) { 227 STATETEARDOWN(m); 228 return(REG_ESPACE); 229 } 230 for (i = 1; i <= m->g->nsub; i++) 231 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 232 if (!g->backrefs && !(m->eflags®_BACKR)) { 233 NOTE("dissecting"); 234 dp = dissect(m, m->coldp, endp, gf, gl); 235 } else { 236 if (g->nplus > 0 && m->lastpos == NULL) 237 m->lastpos = (char **)malloc((g->nplus+1) * 238 sizeof(char *)); 239 if (g->nplus > 0 && m->lastpos == NULL) { 240 free(m->pmatch); 241 STATETEARDOWN(m); 242 return(REG_ESPACE); 243 } 244 NOTE("backref dissect"); 245 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 246 } 247 if (dp != NULL) 248 break; 249 250 /* uh-oh... we couldn't find a subexpression-level match */ 251 assert(g->backrefs); /* must be back references doing it */ 252 assert(g->nplus == 0 || m->lastpos != NULL); 253 for (;;) { 254 if (dp != NULL || endp <= m->coldp) 255 break; /* defeat */ 256 NOTE("backoff"); 257 endp = slow(m, m->coldp, endp-1, gf, gl); 258 if (endp == NULL) 259 break; /* defeat */ 260 /* try it on a shorter possibility */ 261#ifndef NDEBUG 262 for (i = 1; i <= m->g->nsub; i++) { 263 assert(m->pmatch[i].rm_so == -1); 264 assert(m->pmatch[i].rm_eo == -1); 265 } 266#endif 267 NOTE("backoff dissect"); 268 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 269 } 270 assert(dp == NULL || dp == endp); 271 if (dp != NULL) /* found a shorter one */ 272 break; 273 274 /* despite initial appearances, there is no match here */ 275 NOTE("false alarm"); 276 if (m->coldp == stop) 277 break; 278 start = m->coldp + 1; /* recycle starting later */ 279 } 280 281 /* fill in the details if requested */ 282 if (nmatch > 0) { 283 pmatch[0].rm_so = m->coldp - m->offp; 284 pmatch[0].rm_eo = endp - m->offp; 285 } 286 if (nmatch > 1) { 287 assert(m->pmatch != NULL); 288 for (i = 1; i < nmatch; i++) 289 if (i <= m->g->nsub) 290 pmatch[i] = m->pmatch[i]; 291 else { 292 pmatch[i].rm_so = -1; 293 pmatch[i].rm_eo = -1; 294 } 295 } 296 297 if (m->pmatch != NULL) 298 free((char *)m->pmatch); 299 if (m->lastpos != NULL) 300 free((char *)m->lastpos); 301 STATETEARDOWN(m); 302 return(0); 303} 304 305/* 306 - dissect - figure out what matched what, no back references 307 == static char *dissect(register struct match *m, char *start, \ 308 == char *stop, sopno startst, sopno stopst); 309 */ 310static char * /* == stop (success) always */ 311dissect(m, start, stop, startst, stopst) 312register struct match *m; 313char *start; 314char *stop; 315sopno startst; 316sopno stopst; 317{ 318 register int i; 319 register sopno ss; /* start sop of current subRE */ 320 register sopno es; /* end sop of current subRE */ 321 register char *sp; /* start of string matched by it */ 322 register char *stp; /* string matched by it cannot pass here */ 323 register char *rest; /* start of rest of string */ 324 register char *tail; /* string unmatched by rest of RE */ 325 register sopno ssub; /* start sop of subsubRE */ 326 register sopno esub; /* end sop of subsubRE */ 327 register char *ssp; /* start of string matched by subsubRE */ 328 register char *sep; /* end of string matched by subsubRE */ 329 register char *oldssp; /* previous ssp */ 330 register char *dp; 331 332 AT("diss", start, stop, startst, stopst); 333 sp = start; 334 for (ss = startst; ss < stopst; ss = es) { 335 /* identify end of subRE */ 336 es = ss; 337 switch (OP(m->g->strip[es])) { 338 case OPLUS_: 339 case OQUEST_: 340 es += OPND(m->g->strip[es]); 341 break; 342 case OCH_: 343 while (OP(m->g->strip[es]) != O_CH) 344 es += OPND(m->g->strip[es]); 345 break; 346 } 347 es++; 348 349 /* figure out what it matched */ 350 switch (OP(m->g->strip[ss])) { 351 case OEND: 352 assert(nope); 353 break; 354 case OCHAR: 355 sp++; 356 break; 357 case OBOL: 358 case OEOL: 359 case OBOW: 360 case OEOW: 361 break; 362 case OANY: 363 case OANYOF: 364 sp++; 365 break; 366 case OBACK_: 367 case O_BACK: 368 assert(nope); 369 break; 370 /* cases where length of match is hard to find */ 371 case OQUEST_: 372 stp = stop; 373 for (;;) { 374 /* how long could this one be? */ 375 rest = slow(m, sp, stp, ss, es); 376 assert(rest != NULL); /* it did match */ 377 /* could the rest match the rest? */ 378 tail = slow(m, rest, stop, es, stopst); 379 if (tail == stop) 380 break; /* yes! */ 381 /* no -- try a shorter match for this one */ 382 stp = rest - 1; 383 assert(stp >= sp); /* it did work */ 384 } 385 ssub = ss + 1; 386 esub = es - 1; 387 /* did innards match? */ 388 if (slow(m, sp, rest, ssub, esub) != NULL) { 389 dp = dissect(m, sp, rest, ssub, esub); 390 assert(dp == rest); 391 } else /* no */ 392 assert(sp == rest); 393 sp = rest; 394 break; 395 case OPLUS_: 396 stp = stop; 397 for (;;) { 398 /* how long could this one be? */ 399 rest = slow(m, sp, stp, ss, es); 400 assert(rest != NULL); /* it did match */ 401 /* could the rest match the rest? */ 402 tail = slow(m, rest, stop, es, stopst); 403 if (tail == stop) 404 break; /* yes! */ 405 /* no -- try a shorter match for this one */ 406 stp = rest - 1; 407 assert(stp >= sp); /* it did work */ 408 } 409 ssub = ss + 1; 410 esub = es - 1; 411 ssp = sp; 412 oldssp = ssp; 413 for (;;) { /* find last match of innards */ 414 sep = slow(m, ssp, rest, ssub, esub); 415 if (sep == NULL || sep == ssp) 416 break; /* failed or matched null */ 417 oldssp = ssp; /* on to next try */ 418 ssp = sep; 419 } 420 if (sep == NULL) { 421 /* last successful match */ 422 sep = ssp; 423 ssp = oldssp; 424 } 425 assert(sep == rest); /* must exhaust substring */ 426 assert(slow(m, ssp, sep, ssub, esub) == rest); 427 dp = dissect(m, ssp, sep, ssub, esub); 428 assert(dp == sep); 429 sp = rest; 430 break; 431 case OCH_: 432 stp = stop; 433 for (;;) { 434 /* how long could this one be? */ 435 rest = slow(m, sp, stp, ss, es); 436 assert(rest != NULL); /* it did match */ 437 /* could the rest match the rest? */ 438 tail = slow(m, rest, stop, es, stopst); 439 if (tail == stop) 440 break; /* yes! */ 441 /* no -- try a shorter match for this one */ 442 stp = rest - 1; 443 assert(stp >= sp); /* it did work */ 444 } 445 ssub = ss + 1; 446 esub = ss + OPND(m->g->strip[ss]) - 1; 447 assert(OP(m->g->strip[esub]) == OOR1); 448 for (;;) { /* find first matching branch */ 449 if (slow(m, sp, rest, ssub, esub) == rest) 450 break; /* it matched all of it */ 451 /* that one missed, try next one */ 452 assert(OP(m->g->strip[esub]) == OOR1); 453 esub++; 454 assert(OP(m->g->strip[esub]) == OOR2); 455 ssub = esub + 1; 456 esub += OPND(m->g->strip[esub]); 457 if (OP(m->g->strip[esub]) == OOR2) 458 esub--; 459 else 460 assert(OP(m->g->strip[esub]) == O_CH); 461 } 462 dp = dissect(m, sp, rest, ssub, esub); 463 assert(dp == rest); 464 sp = rest; 465 break; 466 case O_PLUS: 467 case O_QUEST: 468 case OOR1: 469 case OOR2: 470 case O_CH: 471 assert(nope); 472 break; 473 case OLPAREN: 474 i = OPND(m->g->strip[ss]); 475 assert(0 < i && i <= m->g->nsub); 476 m->pmatch[i].rm_so = sp - m->offp; 477 break; 478 case ORPAREN: 479 i = OPND(m->g->strip[ss]); 480 assert(0 < i && i <= m->g->nsub); 481 m->pmatch[i].rm_eo = sp - m->offp; 482 break; 483 default: /* uh oh */ 484 assert(nope); 485 break; 486 } 487 } 488 489 assert(sp == stop); 490 return(sp); 491} 492 493/* 494 - backref - figure out what matched what, figuring in back references 495 == static char *backref(register struct match *m, char *start, \ 496 == char *stop, sopno startst, sopno stopst, sopno lev); 497 */ 498static char * /* == stop (success) or NULL (failure) */ 499backref(m, start, stop, startst, stopst, lev) 500register struct match *m; 501char *start; 502char *stop; 503sopno startst; 504sopno stopst; 505sopno lev; /* PLUS nesting level */ 506{ 507 register int i; 508 register sopno ss; /* start sop of current subRE */ 509 register char *sp; /* start of string matched by it */ 510 register sopno ssub; /* start sop of subsubRE */ 511 register sopno esub; /* end sop of subsubRE */ 512 register char *ssp; /* start of string matched by subsubRE */ 513 register char *dp; 514 register size_t len; 515 register int hard; 516 register sop s; 517 register regoff_t offsave; 518 register cset *cs; 519 520 AT("back", start, stop, startst, stopst); 521 sp = start; 522 523 /* get as far as we can with easy stuff */ 524 hard = 0; 525 for (ss = startst; !hard && ss < stopst; ss++) 526 switch (OP(s = m->g->strip[ss])) { 527 case OCHAR: 528 if (sp == stop || *sp++ != (char)OPND(s)) 529 return(NULL); 530 break; 531 case OANY: 532 if (sp == stop) 533 return(NULL); 534 sp++; 535 break; 536 case OANYOF: 537 cs = &m->g->sets[OPND(s)]; 538 if (sp == stop || !CHIN(cs, *sp++)) 539 return(NULL); 540 break; 541 case OBOL: 542 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 543 (sp < m->endp && *(sp-1) == '\n' && 544 (m->g->cflags®_NEWLINE)) ) 545 { /* yes */ } 546 else 547 return(NULL); 548 break; 549 case OEOL: 550 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 551 (sp < m->endp && *sp == '\n' && 552 (m->g->cflags®_NEWLINE)) ) 553 { /* yes */ } 554 else 555 return(NULL); 556 break; 557 case OBOW: 558 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 559 (sp < m->endp && *(sp-1) == '\n' && 560 (m->g->cflags®_NEWLINE)) || 561 (sp > m->beginp && 562 !ISWORD(*(sp-1))) ) && 563 (sp < m->endp && ISWORD(*sp)) ) 564 { /* yes */ } 565 else 566 return(NULL); 567 break; 568 case OEOW: 569 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 570 (sp < m->endp && *sp == '\n' && 571 (m->g->cflags®_NEWLINE)) || 572 (sp < m->endp && !ISWORD(*sp)) ) && 573 (sp > m->beginp && ISWORD(*(sp-1))) ) 574 { /* yes */ } 575 else 576 return(NULL); 577 break; 578 case O_QUEST: 579 break; 580 case OOR1: /* matches null but needs to skip */ 581 ss++; 582 s = m->g->strip[ss]; 583 do { 584 assert(OP(s) == OOR2); 585 ss += OPND(s); 586 } while (OP(s = m->g->strip[ss]) != O_CH); 587 /* note that the ss++ gets us past the O_CH */ 588 break; 589 default: /* have to make a choice */ 590 hard = 1; 591 break; 592 } 593 if (!hard) { /* that was it! */ 594 if (sp != stop) 595 return(NULL); 596 return(sp); 597 } 598 ss--; /* adjust for the for's final increment */ 599 600 /* the hard stuff */ 601 AT("hard", sp, stop, ss, stopst); 602 s = m->g->strip[ss]; 603 switch (OP(s)) { 604 case OBACK_: /* the vilest depths */ 605 i = OPND(s); 606 assert(0 < i && i <= m->g->nsub); 607 if (m->pmatch[i].rm_eo == -1) 608 return(NULL); 609 assert(m->pmatch[i].rm_so != -1); 610 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 611 if (len == 0) 612 return(NULL); 613 assert(stop - m->beginp >= len); 614 if (sp > stop - len) 615 return(NULL); /* not enough left to match */ 616 ssp = m->offp + m->pmatch[i].rm_so; 617 if (memcmp(sp, ssp, len) != 0) 618 return(NULL); 619 while (m->g->strip[ss] != SOP(O_BACK, i)) 620 ss++; 621 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 622 break; 623 case OQUEST_: /* to null or not */ 624 dp = backref(m, sp, stop, ss+1, stopst, lev); 625 if (dp != NULL) 626 return(dp); /* not */ 627 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 628 break; 629 case OPLUS_: 630 assert(m->lastpos != NULL); 631 assert(lev+1 <= m->g->nplus); 632 m->lastpos[lev+1] = sp; 633 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 634 break; 635 case O_PLUS: 636 if (sp == m->lastpos[lev]) /* last pass matched null */ 637 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 638 /* try another pass */ 639 m->lastpos[lev] = sp; 640 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 641 if (dp == NULL) 642 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 643 else 644 return(dp); 645 break; 646 case OCH_: /* find the right one, if any */ 647 ssub = ss + 1; 648 esub = ss + OPND(s) - 1; 649 assert(OP(m->g->strip[esub]) == OOR1); 650 for (;;) { /* find first matching branch */ 651 dp = backref(m, sp, stop, ssub, esub, lev); 652 if (dp != NULL) 653 return(dp); 654 /* that one missed, try next one */ 655 if (OP(m->g->strip[esub]) == O_CH) 656 return(NULL); /* there is none */ 657 esub++; 658 assert(OP(m->g->strip[esub]) == OOR2); 659 ssub = esub + 1; 660 esub += OPND(m->g->strip[esub]); 661 if (OP(m->g->strip[esub]) == OOR2) 662 esub--; 663 else 664 assert(OP(m->g->strip[esub]) == O_CH); 665 } 666 break; 667 case OLPAREN: /* must undo assignment if rest fails */ 668 i = OPND(s); 669 assert(0 < i && i <= m->g->nsub); 670 offsave = m->pmatch[i].rm_so; 671 m->pmatch[i].rm_so = sp - m->offp; 672 dp = backref(m, sp, stop, ss+1, stopst, lev); 673 if (dp != NULL) 674 return(dp); 675 m->pmatch[i].rm_so = offsave; 676 return(NULL); 677 break; 678 case ORPAREN: /* must undo assignment if rest fails */ 679 i = OPND(s); 680 assert(0 < i && i <= m->g->nsub); 681 offsave = m->pmatch[i].rm_eo; 682 m->pmatch[i].rm_eo = sp - m->offp; 683 dp = backref(m, sp, stop, ss+1, stopst, lev); 684 if (dp != NULL) 685 return(dp); 686 m->pmatch[i].rm_eo = offsave; 687 return(NULL); 688 break; 689 default: /* uh oh */ 690 assert(nope); 691 break; 692 } 693 694 /* "can't happen" */ 695 assert(nope); 696 /* NOTREACHED */ 697} 698 699/* 700 - fast - step through the string at top speed 701 == static char *fast(register struct match *m, char *start, \ 702 == char *stop, sopno startst, sopno stopst); 703 */ 704static char * /* where tentative match ended, or NULL */ 705fast(m, start, stop, startst, stopst) 706register struct match *m; 707char *start; 708char *stop; 709sopno startst; 710sopno stopst; 711{ 712 register states st = m->st; 713 register states fresh = m->fresh; 714 register states tmp = m->tmp; 715 register char *p = start; 716 register int c = (start == m->beginp) ? OUT : *(start-1); 717 register int lastc; /* previous c */ 718 register int flagch; 719 register int i; 720 register char *coldp; /* last p after which no match was underway */ 721 722 CLEAR(st); 723 SET1(st, startst); 724 st = step(m->g, startst, stopst, st, NOTHING, st); 725 ASSIGN(fresh, st); 726 SP("start", st, *p); 727 coldp = NULL; 728 for (;;) { 729 /* next character */ 730 lastc = c; 731 c = (p == m->endp) ? OUT : *p; 732 if (EQ(st, fresh)) 733 coldp = p; 734 735 /* is there an EOL and/or BOL between lastc and c? */ 736 flagch = '\0'; 737 i = 0; 738 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 739 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 740 flagch = BOL; 741 i = m->g->nbol; 742 } 743 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 744 (c == OUT && !(m->eflags®_NOTEOL)) ) { 745 flagch = (flagch == BOL) ? BOLEOL : EOL; 746 i += m->g->neol; 747 } 748 if (i != 0) { 749 for (; i > 0; i--) 750 st = step(m->g, startst, stopst, st, flagch, st); 751 SP("boleol", st, c); 752 } 753 754 /* how about a word boundary? */ 755 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 756 (c != OUT && ISWORD(c)) ) { 757 flagch = BOW; 758 } 759 if ( (lastc != OUT && ISWORD(lastc)) && 760 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 761 flagch = EOW; 762 } 763 if (flagch == BOW || flagch == EOW) { 764 st = step(m->g, startst, stopst, st, flagch, st); 765 SP("boweow", st, c); 766 } 767 768 /* are we done? */ 769 if (ISSET(st, stopst) || p == stop) 770 break; /* NOTE BREAK OUT */ 771 772 /* no, we must deal with this character */ 773 ASSIGN(tmp, st); 774 ASSIGN(st, fresh); 775 assert(c != OUT); 776 st = step(m->g, startst, stopst, tmp, c, st); 777 SP("aft", st, c); 778 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 779 p++; 780 } 781 782 assert(coldp != NULL); 783 m->coldp = coldp; 784 if (ISSET(st, stopst)) 785 return(p+1); 786 else 787 return(NULL); 788} 789 790/* 791 - slow - step through the string more deliberately 792 == static char *slow(register struct match *m, char *start, \ 793 == char *stop, sopno startst, sopno stopst); 794 */ 795static char * /* where it ended */ 796slow(m, start, stop, startst, stopst) 797register struct match *m; 798char *start; 799char *stop; 800sopno startst; 801sopno stopst; 802{ 803 register states st = m->st; 804 register states empty = m->empty; 805 register states tmp = m->tmp; 806 register char *p = start; 807 register int c = (start == m->beginp) ? OUT : *(start-1); 808 register int lastc; /* previous c */ 809 register int flagch; 810 register int i; 811 register char *matchp; /* last p at which a match ended */ 812 813 AT("slow", start, stop, startst, stopst); 814 CLEAR(st); 815 SET1(st, startst); 816 SP("sstart", st, *p); 817 st = step(m->g, startst, stopst, st, NOTHING, st); 818 matchp = NULL; 819 for (;;) { 820 /* next character */ 821 lastc = c; 822 c = (p == m->endp) ? OUT : *p; 823 824 /* is there an EOL and/or BOL between lastc and c? */ 825 flagch = '\0'; 826 i = 0; 827 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 828 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 829 flagch = BOL; 830 i = m->g->nbol; 831 } 832 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 833 (c == OUT && !(m->eflags®_NOTEOL)) ) { 834 flagch = (flagch == BOL) ? BOLEOL : EOL; 835 i += m->g->neol; 836 } 837 if (i != 0) { 838 for (; i > 0; i--) 839 st = step(m->g, startst, stopst, st, flagch, st); 840 SP("sboleol", st, c); 841 } 842 843 /* how about a word boundary? */ 844 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 845 (c != OUT && ISWORD(c)) ) { 846 flagch = BOW; 847 } 848 if ( (lastc != OUT && ISWORD(lastc)) && 849 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 850 flagch = EOW; 851 } 852 if (flagch == BOW || flagch == EOW) { 853 st = step(m->g, startst, stopst, st, flagch, st); 854 SP("sboweow", st, c); 855 } 856 857 /* are we done? */ 858 if (ISSET(st, stopst)) 859 matchp = p; 860 if (EQ(st, empty) || p == stop) 861 break; /* NOTE BREAK OUT */ 862 863 /* no, we must deal with this character */ 864 ASSIGN(tmp, st); 865 ASSIGN(st, empty); 866 assert(c != OUT); 867 st = step(m->g, startst, stopst, tmp, c, st); 868 SP("saft", st, c); 869 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 870 p++; 871 } 872 873 return(matchp); 874} 875 876 877/* 878 - step - map set of states reachable before char to set reachable after 879 == static states step(register struct re_guts *g, sopno start, sopno stop, \ 880 == register states bef, int ch, register states aft); 881 == #define BOL (OUT+1) 882 == #define EOL (BOL+1) 883 == #define BOLEOL (BOL+2) 884 == #define NOTHING (BOL+3) 885 == #define BOW (BOL+4) 886 == #define EOW (BOL+5) 887 == #define CODEMAX (BOL+5) // highest code used 888 == #define NONCHAR(c) ((c) > CHAR_MAX) 889 == #define NNONCHAR (CODEMAX-CHAR_MAX) 890 */ 891static states 892step(g, start, stop, bef, ch, aft) 893register struct re_guts *g; 894sopno start; /* start state within strip */ 895sopno stop; /* state after stop state within strip */ 896register states bef; /* states reachable before */ 897int ch; /* character or NONCHAR code */ 898register states aft; /* states already known reachable after */ 899{ 900 register cset *cs; 901 register sop s; 902 register sopno pc; 903 register onestate here; /* note, macros know this name */ 904 register sopno look; 905 register int i; 906 907 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 908 s = g->strip[pc]; 909 switch (OP(s)) { 910 case OEND: 911 assert(pc == stop-1); 912 break; 913 case OCHAR: 914 /* only characters can match */ 915 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 916 if (ch == (char)OPND(s)) 917 FWD(aft, bef, 1); 918 break; 919 case OBOL: 920 if (ch == BOL || ch == BOLEOL) 921 FWD(aft, bef, 1); 922 break; 923 case OEOL: 924 if (ch == EOL || ch == BOLEOL) 925 FWD(aft, bef, 1); 926 break; 927 case OBOW: 928 if (ch == BOW) 929 FWD(aft, bef, 1); 930 break; 931 case OEOW: 932 if (ch == EOW) 933 FWD(aft, bef, 1); 934 break; 935 case OANY: 936 if (!NONCHAR(ch)) 937 FWD(aft, bef, 1); 938 break; 939 case OANYOF: 940 cs = &g->sets[OPND(s)]; 941 if (!NONCHAR(ch) && CHIN(cs, ch)) 942 FWD(aft, bef, 1); 943 break; 944 case OBACK_: /* ignored here */ 945 case O_BACK: 946 FWD(aft, aft, 1); 947 break; 948 case OPLUS_: /* forward, this is just an empty */ 949 FWD(aft, aft, 1); 950 break; 951 case O_PLUS: /* both forward and back */ 952 FWD(aft, aft, 1); 953 i = ISSETBACK(aft, OPND(s)); 954 BACK(aft, aft, OPND(s)); 955 if (!i && ISSETBACK(aft, OPND(s))) { 956 /* oho, must reconsider loop body */ 957 pc -= OPND(s) + 1; 958 INIT(here, pc); 959 } 960 break; 961 case OQUEST_: /* two branches, both forward */ 962 FWD(aft, aft, 1); 963 FWD(aft, aft, OPND(s)); 964 break; 965 case O_QUEST: /* just an empty */ 966 FWD(aft, aft, 1); 967 break; 968 case OLPAREN: /* not significant here */ 969 case ORPAREN: 970 FWD(aft, aft, 1); 971 break; 972 case OCH_: /* mark the first two branches */ 973 FWD(aft, aft, 1); 974 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 975 FWD(aft, aft, OPND(s)); 976 break; 977 case OOR1: /* done a branch, find the O_CH */ 978 if (ISSTATEIN(aft, here)) { 979 for (look = 1; 980 OP(s = g->strip[pc+look]) != O_CH; 981 look += OPND(s)) 982 assert(OP(s) == OOR2); 983 FWD(aft, aft, look); 984 } 985 break; 986 case OOR2: /* propagate OCH_'s marking */ 987 FWD(aft, aft, 1); 988 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 989 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 990 FWD(aft, aft, OPND(s)); 991 } 992 break; 993 case O_CH: /* just empty */ 994 FWD(aft, aft, 1); 995 break; 996 default: /* ooooops... */ 997 assert(nope); 998 break; 999 } 1000 } 1001 1002 return(aft); 1003} 1004 1005#ifdef REDEBUG 1006/* 1007 - print - print a set of states 1008 == #ifdef REDEBUG 1009 == static void print(struct match *m, char *caption, states st, \ 1010 == int ch, FILE *d); 1011 == #endif 1012 */ 1013static void 1014print(m, caption, st, ch, d) 1015struct match *m; 1016char *caption; 1017states st; 1018int ch; 1019FILE *d; 1020{ 1021 register struct re_guts *g = m->g; 1022 register int i; 1023 register int first = 1; 1024 1025 if (!(m->eflags®_TRACE)) 1026 return; 1027 1028 (void)fprintf(d, "%s", caption); 1029 if (ch != '\0') 1030 (void)fprintf(d, " %s", pchar(ch)); 1031 for (i = 0; i < g->nstates; i++) 1032 if (ISSET(st, i)) { 1033 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1034 first = 0; 1035 } 1036 (void)fprintf(d, "\n"); 1037} 1038 1039/* 1040 - at - print current situation 1041 == #ifdef REDEBUG 1042 == static void at(struct match *m, char *title, char *start, char *stop, \ 1043 == sopno startst, sopno stopst); 1044 == #endif 1045 */ 1046static void 1047at(m, title, start, stop, startst, stopst) 1048struct match *m; 1049char *title; 1050char *start; 1051char *stop; 1052sopno startst; 1053sopno stopst; 1054{ 1055 if (!(m->eflags®_TRACE)) 1056 return; 1057 1058 (void)printf("%s %s-", title, pchar(*start)); 1059 (void)printf("%s ", pchar(*stop)); 1060 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 1061} 1062 1063#ifndef PCHARDONE 1064#define PCHARDONE /* never again */ 1065/* 1066 - pchar - make a character printable 1067 == #ifdef REDEBUG 1068 == static char *pchar(int ch); 1069 == #endif 1070 * 1071 * Is this identical to regchar() over in debug.c? Well, yes. But a 1072 * duplicate here avoids having a debugging-capable regexec.o tied to 1073 * a matching debug.o, and this is convenient. It all disappears in 1074 * the non-debug compilation anyway, so it doesn't matter much. 1075 */ 1076static char * /* -> representation */ 1077pchar(ch) 1078int ch; 1079{ 1080 static char pbuf[10]; 1081 1082 if (isprint(ch) || ch == ' ') 1083 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1084 else 1085 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1086 return(pbuf); 1087} 1088#endif 1089#endif 1090 1091#undef matcher 1092#undef fast 1093#undef slow 1094#undef dissect 1095#undef backref 1096#undef step 1097#undef print 1098#undef at 1099#undef match 1100#undef nope 1101