engine.c revision 1.12
1/* $OpenBSD: engine.c,v 1.12 2004/11/29 16:49:50 otto Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38#if defined(SNAMES) && defined(LIBC_SCCS) && !defined(lint) 39static char enginercsid[] = "$OpenBSD: engine.c,v 1.12 2004/11/29 16:49:50 otto Exp $"; 40#endif /* SNAMES and LIBC_SCCS and not lint */ 41 42/* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49#ifdef SNAMES 50#define matcher smatcher 51#define fast sfast 52#define slow sslow 53#define dissect sdissect 54#define backref sbackref 55#define step sstep 56#define print sprint 57#define at sat 58#define match smat 59#define nope snope 60#endif 61#ifdef LNAMES 62#define matcher lmatcher 63#define fast lfast 64#define slow lslow 65#define dissect ldissect 66#define backref lbackref 67#define step lstep 68#define print lprint 69#define at lat 70#define match lmat 71#define nope lnope 72#endif 73 74/* another structure passed up and down to avoid zillions of parameters */ 75struct match { 76 struct re_guts *g; 77 int eflags; 78 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 79 char *offp; /* offsets work from here */ 80 char *beginp; /* start of string -- virtual NUL precedes */ 81 char *endp; /* end of string -- virtual NUL here */ 82 char *coldp; /* can be no match starting before here */ 83 char **lastpos; /* [nplus+1] */ 84 STATEVARS; 85 states st; /* current states */ 86 states fresh; /* states for a fresh start */ 87 states tmp; /* temporary */ 88 states empty; /* empty set of states */ 89}; 90 91/* ========= begin header generated by ./mkh ========= */ 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* === engine.c === */ 97static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 98static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 99static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev, int); 100static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 101static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 102static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); 103#define MAX_RECURSION 100 104#define BOL (OUT+1) 105#define EOL (BOL+1) 106#define BOLEOL (BOL+2) 107#define NOTHING (BOL+3) 108#define BOW (BOL+4) 109#define EOW (BOL+5) 110#define CODEMAX (BOL+5) /* highest code used */ 111#define NONCHAR(c) ((c) > CHAR_MAX) 112#define NNONCHAR (CODEMAX-CHAR_MAX) 113#ifdef REDEBUG 114static void print(struct match *m, char *caption, states st, int ch, FILE *d); 115#endif 116#ifdef REDEBUG 117static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 118#endif 119#ifdef REDEBUG 120static char *pchar(int ch); 121#endif 122 123#ifdef __cplusplus 124} 125#endif 126/* ========= end header generated by ./mkh ========= */ 127 128#ifdef REDEBUG 129#define SP(t, s, c) print(m, t, s, c, stdout) 130#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 131#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 132static int nope = 0; 133#else 134#define SP(t, s, c) /* nothing */ 135#define AT(t, p1, p2, s1, s2) /* nothing */ 136#define NOTE(s) /* nothing */ 137#endif 138 139/* 140 - matcher - the actual matching engine 141 == static int matcher(struct re_guts *g, char *string, \ 142 == size_t nmatch, regmatch_t pmatch[], int eflags); 143 */ 144static int /* 0 success, REG_NOMATCH failure */ 145matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 146 int eflags) 147{ 148 char *endp; 149 int i; 150 struct match mv; 151 struct match *m = &mv; 152 char *dp; 153 const sopno gf = g->firststate+1; /* +1 for OEND */ 154 const sopno gl = g->laststate; 155 char *start; 156 char *stop; 157 158 /* simplify the situation where possible */ 159 if (g->cflags®_NOSUB) 160 nmatch = 0; 161 if (eflags®_STARTEND) { 162 start = string + pmatch[0].rm_so; 163 stop = string + pmatch[0].rm_eo; 164 } else { 165 start = string; 166 stop = start + strlen(start); 167 } 168 if (stop < start) 169 return(REG_INVARG); 170 171 /* prescreening; this does wonders for this rather slow code */ 172 if (g->must != NULL) { 173 for (dp = start; dp < stop; dp++) 174 if (*dp == g->must[0] && stop - dp >= g->mlen && 175 memcmp(dp, g->must, (size_t)g->mlen) == 0) 176 break; 177 if (dp == stop) /* we didn't find g->must */ 178 return(REG_NOMATCH); 179 } 180 181 /* match struct setup */ 182 m->g = g; 183 m->eflags = eflags; 184 m->pmatch = NULL; 185 m->lastpos = NULL; 186 m->offp = string; 187 m->beginp = start; 188 m->endp = stop; 189 STATESETUP(m, 4); 190 SETUP(m->st); 191 SETUP(m->fresh); 192 SETUP(m->tmp); 193 SETUP(m->empty); 194 CLEAR(m->empty); 195 196 /* this loop does only one repetition except for backrefs */ 197 for (;;) { 198 endp = fast(m, start, stop, gf, gl); 199 if (endp == NULL) { /* a miss */ 200 STATETEARDOWN(m); 201 return(REG_NOMATCH); 202 } 203 if (nmatch == 0 && !g->backrefs) 204 break; /* no further info needed */ 205 206 /* where? */ 207 assert(m->coldp != NULL); 208 for (;;) { 209 NOTE("finding start"); 210 endp = slow(m, m->coldp, stop, gf, gl); 211 if (endp != NULL) 212 break; 213 assert(m->coldp < m->endp); 214 m->coldp++; 215 } 216 if (nmatch == 1 && !g->backrefs) 217 break; /* no further info needed */ 218 219 /* oh my, he wants the subexpressions... */ 220 if (m->pmatch == NULL) 221 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 222 sizeof(regmatch_t)); 223 if (m->pmatch == NULL) { 224 STATETEARDOWN(m); 225 return(REG_ESPACE); 226 } 227 for (i = 1; i <= m->g->nsub; i++) 228 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 229 if (!g->backrefs && !(m->eflags®_BACKR)) { 230 NOTE("dissecting"); 231 dp = dissect(m, m->coldp, endp, gf, gl); 232 } else { 233 if (g->nplus > 0 && m->lastpos == NULL) 234 m->lastpos = (char **)malloc((g->nplus+1) * 235 sizeof(char *)); 236 if (g->nplus > 0 && m->lastpos == NULL) { 237 free(m->pmatch); 238 STATETEARDOWN(m); 239 return(REG_ESPACE); 240 } 241 NOTE("backref dissect"); 242 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 243 } 244 if (dp != NULL) 245 break; 246 247 /* uh-oh... we couldn't find a subexpression-level match */ 248 assert(g->backrefs); /* must be back references doing it */ 249 assert(g->nplus == 0 || m->lastpos != NULL); 250 for (;;) { 251 if (dp != NULL || endp <= m->coldp) 252 break; /* defeat */ 253 NOTE("backoff"); 254 endp = slow(m, m->coldp, endp-1, gf, gl); 255 if (endp == NULL) 256 break; /* defeat */ 257 /* try it on a shorter possibility */ 258#ifndef NDEBUG 259 for (i = 1; i <= m->g->nsub; i++) { 260 assert(m->pmatch[i].rm_so == -1); 261 assert(m->pmatch[i].rm_eo == -1); 262 } 263#endif 264 NOTE("backoff dissect"); 265 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 266 } 267 assert(dp == NULL || dp == endp); 268 if (dp != NULL) /* found a shorter one */ 269 break; 270 271 /* despite initial appearances, there is no match here */ 272 NOTE("false alarm"); 273 if (m->coldp == stop) 274 break; 275 start = m->coldp + 1; /* recycle starting later */ 276 } 277 278 /* fill in the details if requested */ 279 if (nmatch > 0) { 280 pmatch[0].rm_so = m->coldp - m->offp; 281 pmatch[0].rm_eo = endp - m->offp; 282 } 283 if (nmatch > 1) { 284 assert(m->pmatch != NULL); 285 for (i = 1; i < nmatch; i++) 286 if (i <= m->g->nsub) 287 pmatch[i] = m->pmatch[i]; 288 else { 289 pmatch[i].rm_so = -1; 290 pmatch[i].rm_eo = -1; 291 } 292 } 293 294 if (m->pmatch != NULL) 295 free((char *)m->pmatch); 296 if (m->lastpos != NULL) 297 free((char *)m->lastpos); 298 STATETEARDOWN(m); 299 return(0); 300} 301 302/* 303 - dissect - figure out what matched what, no back references 304 == static char *dissect(struct match *m, char *start, \ 305 == char *stop, sopno startst, sopno stopst); 306 */ 307static char * /* == stop (success) always */ 308dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 309{ 310 int i; 311 sopno ss; /* start sop of current subRE */ 312 sopno es; /* end sop of current subRE */ 313 char *sp; /* start of string matched by it */ 314 char *stp; /* string matched by it cannot pass here */ 315 char *rest; /* start of rest of string */ 316 char *tail; /* string unmatched by rest of RE */ 317 sopno ssub; /* start sop of subsubRE */ 318 sopno esub; /* end sop of subsubRE */ 319 char *ssp; /* start of string matched by subsubRE */ 320 char *sep; /* end of string matched by subsubRE */ 321 char *oldssp; /* previous ssp */ 322 char *dp; 323 324 AT("diss", start, stop, startst, stopst); 325 sp = start; 326 for (ss = startst; ss < stopst; ss = es) { 327 /* identify end of subRE */ 328 es = ss; 329 switch (OP(m->g->strip[es])) { 330 case OPLUS_: 331 case OQUEST_: 332 es += OPND(m->g->strip[es]); 333 break; 334 case OCH_: 335 while (OP(m->g->strip[es]) != O_CH) 336 es += OPND(m->g->strip[es]); 337 break; 338 } 339 es++; 340 341 /* figure out what it matched */ 342 switch (OP(m->g->strip[ss])) { 343 case OEND: 344 assert(nope); 345 break; 346 case OCHAR: 347 sp++; 348 break; 349 case OBOL: 350 case OEOL: 351 case OBOW: 352 case OEOW: 353 break; 354 case OANY: 355 case OANYOF: 356 sp++; 357 break; 358 case OBACK_: 359 case O_BACK: 360 assert(nope); 361 break; 362 /* cases where length of match is hard to find */ 363 case OQUEST_: 364 stp = stop; 365 for (;;) { 366 /* how long could this one be? */ 367 rest = slow(m, sp, stp, ss, es); 368 assert(rest != NULL); /* it did match */ 369 /* could the rest match the rest? */ 370 tail = slow(m, rest, stop, es, stopst); 371 if (tail == stop) 372 break; /* yes! */ 373 /* no -- try a shorter match for this one */ 374 stp = rest - 1; 375 assert(stp >= sp); /* it did work */ 376 } 377 ssub = ss + 1; 378 esub = es - 1; 379 /* did innards match? */ 380 if (slow(m, sp, rest, ssub, esub) != NULL) { 381 dp = dissect(m, sp, rest, ssub, esub); 382 assert(dp == rest); 383 } else /* no */ 384 assert(sp == rest); 385 sp = rest; 386 break; 387 case OPLUS_: 388 stp = stop; 389 for (;;) { 390 /* how long could this one be? */ 391 rest = slow(m, sp, stp, ss, es); 392 assert(rest != NULL); /* it did match */ 393 /* could the rest match the rest? */ 394 tail = slow(m, rest, stop, es, stopst); 395 if (tail == stop) 396 break; /* yes! */ 397 /* no -- try a shorter match for this one */ 398 stp = rest - 1; 399 assert(stp >= sp); /* it did work */ 400 } 401 ssub = ss + 1; 402 esub = es - 1; 403 ssp = sp; 404 oldssp = ssp; 405 for (;;) { /* find last match of innards */ 406 sep = slow(m, ssp, rest, ssub, esub); 407 if (sep == NULL || sep == ssp) 408 break; /* failed or matched null */ 409 oldssp = ssp; /* on to next try */ 410 ssp = sep; 411 } 412 if (sep == NULL) { 413 /* last successful match */ 414 sep = ssp; 415 ssp = oldssp; 416 } 417 assert(sep == rest); /* must exhaust substring */ 418 assert(slow(m, ssp, sep, ssub, esub) == rest); 419 dp = dissect(m, ssp, sep, ssub, esub); 420 assert(dp == sep); 421 sp = rest; 422 break; 423 case OCH_: 424 stp = stop; 425 for (;;) { 426 /* how long could this one be? */ 427 rest = slow(m, sp, stp, ss, es); 428 assert(rest != NULL); /* it did match */ 429 /* could the rest match the rest? */ 430 tail = slow(m, rest, stop, es, stopst); 431 if (tail == stop) 432 break; /* yes! */ 433 /* no -- try a shorter match for this one */ 434 stp = rest - 1; 435 assert(stp >= sp); /* it did work */ 436 } 437 ssub = ss + 1; 438 esub = ss + OPND(m->g->strip[ss]) - 1; 439 assert(OP(m->g->strip[esub]) == OOR1); 440 for (;;) { /* find first matching branch */ 441 if (slow(m, sp, rest, ssub, esub) == rest) 442 break; /* it matched all of it */ 443 /* that one missed, try next one */ 444 assert(OP(m->g->strip[esub]) == OOR1); 445 esub++; 446 assert(OP(m->g->strip[esub]) == OOR2); 447 ssub = esub + 1; 448 esub += OPND(m->g->strip[esub]); 449 if (OP(m->g->strip[esub]) == OOR2) 450 esub--; 451 else 452 assert(OP(m->g->strip[esub]) == O_CH); 453 } 454 dp = dissect(m, sp, rest, ssub, esub); 455 assert(dp == rest); 456 sp = rest; 457 break; 458 case O_PLUS: 459 case O_QUEST: 460 case OOR1: 461 case OOR2: 462 case O_CH: 463 assert(nope); 464 break; 465 case OLPAREN: 466 i = OPND(m->g->strip[ss]); 467 assert(0 < i && i <= m->g->nsub); 468 m->pmatch[i].rm_so = sp - m->offp; 469 break; 470 case ORPAREN: 471 i = OPND(m->g->strip[ss]); 472 assert(0 < i && i <= m->g->nsub); 473 m->pmatch[i].rm_eo = sp - m->offp; 474 break; 475 default: /* uh oh */ 476 assert(nope); 477 break; 478 } 479 } 480 481 assert(sp == stop); 482 return(sp); 483} 484 485/* 486 - backref - figure out what matched what, figuring in back references 487 == static char *backref(struct match *m, char *start, \ 488 == char *stop, sopno startst, sopno stopst, sopno lev); 489 */ 490static char * /* == stop (success) or NULL (failure) */ 491backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 492 sopno lev, int rec) /* PLUS nesting level */ 493{ 494 int i; 495 sopno ss; /* start sop of current subRE */ 496 char *sp; /* start of string matched by it */ 497 sopno ssub; /* start sop of subsubRE */ 498 sopno esub; /* end sop of subsubRE */ 499 char *ssp; /* start of string matched by subsubRE */ 500 char *dp; 501 size_t len; 502 int hard; 503 sop s; 504 regoff_t offsave; 505 cset *cs; 506 507 AT("back", start, stop, startst, stopst); 508 sp = start; 509 510 /* get as far as we can with easy stuff */ 511 hard = 0; 512 for (ss = startst; !hard && ss < stopst; ss++) 513 switch (OP(s = m->g->strip[ss])) { 514 case OCHAR: 515 if (sp == stop || *sp++ != (char)OPND(s)) 516 return(NULL); 517 break; 518 case OANY: 519 if (sp == stop) 520 return(NULL); 521 sp++; 522 break; 523 case OANYOF: 524 cs = &m->g->sets[OPND(s)]; 525 if (sp == stop || !CHIN(cs, *sp++)) 526 return(NULL); 527 break; 528 case OBOL: 529 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 530 (sp < m->endp && *(sp-1) == '\n' && 531 (m->g->cflags®_NEWLINE)) ) 532 { /* yes */ } 533 else 534 return(NULL); 535 break; 536 case OEOL: 537 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 538 (sp < m->endp && *sp == '\n' && 539 (m->g->cflags®_NEWLINE)) ) 540 { /* yes */ } 541 else 542 return(NULL); 543 break; 544 case OBOW: 545 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 546 (sp < m->endp && *(sp-1) == '\n' && 547 (m->g->cflags®_NEWLINE)) || 548 (sp > m->beginp && 549 !ISWORD(*(sp-1))) ) && 550 (sp < m->endp && ISWORD(*sp)) ) 551 { /* yes */ } 552 else 553 return(NULL); 554 break; 555 case OEOW: 556 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 557 (sp < m->endp && *sp == '\n' && 558 (m->g->cflags®_NEWLINE)) || 559 (sp < m->endp && !ISWORD(*sp)) ) && 560 (sp > m->beginp && ISWORD(*(sp-1))) ) 561 { /* yes */ } 562 else 563 return(NULL); 564 break; 565 case O_QUEST: 566 break; 567 case OOR1: /* matches null but needs to skip */ 568 ss++; 569 s = m->g->strip[ss]; 570 do { 571 assert(OP(s) == OOR2); 572 ss += OPND(s); 573 } while (OP(s = m->g->strip[ss]) != O_CH); 574 /* note that the ss++ gets us past the O_CH */ 575 break; 576 default: /* have to make a choice */ 577 hard = 1; 578 break; 579 } 580 if (!hard) { /* that was it! */ 581 if (sp != stop) 582 return(NULL); 583 return(sp); 584 } 585 ss--; /* adjust for the for's final increment */ 586 587 /* the hard stuff */ 588 AT("hard", sp, stop, ss, stopst); 589 s = m->g->strip[ss]; 590 switch (OP(s)) { 591 case OBACK_: /* the vilest depths */ 592 i = OPND(s); 593 assert(0 < i && i <= m->g->nsub); 594 if (m->pmatch[i].rm_eo == -1) 595 return(NULL); 596 assert(m->pmatch[i].rm_so != -1); 597 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 598 if (len == 0 && rec++ > MAX_RECURSION) 599 return(NULL); 600 assert(stop - m->beginp >= len); 601 if (sp > stop - len) 602 return(NULL); /* not enough left to match */ 603 ssp = m->offp + m->pmatch[i].rm_so; 604 if (memcmp(sp, ssp, len) != 0) 605 return(NULL); 606 while (m->g->strip[ss] != SOP(O_BACK, i)) 607 ss++; 608 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 609 break; 610 case OQUEST_: /* to null or not */ 611 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 612 if (dp != NULL) 613 return(dp); /* not */ 614 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 615 break; 616 case OPLUS_: 617 assert(m->lastpos != NULL); 618 assert(lev+1 <= m->g->nplus); 619 m->lastpos[lev+1] = sp; 620 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 621 break; 622 case O_PLUS: 623 if (sp == m->lastpos[lev]) /* last pass matched null */ 624 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 625 /* try another pass */ 626 m->lastpos[lev] = sp; 627 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 628 if (dp == NULL) 629 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 630 else 631 return(dp); 632 break; 633 case OCH_: /* find the right one, if any */ 634 ssub = ss + 1; 635 esub = ss + OPND(s) - 1; 636 assert(OP(m->g->strip[esub]) == OOR1); 637 for (;;) { /* find first matching branch */ 638 dp = backref(m, sp, stop, ssub, esub, lev, rec); 639 if (dp != NULL) 640 return(dp); 641 /* that one missed, try next one */ 642 if (OP(m->g->strip[esub]) == O_CH) 643 return(NULL); /* there is none */ 644 esub++; 645 assert(OP(m->g->strip[esub]) == OOR2); 646 ssub = esub + 1; 647 esub += OPND(m->g->strip[esub]); 648 if (OP(m->g->strip[esub]) == OOR2) 649 esub--; 650 else 651 assert(OP(m->g->strip[esub]) == O_CH); 652 } 653 break; 654 case OLPAREN: /* must undo assignment if rest fails */ 655 i = OPND(s); 656 assert(0 < i && i <= m->g->nsub); 657 offsave = m->pmatch[i].rm_so; 658 m->pmatch[i].rm_so = sp - m->offp; 659 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 660 if (dp != NULL) 661 return(dp); 662 m->pmatch[i].rm_so = offsave; 663 return(NULL); 664 break; 665 case ORPAREN: /* must undo assignment if rest fails */ 666 i = OPND(s); 667 assert(0 < i && i <= m->g->nsub); 668 offsave = m->pmatch[i].rm_eo; 669 m->pmatch[i].rm_eo = sp - m->offp; 670 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 671 if (dp != NULL) 672 return(dp); 673 m->pmatch[i].rm_eo = offsave; 674 return(NULL); 675 break; 676 default: /* uh oh */ 677 assert(nope); 678 break; 679 } 680 681 /* "can't happen" */ 682 assert(nope); 683 /* NOTREACHED */ 684} 685 686/* 687 - fast - step through the string at top speed 688 == static char *fast(struct match *m, char *start, \ 689 == char *stop, sopno startst, sopno stopst); 690 */ 691static char * /* where tentative match ended, or NULL */ 692fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 693{ 694 states st = m->st; 695 states fresh = m->fresh; 696 states tmp = m->tmp; 697 char *p = start; 698 int c = (start == m->beginp) ? OUT : *(start-1); 699 int lastc; /* previous c */ 700 int flagch; 701 int i; 702 char *coldp; /* last p after which no match was underway */ 703 704 CLEAR(st); 705 SET1(st, startst); 706 st = step(m->g, startst, stopst, st, NOTHING, st); 707 ASSIGN(fresh, st); 708 SP("start", st, *p); 709 coldp = NULL; 710 for (;;) { 711 /* next character */ 712 lastc = c; 713 c = (p == m->endp) ? OUT : *p; 714 if (EQ(st, fresh)) 715 coldp = p; 716 717 /* is there an EOL and/or BOL between lastc and c? */ 718 flagch = '\0'; 719 i = 0; 720 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 721 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 722 flagch = BOL; 723 i = m->g->nbol; 724 } 725 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 726 (c == OUT && !(m->eflags®_NOTEOL)) ) { 727 flagch = (flagch == BOL) ? BOLEOL : EOL; 728 i += m->g->neol; 729 } 730 if (i != 0) { 731 for (; i > 0; i--) 732 st = step(m->g, startst, stopst, st, flagch, st); 733 SP("boleol", st, c); 734 } 735 736 /* how about a word boundary? */ 737 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 738 (c != OUT && ISWORD(c)) ) { 739 flagch = BOW; 740 } 741 if ( (lastc != OUT && ISWORD(lastc)) && 742 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 743 flagch = EOW; 744 } 745 if (flagch == BOW || flagch == EOW) { 746 st = step(m->g, startst, stopst, st, flagch, st); 747 SP("boweow", st, c); 748 } 749 750 /* are we done? */ 751 if (ISSET(st, stopst) || p == stop) 752 break; /* NOTE BREAK OUT */ 753 754 /* no, we must deal with this character */ 755 ASSIGN(tmp, st); 756 ASSIGN(st, fresh); 757 assert(c != OUT); 758 st = step(m->g, startst, stopst, tmp, c, st); 759 SP("aft", st, c); 760 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 761 p++; 762 } 763 764 assert(coldp != NULL); 765 m->coldp = coldp; 766 if (ISSET(st, stopst)) 767 return(p+1); 768 else 769 return(NULL); 770} 771 772/* 773 - slow - step through the string more deliberately 774 == static char *slow(struct match *m, char *start, \ 775 == char *stop, sopno startst, sopno stopst); 776 */ 777static char * /* where it ended */ 778slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 779{ 780 states st = m->st; 781 states empty = m->empty; 782 states tmp = m->tmp; 783 char *p = start; 784 int c = (start == m->beginp) ? OUT : *(start-1); 785 int lastc; /* previous c */ 786 int flagch; 787 int i; 788 char *matchp; /* last p at which a match ended */ 789 790 AT("slow", start, stop, startst, stopst); 791 CLEAR(st); 792 SET1(st, startst); 793 SP("sstart", st, *p); 794 st = step(m->g, startst, stopst, st, NOTHING, st); 795 matchp = NULL; 796 for (;;) { 797 /* next character */ 798 lastc = c; 799 c = (p == m->endp) ? OUT : *p; 800 801 /* is there an EOL and/or BOL between lastc and c? */ 802 flagch = '\0'; 803 i = 0; 804 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 805 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 806 flagch = BOL; 807 i = m->g->nbol; 808 } 809 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 810 (c == OUT && !(m->eflags®_NOTEOL)) ) { 811 flagch = (flagch == BOL) ? BOLEOL : EOL; 812 i += m->g->neol; 813 } 814 if (i != 0) { 815 for (; i > 0; i--) 816 st = step(m->g, startst, stopst, st, flagch, st); 817 SP("sboleol", st, c); 818 } 819 820 /* how about a word boundary? */ 821 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 822 (c != OUT && ISWORD(c)) ) { 823 flagch = BOW; 824 } 825 if ( (lastc != OUT && ISWORD(lastc)) && 826 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 827 flagch = EOW; 828 } 829 if (flagch == BOW || flagch == EOW) { 830 st = step(m->g, startst, stopst, st, flagch, st); 831 SP("sboweow", st, c); 832 } 833 834 /* are we done? */ 835 if (ISSET(st, stopst)) 836 matchp = p; 837 if (EQ(st, empty) || p == stop) 838 break; /* NOTE BREAK OUT */ 839 840 /* no, we must deal with this character */ 841 ASSIGN(tmp, st); 842 ASSIGN(st, empty); 843 assert(c != OUT); 844 st = step(m->g, startst, stopst, tmp, c, st); 845 SP("saft", st, c); 846 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 847 p++; 848 } 849 850 return(matchp); 851} 852 853 854/* 855 - step - map set of states reachable before char to set reachable after 856 == static states step(struct re_guts *g, sopno start, sopno stop, \ 857 == states bef, int ch, states aft); 858 == #define BOL (OUT+1) 859 == #define EOL (BOL+1) 860 == #define BOLEOL (BOL+2) 861 == #define NOTHING (BOL+3) 862 == #define BOW (BOL+4) 863 == #define EOW (BOL+5) 864 == #define CODEMAX (BOL+5) // highest code used 865 == #define NONCHAR(c) ((c) > CHAR_MAX) 866 == #define NNONCHAR (CODEMAX-CHAR_MAX) 867 */ 868static states 869step(struct re_guts *g, 870 sopno start, /* start state within strip */ 871 sopno stop, /* state after stop state within strip */ 872 states bef, /* states reachable before */ 873 int ch, /* character or NONCHAR code */ 874 states aft) /* states already known reachable after */ 875{ 876 cset *cs; 877 sop s; 878 sopno pc; 879 onestate here; /* note, macros know this name */ 880 sopno look; 881 int i; 882 883 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 884 s = g->strip[pc]; 885 switch (OP(s)) { 886 case OEND: 887 assert(pc == stop-1); 888 break; 889 case OCHAR: 890 /* only characters can match */ 891 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 892 if (ch == (char)OPND(s)) 893 FWD(aft, bef, 1); 894 break; 895 case OBOL: 896 if (ch == BOL || ch == BOLEOL) 897 FWD(aft, bef, 1); 898 break; 899 case OEOL: 900 if (ch == EOL || ch == BOLEOL) 901 FWD(aft, bef, 1); 902 break; 903 case OBOW: 904 if (ch == BOW) 905 FWD(aft, bef, 1); 906 break; 907 case OEOW: 908 if (ch == EOW) 909 FWD(aft, bef, 1); 910 break; 911 case OANY: 912 if (!NONCHAR(ch)) 913 FWD(aft, bef, 1); 914 break; 915 case OANYOF: 916 cs = &g->sets[OPND(s)]; 917 if (!NONCHAR(ch) && CHIN(cs, ch)) 918 FWD(aft, bef, 1); 919 break; 920 case OBACK_: /* ignored here */ 921 case O_BACK: 922 FWD(aft, aft, 1); 923 break; 924 case OPLUS_: /* forward, this is just an empty */ 925 FWD(aft, aft, 1); 926 break; 927 case O_PLUS: /* both forward and back */ 928 FWD(aft, aft, 1); 929 i = ISSETBACK(aft, OPND(s)); 930 BACK(aft, aft, OPND(s)); 931 if (!i && ISSETBACK(aft, OPND(s))) { 932 /* oho, must reconsider loop body */ 933 pc -= OPND(s) + 1; 934 INIT(here, pc); 935 } 936 break; 937 case OQUEST_: /* two branches, both forward */ 938 FWD(aft, aft, 1); 939 FWD(aft, aft, OPND(s)); 940 break; 941 case O_QUEST: /* just an empty */ 942 FWD(aft, aft, 1); 943 break; 944 case OLPAREN: /* not significant here */ 945 case ORPAREN: 946 FWD(aft, aft, 1); 947 break; 948 case OCH_: /* mark the first two branches */ 949 FWD(aft, aft, 1); 950 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 951 FWD(aft, aft, OPND(s)); 952 break; 953 case OOR1: /* done a branch, find the O_CH */ 954 if (ISSTATEIN(aft, here)) { 955 for (look = 1; 956 OP(s = g->strip[pc+look]) != O_CH; 957 look += OPND(s)) 958 assert(OP(s) == OOR2); 959 FWD(aft, aft, look); 960 } 961 break; 962 case OOR2: /* propagate OCH_'s marking */ 963 FWD(aft, aft, 1); 964 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 965 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 966 FWD(aft, aft, OPND(s)); 967 } 968 break; 969 case O_CH: /* just empty */ 970 FWD(aft, aft, 1); 971 break; 972 default: /* ooooops... */ 973 assert(nope); 974 break; 975 } 976 } 977 978 return(aft); 979} 980 981#ifdef REDEBUG 982/* 983 - print - print a set of states 984 == #ifdef REDEBUG 985 == static void print(struct match *m, char *caption, states st, \ 986 == int ch, FILE *d); 987 == #endif 988 */ 989static void 990print(struct match *m, char *caption, states st, int ch, FILE *d) 991{ 992 struct re_guts *g = m->g; 993 int i; 994 int first = 1; 995 996 if (!(m->eflags®_TRACE)) 997 return; 998 999 (void)fprintf(d, "%s", caption); 1000 if (ch != '\0') 1001 (void)fprintf(d, " %s", pchar(ch)); 1002 for (i = 0; i < g->nstates; i++) 1003 if (ISSET(st, i)) { 1004 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1005 first = 0; 1006 } 1007 (void)fprintf(d, "\n"); 1008} 1009 1010/* 1011 - at - print current situation 1012 == #ifdef REDEBUG 1013 == static void at(struct match *m, char *title, char *start, char *stop, \ 1014 == sopno startst, sopno stopst); 1015 == #endif 1016 */ 1017static void 1018at(struct match *m, char *title, char *start, char *stop, sopno startst, 1019 sopno stopst) 1020{ 1021 if (!(m->eflags®_TRACE)) 1022 return; 1023 1024 (void)printf("%s %s-", title, pchar(*start)); 1025 (void)printf("%s ", pchar(*stop)); 1026 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 1027} 1028 1029#ifndef PCHARDONE 1030#define PCHARDONE /* never again */ 1031/* 1032 - pchar - make a character printable 1033 == #ifdef REDEBUG 1034 == static char *pchar(int ch); 1035 == #endif 1036 * 1037 * Is this identical to regchar() over in debug.c? Well, yes. But a 1038 * duplicate here avoids having a debugging-capable regexec.o tied to 1039 * a matching debug.o, and this is convenient. It all disappears in 1040 * the non-debug compilation anyway, so it doesn't matter much. 1041 */ 1042static char * /* -> representation */ 1043pchar(int ch) 1044{ 1045 static char pbuf[10]; 1046 1047 if (isprint(ch) || ch == ' ') 1048 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1049 else 1050 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1051 return(pbuf); 1052} 1053#endif 1054#endif 1055 1056#undef matcher 1057#undef fast 1058#undef slow 1059#undef dissect 1060#undef backref 1061#undef step 1062#undef print 1063#undef at 1064#undef match 1065#undef nope 1066