engine.c revision 1.11
1/* $OpenBSD: engine.c,v 1.11 2004/10/17 17:58:54 otto Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38#if defined(SNAMES) && defined(LIBC_SCCS) && !defined(lint) 39static char enginercsid[] = "$OpenBSD: engine.c,v 1.11 2004/10/17 17:58:54 otto Exp $"; 40#endif /* SNAMES and LIBC_SCCS and not lint */ 41 42/* 43 * The matching engine and friends. This file is #included by regexec.c 44 * after suitable #defines of a variety of macros used herein, so that 45 * different state representations can be used without duplicating masses 46 * of code. 47 */ 48 49#ifdef SNAMES 50#define matcher smatcher 51#define fast sfast 52#define slow sslow 53#define dissect sdissect 54#define backref sbackref 55#define step sstep 56#define print sprint 57#define at sat 58#define match smat 59#define nope snope 60#endif 61#ifdef LNAMES 62#define matcher lmatcher 63#define fast lfast 64#define slow lslow 65#define dissect ldissect 66#define backref lbackref 67#define step lstep 68#define print lprint 69#define at lat 70#define match lmat 71#define nope lnope 72#endif 73 74/* another structure passed up and down to avoid zillions of parameters */ 75struct match { 76 struct re_guts *g; 77 int eflags; 78 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 79 char *offp; /* offsets work from here */ 80 char *beginp; /* start of string -- virtual NUL precedes */ 81 char *endp; /* end of string -- virtual NUL here */ 82 char *coldp; /* can be no match starting before here */ 83 char **lastpos; /* [nplus+1] */ 84 STATEVARS; 85 states st; /* current states */ 86 states fresh; /* states for a fresh start */ 87 states tmp; /* temporary */ 88 states empty; /* empty set of states */ 89}; 90 91/* ========= begin header generated by ./mkh ========= */ 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96/* === engine.c === */ 97static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 98static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 99static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); 100static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 101static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 102static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); 103#define BOL (OUT+1) 104#define EOL (BOL+1) 105#define BOLEOL (BOL+2) 106#define NOTHING (BOL+3) 107#define BOW (BOL+4) 108#define EOW (BOL+5) 109#define CODEMAX (BOL+5) /* highest code used */ 110#define NONCHAR(c) ((c) > CHAR_MAX) 111#define NNONCHAR (CODEMAX-CHAR_MAX) 112#ifdef REDEBUG 113static void print(struct match *m, char *caption, states st, int ch, FILE *d); 114#endif 115#ifdef REDEBUG 116static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 117#endif 118#ifdef REDEBUG 119static char *pchar(int ch); 120#endif 121 122#ifdef __cplusplus 123} 124#endif 125/* ========= end header generated by ./mkh ========= */ 126 127#ifdef REDEBUG 128#define SP(t, s, c) print(m, t, s, c, stdout) 129#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 130#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 131static int nope = 0; 132#else 133#define SP(t, s, c) /* nothing */ 134#define AT(t, p1, p2, s1, s2) /* nothing */ 135#define NOTE(s) /* nothing */ 136#endif 137 138/* 139 - matcher - the actual matching engine 140 == static int matcher(struct re_guts *g, char *string, \ 141 == size_t nmatch, regmatch_t pmatch[], int eflags); 142 */ 143static int /* 0 success, REG_NOMATCH failure */ 144matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 145 int eflags) 146{ 147 char *endp; 148 int i; 149 struct match mv; 150 struct match *m = &mv; 151 char *dp; 152 const sopno gf = g->firststate+1; /* +1 for OEND */ 153 const sopno gl = g->laststate; 154 char *start; 155 char *stop; 156 157 /* simplify the situation where possible */ 158 if (g->cflags®_NOSUB) 159 nmatch = 0; 160 if (eflags®_STARTEND) { 161 start = string + pmatch[0].rm_so; 162 stop = string + pmatch[0].rm_eo; 163 } else { 164 start = string; 165 stop = start + strlen(start); 166 } 167 if (stop < start) 168 return(REG_INVARG); 169 170 /* prescreening; this does wonders for this rather slow code */ 171 if (g->must != NULL) { 172 for (dp = start; dp < stop; dp++) 173 if (*dp == g->must[0] && stop - dp >= g->mlen && 174 memcmp(dp, g->must, (size_t)g->mlen) == 0) 175 break; 176 if (dp == stop) /* we didn't find g->must */ 177 return(REG_NOMATCH); 178 } 179 180 /* match struct setup */ 181 m->g = g; 182 m->eflags = eflags; 183 m->pmatch = NULL; 184 m->lastpos = NULL; 185 m->offp = string; 186 m->beginp = start; 187 m->endp = stop; 188 STATESETUP(m, 4); 189 SETUP(m->st); 190 SETUP(m->fresh); 191 SETUP(m->tmp); 192 SETUP(m->empty); 193 CLEAR(m->empty); 194 195 /* this loop does only one repetition except for backrefs */ 196 for (;;) { 197 endp = fast(m, start, stop, gf, gl); 198 if (endp == NULL) { /* a miss */ 199 STATETEARDOWN(m); 200 return(REG_NOMATCH); 201 } 202 if (nmatch == 0 && !g->backrefs) 203 break; /* no further info needed */ 204 205 /* where? */ 206 assert(m->coldp != NULL); 207 for (;;) { 208 NOTE("finding start"); 209 endp = slow(m, m->coldp, stop, gf, gl); 210 if (endp != NULL) 211 break; 212 assert(m->coldp < m->endp); 213 m->coldp++; 214 } 215 if (nmatch == 1 && !g->backrefs) 216 break; /* no further info needed */ 217 218 /* oh my, he wants the subexpressions... */ 219 if (m->pmatch == NULL) 220 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 221 sizeof(regmatch_t)); 222 if (m->pmatch == NULL) { 223 STATETEARDOWN(m); 224 return(REG_ESPACE); 225 } 226 for (i = 1; i <= m->g->nsub; i++) 227 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 228 if (!g->backrefs && !(m->eflags®_BACKR)) { 229 NOTE("dissecting"); 230 dp = dissect(m, m->coldp, endp, gf, gl); 231 } else { 232 if (g->nplus > 0 && m->lastpos == NULL) 233 m->lastpos = (char **)malloc((g->nplus+1) * 234 sizeof(char *)); 235 if (g->nplus > 0 && m->lastpos == NULL) { 236 free(m->pmatch); 237 STATETEARDOWN(m); 238 return(REG_ESPACE); 239 } 240 NOTE("backref dissect"); 241 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 242 } 243 if (dp != NULL) 244 break; 245 246 /* uh-oh... we couldn't find a subexpression-level match */ 247 assert(g->backrefs); /* must be back references doing it */ 248 assert(g->nplus == 0 || m->lastpos != NULL); 249 for (;;) { 250 if (dp != NULL || endp <= m->coldp) 251 break; /* defeat */ 252 NOTE("backoff"); 253 endp = slow(m, m->coldp, endp-1, gf, gl); 254 if (endp == NULL) 255 break; /* defeat */ 256 /* try it on a shorter possibility */ 257#ifndef NDEBUG 258 for (i = 1; i <= m->g->nsub; i++) { 259 assert(m->pmatch[i].rm_so == -1); 260 assert(m->pmatch[i].rm_eo == -1); 261 } 262#endif 263 NOTE("backoff dissect"); 264 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 265 } 266 assert(dp == NULL || dp == endp); 267 if (dp != NULL) /* found a shorter one */ 268 break; 269 270 /* despite initial appearances, there is no match here */ 271 NOTE("false alarm"); 272 if (m->coldp == stop) 273 break; 274 start = m->coldp + 1; /* recycle starting later */ 275 } 276 277 /* fill in the details if requested */ 278 if (nmatch > 0) { 279 pmatch[0].rm_so = m->coldp - m->offp; 280 pmatch[0].rm_eo = endp - m->offp; 281 } 282 if (nmatch > 1) { 283 assert(m->pmatch != NULL); 284 for (i = 1; i < nmatch; i++) 285 if (i <= m->g->nsub) 286 pmatch[i] = m->pmatch[i]; 287 else { 288 pmatch[i].rm_so = -1; 289 pmatch[i].rm_eo = -1; 290 } 291 } 292 293 if (m->pmatch != NULL) 294 free((char *)m->pmatch); 295 if (m->lastpos != NULL) 296 free((char *)m->lastpos); 297 STATETEARDOWN(m); 298 return(0); 299} 300 301/* 302 - dissect - figure out what matched what, no back references 303 == static char *dissect(struct match *m, char *start, \ 304 == char *stop, sopno startst, sopno stopst); 305 */ 306static char * /* == stop (success) always */ 307dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 308{ 309 int i; 310 sopno ss; /* start sop of current subRE */ 311 sopno es; /* end sop of current subRE */ 312 char *sp; /* start of string matched by it */ 313 char *stp; /* string matched by it cannot pass here */ 314 char *rest; /* start of rest of string */ 315 char *tail; /* string unmatched by rest of RE */ 316 sopno ssub; /* start sop of subsubRE */ 317 sopno esub; /* end sop of subsubRE */ 318 char *ssp; /* start of string matched by subsubRE */ 319 char *sep; /* end of string matched by subsubRE */ 320 char *oldssp; /* previous ssp */ 321 char *dp; 322 323 AT("diss", start, stop, startst, stopst); 324 sp = start; 325 for (ss = startst; ss < stopst; ss = es) { 326 /* identify end of subRE */ 327 es = ss; 328 switch (OP(m->g->strip[es])) { 329 case OPLUS_: 330 case OQUEST_: 331 es += OPND(m->g->strip[es]); 332 break; 333 case OCH_: 334 while (OP(m->g->strip[es]) != O_CH) 335 es += OPND(m->g->strip[es]); 336 break; 337 } 338 es++; 339 340 /* figure out what it matched */ 341 switch (OP(m->g->strip[ss])) { 342 case OEND: 343 assert(nope); 344 break; 345 case OCHAR: 346 sp++; 347 break; 348 case OBOL: 349 case OEOL: 350 case OBOW: 351 case OEOW: 352 break; 353 case OANY: 354 case OANYOF: 355 sp++; 356 break; 357 case OBACK_: 358 case O_BACK: 359 assert(nope); 360 break; 361 /* cases where length of match is hard to find */ 362 case OQUEST_: 363 stp = stop; 364 for (;;) { 365 /* how long could this one be? */ 366 rest = slow(m, sp, stp, ss, es); 367 assert(rest != NULL); /* it did match */ 368 /* could the rest match the rest? */ 369 tail = slow(m, rest, stop, es, stopst); 370 if (tail == stop) 371 break; /* yes! */ 372 /* no -- try a shorter match for this one */ 373 stp = rest - 1; 374 assert(stp >= sp); /* it did work */ 375 } 376 ssub = ss + 1; 377 esub = es - 1; 378 /* did innards match? */ 379 if (slow(m, sp, rest, ssub, esub) != NULL) { 380 dp = dissect(m, sp, rest, ssub, esub); 381 assert(dp == rest); 382 } else /* no */ 383 assert(sp == rest); 384 sp = rest; 385 break; 386 case OPLUS_: 387 stp = stop; 388 for (;;) { 389 /* how long could this one be? */ 390 rest = slow(m, sp, stp, ss, es); 391 assert(rest != NULL); /* it did match */ 392 /* could the rest match the rest? */ 393 tail = slow(m, rest, stop, es, stopst); 394 if (tail == stop) 395 break; /* yes! */ 396 /* no -- try a shorter match for this one */ 397 stp = rest - 1; 398 assert(stp >= sp); /* it did work */ 399 } 400 ssub = ss + 1; 401 esub = es - 1; 402 ssp = sp; 403 oldssp = ssp; 404 for (;;) { /* find last match of innards */ 405 sep = slow(m, ssp, rest, ssub, esub); 406 if (sep == NULL || sep == ssp) 407 break; /* failed or matched null */ 408 oldssp = ssp; /* on to next try */ 409 ssp = sep; 410 } 411 if (sep == NULL) { 412 /* last successful match */ 413 sep = ssp; 414 ssp = oldssp; 415 } 416 assert(sep == rest); /* must exhaust substring */ 417 assert(slow(m, ssp, sep, ssub, esub) == rest); 418 dp = dissect(m, ssp, sep, ssub, esub); 419 assert(dp == sep); 420 sp = rest; 421 break; 422 case OCH_: 423 stp = stop; 424 for (;;) { 425 /* how long could this one be? */ 426 rest = slow(m, sp, stp, ss, es); 427 assert(rest != NULL); /* it did match */ 428 /* could the rest match the rest? */ 429 tail = slow(m, rest, stop, es, stopst); 430 if (tail == stop) 431 break; /* yes! */ 432 /* no -- try a shorter match for this one */ 433 stp = rest - 1; 434 assert(stp >= sp); /* it did work */ 435 } 436 ssub = ss + 1; 437 esub = ss + OPND(m->g->strip[ss]) - 1; 438 assert(OP(m->g->strip[esub]) == OOR1); 439 for (;;) { /* find first matching branch */ 440 if (slow(m, sp, rest, ssub, esub) == rest) 441 break; /* it matched all of it */ 442 /* that one missed, try next one */ 443 assert(OP(m->g->strip[esub]) == OOR1); 444 esub++; 445 assert(OP(m->g->strip[esub]) == OOR2); 446 ssub = esub + 1; 447 esub += OPND(m->g->strip[esub]); 448 if (OP(m->g->strip[esub]) == OOR2) 449 esub--; 450 else 451 assert(OP(m->g->strip[esub]) == O_CH); 452 } 453 dp = dissect(m, sp, rest, ssub, esub); 454 assert(dp == rest); 455 sp = rest; 456 break; 457 case O_PLUS: 458 case O_QUEST: 459 case OOR1: 460 case OOR2: 461 case O_CH: 462 assert(nope); 463 break; 464 case OLPAREN: 465 i = OPND(m->g->strip[ss]); 466 assert(0 < i && i <= m->g->nsub); 467 m->pmatch[i].rm_so = sp - m->offp; 468 break; 469 case ORPAREN: 470 i = OPND(m->g->strip[ss]); 471 assert(0 < i && i <= m->g->nsub); 472 m->pmatch[i].rm_eo = sp - m->offp; 473 break; 474 default: /* uh oh */ 475 assert(nope); 476 break; 477 } 478 } 479 480 assert(sp == stop); 481 return(sp); 482} 483 484/* 485 - backref - figure out what matched what, figuring in back references 486 == static char *backref(struct match *m, char *start, \ 487 == char *stop, sopno startst, sopno stopst, sopno lev); 488 */ 489static char * /* == stop (success) or NULL (failure) */ 490backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 491 sopno lev) /* PLUS nesting level */ 492{ 493 int i; 494 sopno ss; /* start sop of current subRE */ 495 char *sp; /* start of string matched by it */ 496 sopno ssub; /* start sop of subsubRE */ 497 sopno esub; /* end sop of subsubRE */ 498 char *ssp; /* start of string matched by subsubRE */ 499 char *dp; 500 size_t len; 501 int hard; 502 sop s; 503 regoff_t offsave; 504 cset *cs; 505 506 AT("back", start, stop, startst, stopst); 507 sp = start; 508 509 /* get as far as we can with easy stuff */ 510 hard = 0; 511 for (ss = startst; !hard && ss < stopst; ss++) 512 switch (OP(s = m->g->strip[ss])) { 513 case OCHAR: 514 if (sp == stop || *sp++ != (char)OPND(s)) 515 return(NULL); 516 break; 517 case OANY: 518 if (sp == stop) 519 return(NULL); 520 sp++; 521 break; 522 case OANYOF: 523 cs = &m->g->sets[OPND(s)]; 524 if (sp == stop || !CHIN(cs, *sp++)) 525 return(NULL); 526 break; 527 case OBOL: 528 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 529 (sp < m->endp && *(sp-1) == '\n' && 530 (m->g->cflags®_NEWLINE)) ) 531 { /* yes */ } 532 else 533 return(NULL); 534 break; 535 case OEOL: 536 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 537 (sp < m->endp && *sp == '\n' && 538 (m->g->cflags®_NEWLINE)) ) 539 { /* yes */ } 540 else 541 return(NULL); 542 break; 543 case OBOW: 544 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 545 (sp < m->endp && *(sp-1) == '\n' && 546 (m->g->cflags®_NEWLINE)) || 547 (sp > m->beginp && 548 !ISWORD(*(sp-1))) ) && 549 (sp < m->endp && ISWORD(*sp)) ) 550 { /* yes */ } 551 else 552 return(NULL); 553 break; 554 case OEOW: 555 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 556 (sp < m->endp && *sp == '\n' && 557 (m->g->cflags®_NEWLINE)) || 558 (sp < m->endp && !ISWORD(*sp)) ) && 559 (sp > m->beginp && ISWORD(*(sp-1))) ) 560 { /* yes */ } 561 else 562 return(NULL); 563 break; 564 case O_QUEST: 565 break; 566 case OOR1: /* matches null but needs to skip */ 567 ss++; 568 s = m->g->strip[ss]; 569 do { 570 assert(OP(s) == OOR2); 571 ss += OPND(s); 572 } while (OP(s = m->g->strip[ss]) != O_CH); 573 /* note that the ss++ gets us past the O_CH */ 574 break; 575 default: /* have to make a choice */ 576 hard = 1; 577 break; 578 } 579 if (!hard) { /* that was it! */ 580 if (sp != stop) 581 return(NULL); 582 return(sp); 583 } 584 ss--; /* adjust for the for's final increment */ 585 586 /* the hard stuff */ 587 AT("hard", sp, stop, ss, stopst); 588 s = m->g->strip[ss]; 589 switch (OP(s)) { 590 case OBACK_: /* the vilest depths */ 591 i = OPND(s); 592 assert(0 < i && i <= m->g->nsub); 593 if (m->pmatch[i].rm_eo == -1) 594 return(NULL); 595 assert(m->pmatch[i].rm_so != -1); 596 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 597 if (len == 0) 598 return(NULL); 599 assert(stop - m->beginp >= len); 600 if (sp > stop - len) 601 return(NULL); /* not enough left to match */ 602 ssp = m->offp + m->pmatch[i].rm_so; 603 if (memcmp(sp, ssp, len) != 0) 604 return(NULL); 605 while (m->g->strip[ss] != SOP(O_BACK, i)) 606 ss++; 607 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 608 break; 609 case OQUEST_: /* to null or not */ 610 dp = backref(m, sp, stop, ss+1, stopst, lev); 611 if (dp != NULL) 612 return(dp); /* not */ 613 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 614 break; 615 case OPLUS_: 616 assert(m->lastpos != NULL); 617 assert(lev+1 <= m->g->nplus); 618 m->lastpos[lev+1] = sp; 619 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 620 break; 621 case O_PLUS: 622 if (sp == m->lastpos[lev]) /* last pass matched null */ 623 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 624 /* try another pass */ 625 m->lastpos[lev] = sp; 626 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 627 if (dp == NULL) 628 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 629 else 630 return(dp); 631 break; 632 case OCH_: /* find the right one, if any */ 633 ssub = ss + 1; 634 esub = ss + OPND(s) - 1; 635 assert(OP(m->g->strip[esub]) == OOR1); 636 for (;;) { /* find first matching branch */ 637 dp = backref(m, sp, stop, ssub, esub, lev); 638 if (dp != NULL) 639 return(dp); 640 /* that one missed, try next one */ 641 if (OP(m->g->strip[esub]) == O_CH) 642 return(NULL); /* there is none */ 643 esub++; 644 assert(OP(m->g->strip[esub]) == OOR2); 645 ssub = esub + 1; 646 esub += OPND(m->g->strip[esub]); 647 if (OP(m->g->strip[esub]) == OOR2) 648 esub--; 649 else 650 assert(OP(m->g->strip[esub]) == O_CH); 651 } 652 break; 653 case OLPAREN: /* must undo assignment if rest fails */ 654 i = OPND(s); 655 assert(0 < i && i <= m->g->nsub); 656 offsave = m->pmatch[i].rm_so; 657 m->pmatch[i].rm_so = sp - m->offp; 658 dp = backref(m, sp, stop, ss+1, stopst, lev); 659 if (dp != NULL) 660 return(dp); 661 m->pmatch[i].rm_so = offsave; 662 return(NULL); 663 break; 664 case ORPAREN: /* must undo assignment if rest fails */ 665 i = OPND(s); 666 assert(0 < i && i <= m->g->nsub); 667 offsave = m->pmatch[i].rm_eo; 668 m->pmatch[i].rm_eo = sp - m->offp; 669 dp = backref(m, sp, stop, ss+1, stopst, lev); 670 if (dp != NULL) 671 return(dp); 672 m->pmatch[i].rm_eo = offsave; 673 return(NULL); 674 break; 675 default: /* uh oh */ 676 assert(nope); 677 break; 678 } 679 680 /* "can't happen" */ 681 assert(nope); 682 /* NOTREACHED */ 683} 684 685/* 686 - fast - step through the string at top speed 687 == static char *fast(struct match *m, char *start, \ 688 == char *stop, sopno startst, sopno stopst); 689 */ 690static char * /* where tentative match ended, or NULL */ 691fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 692{ 693 states st = m->st; 694 states fresh = m->fresh; 695 states tmp = m->tmp; 696 char *p = start; 697 int c = (start == m->beginp) ? OUT : *(start-1); 698 int lastc; /* previous c */ 699 int flagch; 700 int i; 701 char *coldp; /* last p after which no match was underway */ 702 703 CLEAR(st); 704 SET1(st, startst); 705 st = step(m->g, startst, stopst, st, NOTHING, st); 706 ASSIGN(fresh, st); 707 SP("start", st, *p); 708 coldp = NULL; 709 for (;;) { 710 /* next character */ 711 lastc = c; 712 c = (p == m->endp) ? OUT : *p; 713 if (EQ(st, fresh)) 714 coldp = p; 715 716 /* is there an EOL and/or BOL between lastc and c? */ 717 flagch = '\0'; 718 i = 0; 719 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 720 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 721 flagch = BOL; 722 i = m->g->nbol; 723 } 724 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 725 (c == OUT && !(m->eflags®_NOTEOL)) ) { 726 flagch = (flagch == BOL) ? BOLEOL : EOL; 727 i += m->g->neol; 728 } 729 if (i != 0) { 730 for (; i > 0; i--) 731 st = step(m->g, startst, stopst, st, flagch, st); 732 SP("boleol", st, c); 733 } 734 735 /* how about a word boundary? */ 736 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 737 (c != OUT && ISWORD(c)) ) { 738 flagch = BOW; 739 } 740 if ( (lastc != OUT && ISWORD(lastc)) && 741 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 742 flagch = EOW; 743 } 744 if (flagch == BOW || flagch == EOW) { 745 st = step(m->g, startst, stopst, st, flagch, st); 746 SP("boweow", st, c); 747 } 748 749 /* are we done? */ 750 if (ISSET(st, stopst) || p == stop) 751 break; /* NOTE BREAK OUT */ 752 753 /* no, we must deal with this character */ 754 ASSIGN(tmp, st); 755 ASSIGN(st, fresh); 756 assert(c != OUT); 757 st = step(m->g, startst, stopst, tmp, c, st); 758 SP("aft", st, c); 759 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 760 p++; 761 } 762 763 assert(coldp != NULL); 764 m->coldp = coldp; 765 if (ISSET(st, stopst)) 766 return(p+1); 767 else 768 return(NULL); 769} 770 771/* 772 - slow - step through the string more deliberately 773 == static char *slow(struct match *m, char *start, \ 774 == char *stop, sopno startst, sopno stopst); 775 */ 776static char * /* where it ended */ 777slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 778{ 779 states st = m->st; 780 states empty = m->empty; 781 states tmp = m->tmp; 782 char *p = start; 783 int c = (start == m->beginp) ? OUT : *(start-1); 784 int lastc; /* previous c */ 785 int flagch; 786 int i; 787 char *matchp; /* last p at which a match ended */ 788 789 AT("slow", start, stop, startst, stopst); 790 CLEAR(st); 791 SET1(st, startst); 792 SP("sstart", st, *p); 793 st = step(m->g, startst, stopst, st, NOTHING, st); 794 matchp = NULL; 795 for (;;) { 796 /* next character */ 797 lastc = c; 798 c = (p == m->endp) ? OUT : *p; 799 800 /* is there an EOL and/or BOL between lastc and c? */ 801 flagch = '\0'; 802 i = 0; 803 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 804 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 805 flagch = BOL; 806 i = m->g->nbol; 807 } 808 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 809 (c == OUT && !(m->eflags®_NOTEOL)) ) { 810 flagch = (flagch == BOL) ? BOLEOL : EOL; 811 i += m->g->neol; 812 } 813 if (i != 0) { 814 for (; i > 0; i--) 815 st = step(m->g, startst, stopst, st, flagch, st); 816 SP("sboleol", st, c); 817 } 818 819 /* how about a word boundary? */ 820 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 821 (c != OUT && ISWORD(c)) ) { 822 flagch = BOW; 823 } 824 if ( (lastc != OUT && ISWORD(lastc)) && 825 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 826 flagch = EOW; 827 } 828 if (flagch == BOW || flagch == EOW) { 829 st = step(m->g, startst, stopst, st, flagch, st); 830 SP("sboweow", st, c); 831 } 832 833 /* are we done? */ 834 if (ISSET(st, stopst)) 835 matchp = p; 836 if (EQ(st, empty) || p == stop) 837 break; /* NOTE BREAK OUT */ 838 839 /* no, we must deal with this character */ 840 ASSIGN(tmp, st); 841 ASSIGN(st, empty); 842 assert(c != OUT); 843 st = step(m->g, startst, stopst, tmp, c, st); 844 SP("saft", st, c); 845 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 846 p++; 847 } 848 849 return(matchp); 850} 851 852 853/* 854 - step - map set of states reachable before char to set reachable after 855 == static states step(struct re_guts *g, sopno start, sopno stop, \ 856 == states bef, int ch, states aft); 857 == #define BOL (OUT+1) 858 == #define EOL (BOL+1) 859 == #define BOLEOL (BOL+2) 860 == #define NOTHING (BOL+3) 861 == #define BOW (BOL+4) 862 == #define EOW (BOL+5) 863 == #define CODEMAX (BOL+5) // highest code used 864 == #define NONCHAR(c) ((c) > CHAR_MAX) 865 == #define NNONCHAR (CODEMAX-CHAR_MAX) 866 */ 867static states 868step(struct re_guts *g, 869 sopno start, /* start state within strip */ 870 sopno stop, /* state after stop state within strip */ 871 states bef, /* states reachable before */ 872 int ch, /* character or NONCHAR code */ 873 states aft) /* states already known reachable after */ 874{ 875 cset *cs; 876 sop s; 877 sopno pc; 878 onestate here; /* note, macros know this name */ 879 sopno look; 880 int i; 881 882 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 883 s = g->strip[pc]; 884 switch (OP(s)) { 885 case OEND: 886 assert(pc == stop-1); 887 break; 888 case OCHAR: 889 /* only characters can match */ 890 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 891 if (ch == (char)OPND(s)) 892 FWD(aft, bef, 1); 893 break; 894 case OBOL: 895 if (ch == BOL || ch == BOLEOL) 896 FWD(aft, bef, 1); 897 break; 898 case OEOL: 899 if (ch == EOL || ch == BOLEOL) 900 FWD(aft, bef, 1); 901 break; 902 case OBOW: 903 if (ch == BOW) 904 FWD(aft, bef, 1); 905 break; 906 case OEOW: 907 if (ch == EOW) 908 FWD(aft, bef, 1); 909 break; 910 case OANY: 911 if (!NONCHAR(ch)) 912 FWD(aft, bef, 1); 913 break; 914 case OANYOF: 915 cs = &g->sets[OPND(s)]; 916 if (!NONCHAR(ch) && CHIN(cs, ch)) 917 FWD(aft, bef, 1); 918 break; 919 case OBACK_: /* ignored here */ 920 case O_BACK: 921 FWD(aft, aft, 1); 922 break; 923 case OPLUS_: /* forward, this is just an empty */ 924 FWD(aft, aft, 1); 925 break; 926 case O_PLUS: /* both forward and back */ 927 FWD(aft, aft, 1); 928 i = ISSETBACK(aft, OPND(s)); 929 BACK(aft, aft, OPND(s)); 930 if (!i && ISSETBACK(aft, OPND(s))) { 931 /* oho, must reconsider loop body */ 932 pc -= OPND(s) + 1; 933 INIT(here, pc); 934 } 935 break; 936 case OQUEST_: /* two branches, both forward */ 937 FWD(aft, aft, 1); 938 FWD(aft, aft, OPND(s)); 939 break; 940 case O_QUEST: /* just an empty */ 941 FWD(aft, aft, 1); 942 break; 943 case OLPAREN: /* not significant here */ 944 case ORPAREN: 945 FWD(aft, aft, 1); 946 break; 947 case OCH_: /* mark the first two branches */ 948 FWD(aft, aft, 1); 949 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 950 FWD(aft, aft, OPND(s)); 951 break; 952 case OOR1: /* done a branch, find the O_CH */ 953 if (ISSTATEIN(aft, here)) { 954 for (look = 1; 955 OP(s = g->strip[pc+look]) != O_CH; 956 look += OPND(s)) 957 assert(OP(s) == OOR2); 958 FWD(aft, aft, look); 959 } 960 break; 961 case OOR2: /* propagate OCH_'s marking */ 962 FWD(aft, aft, 1); 963 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 964 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 965 FWD(aft, aft, OPND(s)); 966 } 967 break; 968 case O_CH: /* just empty */ 969 FWD(aft, aft, 1); 970 break; 971 default: /* ooooops... */ 972 assert(nope); 973 break; 974 } 975 } 976 977 return(aft); 978} 979 980#ifdef REDEBUG 981/* 982 - print - print a set of states 983 == #ifdef REDEBUG 984 == static void print(struct match *m, char *caption, states st, \ 985 == int ch, FILE *d); 986 == #endif 987 */ 988static void 989print(struct match *m, char *caption, states st, int ch, FILE *d) 990{ 991 struct re_guts *g = m->g; 992 int i; 993 int first = 1; 994 995 if (!(m->eflags®_TRACE)) 996 return; 997 998 (void)fprintf(d, "%s", caption); 999 if (ch != '\0') 1000 (void)fprintf(d, " %s", pchar(ch)); 1001 for (i = 0; i < g->nstates; i++) 1002 if (ISSET(st, i)) { 1003 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1004 first = 0; 1005 } 1006 (void)fprintf(d, "\n"); 1007} 1008 1009/* 1010 - at - print current situation 1011 == #ifdef REDEBUG 1012 == static void at(struct match *m, char *title, char *start, char *stop, \ 1013 == sopno startst, sopno stopst); 1014 == #endif 1015 */ 1016static void 1017at(struct match *m, char *title, char *start, char *stop, sopno startst, 1018 sopno stopst) 1019{ 1020 if (!(m->eflags®_TRACE)) 1021 return; 1022 1023 (void)printf("%s %s-", title, pchar(*start)); 1024 (void)printf("%s ", pchar(*stop)); 1025 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 1026} 1027 1028#ifndef PCHARDONE 1029#define PCHARDONE /* never again */ 1030/* 1031 - pchar - make a character printable 1032 == #ifdef REDEBUG 1033 == static char *pchar(int ch); 1034 == #endif 1035 * 1036 * Is this identical to regchar() over in debug.c? Well, yes. But a 1037 * duplicate here avoids having a debugging-capable regexec.o tied to 1038 * a matching debug.o, and this is convenient. It all disappears in 1039 * the non-debug compilation anyway, so it doesn't matter much. 1040 */ 1041static char * /* -> representation */ 1042pchar(int ch) 1043{ 1044 static char pbuf[10]; 1045 1046 if (isprint(ch) || ch == ' ') 1047 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1048 else 1049 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1050 return(pbuf); 1051} 1052#endif 1053#endif 1054 1055#undef matcher 1056#undef fast 1057#undef slow 1058#undef dissect 1059#undef backref 1060#undef step 1061#undef print 1062#undef at 1063#undef match 1064#undef nope 1065