engine.c revision 62754
1124208Sdes/*- 2124208Sdes * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3124208Sdes * Copyright (c) 1992, 1993, 1994 4124208Sdes * The Regents of the University of California. All rights reserved. 5124208Sdes * 6124208Sdes * This code is derived from software contributed to Berkeley by 7124208Sdes * Henry Spencer. 8124208Sdes * 9124208Sdes * Redistribution and use in source and binary forms, with or without 10124208Sdes * modification, are permitted provided that the following conditions 11124208Sdes * are met: 12124208Sdes * 1. Redistributions of source code must retain the above copyright 13124208Sdes * notice, this list of conditions and the following disclaimer. 14124208Sdes * 2. Redistributions in binary form must reproduce the above copyright 15124208Sdes * notice, this list of conditions and the following disclaimer in the 16124208Sdes * documentation and/or other materials provided with the distribution. 17124208Sdes * 3. All advertising materials mentioning features or use of this software 18124208Sdes * must display the following acknowledgement: 19124208Sdes * This product includes software developed by the University of 20124208Sdes * California, Berkeley and its contributors. 21124208Sdes * 4. Neither the name of the University nor the names of its contributors 22124208Sdes * may be used to endorse or promote products derived from this software 23124208Sdes * without specific prior written permission. 24124208Sdes * 25124208Sdes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26124208Sdes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27124208Sdes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28124208Sdes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29124208Sdes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30124208Sdes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31124208Sdes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32124208Sdes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33124208Sdes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34124208Sdes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35124208Sdes * SUCH DAMAGE. 36124208Sdes * 37124208Sdes * @(#)engine.c 8.5 (Berkeley) 3/20/94 38124208Sdes * 39124208Sdes * $FreeBSD: head/lib/libc/regex/engine.c 62754 2000-07-07 07:46:36Z dcs $ 40124208Sdes */ 41124208Sdes 42124208Sdes/* 43124208Sdes * The matching engine and friends. This file is #included by regexec.c 44124208Sdes * after suitable #defines of a variety of macros used herein, so that 45124208Sdes * different state representations can be used without duplicating masses 46124208Sdes * of code. 47124208Sdes */ 48124208Sdes 49124208Sdes#ifdef SNAMES 50124208Sdes#define matcher smatcher 51124208Sdes#define fast sfast 52124208Sdes#define slow sslow 53124208Sdes#define dissect sdissect 54124208Sdes#define backref sbackref 55124208Sdes#define step sstep 56124208Sdes#define print sprint 57124208Sdes#define at sat 58124208Sdes#define match smat 59124208Sdes#endif 60124208Sdes#ifdef LNAMES 61124208Sdes#define matcher lmatcher 62124208Sdes#define fast lfast 63124208Sdes#define slow lslow 64124208Sdes#define dissect ldissect 65124208Sdes#define backref lbackref 66124208Sdes#define step lstep 67124208Sdes#define print lprint 68124208Sdes#define at lat 69124208Sdes#define match lmat 70124208Sdes#endif 71124208Sdes 72124208Sdes/* another structure passed up and down to avoid zillions of parameters */ 73124208Sdesstruct match { 74124208Sdes struct re_guts *g; 75124208Sdes int eflags; 76124208Sdes regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77124208Sdes char *offp; /* offsets work from here */ 78124208Sdes char *beginp; /* start of string -- virtual NUL precedes */ 79124208Sdes char *endp; /* end of string -- virtual NUL here */ 80124208Sdes char *coldp; /* can be no match starting before here */ 81124208Sdes char **lastpos; /* [nplus+1] */ 82124208Sdes STATEVARS; 83124208Sdes states st; /* current states */ 84124208Sdes states fresh; /* states for a fresh start */ 85124208Sdes states tmp; /* temporary */ 86124208Sdes states empty; /* empty set of states */ 87124208Sdes}; 88124208Sdes 89124208Sdes/* ========= begin header generated by ./mkh ========= */ 90124208Sdes#ifdef __cplusplus 91124208Sdesextern "C" { 92124208Sdes#endif 93124208Sdes 94124208Sdes/* === engine.c === */ 95124208Sdesstatic int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 96124208Sdesstatic char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97124208Sdesstatic char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 98124208Sdesstatic char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 99124208Sdesstatic char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 100124208Sdesstatic states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 101124208Sdes#define BOL (OUT+1) 102124208Sdes#define EOL (BOL+1) 103124208Sdes#define BOLEOL (BOL+2) 104124208Sdes#define NOTHING (BOL+3) 105124208Sdes#define BOW (BOL+4) 106124208Sdes#define EOW (BOL+5) 107124208Sdes#define CODEMAX (BOL+5) /* highest code used */ 108124208Sdes#define NONCHAR(c) ((c) > CHAR_MAX) 109124208Sdes#define NNONCHAR (CODEMAX-CHAR_MAX) 110124208Sdes#ifdef REDEBUG 111124208Sdesstatic void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 112124208Sdes#endif 113124208Sdes#ifdef REDEBUG 114124208Sdesstatic void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 115124208Sdes#endif 116124208Sdes#ifdef REDEBUG 117124208Sdesstatic char *pchar __P((int ch)); 118124208Sdes#endif 119124208Sdes 120124208Sdes#ifdef __cplusplus 121124208Sdes} 122124208Sdes#endif 123124208Sdes/* ========= end header generated by ./mkh ========= */ 124124208Sdes 125124208Sdes#ifdef REDEBUG 126124208Sdes#define SP(t, s, c) print(m, t, s, c, stdout) 127124208Sdes#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 128124208Sdes#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 129124208Sdes#else 130124208Sdes#define SP(t, s, c) /* nothing */ 131124208Sdes#define AT(t, p1, p2, s1, s2) /* nothing */ 132124208Sdes#define NOTE(s) /* nothing */ 133124208Sdes#endif 134124208Sdes 135124208Sdes/* 136124208Sdes - matcher - the actual matching engine 137124208Sdes == static int matcher(register struct re_guts *g, char *string, \ 138124208Sdes == size_t nmatch, regmatch_t pmatch[], int eflags); 139124208Sdes */ 140124208Sdesstatic int /* 0 success, REG_NOMATCH failure */ 141124208Sdesmatcher(g, string, nmatch, pmatch, eflags) 142124208Sdesregister struct re_guts *g; 143124208Sdeschar *string; 144124208Sdessize_t nmatch; 145124208Sdesregmatch_t pmatch[]; 146124208Sdesint eflags; 147124208Sdes{ 148124208Sdes register char *endp; 149124208Sdes register int i; 150124208Sdes struct match mv; 151124208Sdes register struct match *m = &mv; 152124208Sdes register char *dp; 153124208Sdes register const sopno gf = g->firststate+1; /* +1 for OEND */ 154124208Sdes register const sopno gl = g->laststate; 155124208Sdes char *start; 156124208Sdes char *stop; 157124208Sdes /* Boyer-Moore algorithms variables */ 158124208Sdes register char *pp; 159124208Sdes int cj, mj; 160124208Sdes register char *mustfirst; 161124208Sdes register char *mustlast; 162124208Sdes register int *matchjump; 163124208Sdes register int *charjump; 164124208Sdes 165124208Sdes /* simplify the situation where possible */ 166124208Sdes if (g->cflags®_NOSUB) 167124208Sdes nmatch = 0; 168124208Sdes if (eflags®_STARTEND) { 169124208Sdes start = string + pmatch[0].rm_so; 170124208Sdes stop = string + pmatch[0].rm_eo; 171124208Sdes } else { 172124208Sdes start = string; 173124208Sdes stop = start + strlen(start); 174124208Sdes } 175124208Sdes if (stop < start) 176124208Sdes return(REG_INVARG); 177124208Sdes 178124208Sdes /* prescreening; this does wonders for this rather slow code */ 179124208Sdes if (g->must != NULL) { 180124208Sdes if (g->charjump != NULL && g->matchjump != NULL) { 181124208Sdes mustfirst = g->must; 182124208Sdes mustlast = g->must + g->mlen - 1; 183124208Sdes charjump = g->charjump; 184124208Sdes matchjump = g->matchjump; 185124208Sdes pp = mustlast; 186124208Sdes for (dp = start+g->mlen-1; dp < stop;) { 187124208Sdes /* Fast skip non-matches */ 188124208Sdes while (dp < stop && charjump[*dp]) 189124208Sdes dp += charjump[*dp]; 190124208Sdes 191124208Sdes if (dp >= stop) 192124208Sdes break; 193124208Sdes 194124208Sdes /* Greedy matcher */ 195124208Sdes /* We depend on not being used for 196124208Sdes * for strings of length 1 197124208Sdes */ 198124208Sdes while (*--dp == *--pp && pp != mustfirst); 199124208Sdes 200124208Sdes if (*dp == *pp) 201124208Sdes break; 202124208Sdes 203124208Sdes /* Jump to next possible match */ 204124208Sdes mj = matchjump[pp - mustfirst]; 205124208Sdes cj = charjump[*dp]; 206 dp += (cj < mj ? mj : cj); 207 pp = mustlast; 208 } 209 if (pp != mustfirst) 210 return(REG_NOMATCH); 211 } else { 212 for (dp = start; dp < stop; dp++) 213 if (*dp == g->must[0] && 214 stop - dp >= g->mlen && 215 memcmp(dp, g->must, (size_t)g->mlen) == 0) 216 break; 217 if (dp == stop) /* we didn't find g->must */ 218 return(REG_NOMATCH); 219 } 220 } 221 222 /* match struct setup */ 223 m->g = g; 224 m->eflags = eflags; 225 m->pmatch = NULL; 226 m->lastpos = NULL; 227 m->offp = string; 228 m->beginp = start; 229 m->endp = stop; 230 STATESETUP(m, 4); 231 SETUP(m->st); 232 SETUP(m->fresh); 233 SETUP(m->tmp); 234 SETUP(m->empty); 235 CLEAR(m->empty); 236 237 /* Adjust start according to moffset, to speed things up */ 238 if (g->moffset > -1) 239 start = dp - g->moffset; 240 241 /* this loop does only one repetition except for backrefs */ 242 for (;;) { 243 endp = fast(m, start, stop, gf, gl); 244 if (endp == NULL) { /* a miss */ 245 STATETEARDOWN(m); 246 return(REG_NOMATCH); 247 } 248 if (nmatch == 0 && !g->backrefs) 249 break; /* no further info needed */ 250 251 /* where? */ 252 assert(m->coldp != NULL); 253 for (;;) { 254 NOTE("finding start"); 255 endp = slow(m, m->coldp, stop, gf, gl); 256 if (endp != NULL) 257 break; 258 assert(m->coldp < m->endp); 259 m->coldp++; 260 } 261 if (nmatch == 1 && !g->backrefs) 262 break; /* no further info needed */ 263 264 /* oh my, he wants the subexpressions... */ 265 if (m->pmatch == NULL) 266 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 267 sizeof(regmatch_t)); 268 if (m->pmatch == NULL) { 269 STATETEARDOWN(m); 270 return(REG_ESPACE); 271 } 272 for (i = 1; i <= m->g->nsub; i++) 273 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 274 if (!g->backrefs && !(m->eflags®_BACKR)) { 275 NOTE("dissecting"); 276 dp = dissect(m, m->coldp, endp, gf, gl); 277 } else { 278 if (g->nplus > 0 && m->lastpos == NULL) 279 m->lastpos = (char **)malloc((g->nplus+1) * 280 sizeof(char *)); 281 if (g->nplus > 0 && m->lastpos == NULL) { 282 free(m->pmatch); 283 STATETEARDOWN(m); 284 return(REG_ESPACE); 285 } 286 NOTE("backref dissect"); 287 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 288 } 289 if (dp != NULL) 290 break; 291 292 /* uh-oh... we couldn't find a subexpression-level match */ 293 assert(g->backrefs); /* must be back references doing it */ 294 assert(g->nplus == 0 || m->lastpos != NULL); 295 for (;;) { 296 if (dp != NULL || endp <= m->coldp) 297 break; /* defeat */ 298 NOTE("backoff"); 299 endp = slow(m, m->coldp, endp-1, gf, gl); 300 if (endp == NULL) 301 break; /* defeat */ 302 /* try it on a shorter possibility */ 303#ifndef NDEBUG 304 for (i = 1; i <= m->g->nsub; i++) { 305 assert(m->pmatch[i].rm_so == -1); 306 assert(m->pmatch[i].rm_eo == -1); 307 } 308#endif 309 NOTE("backoff dissect"); 310 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 311 } 312 assert(dp == NULL || dp == endp); 313 if (dp != NULL) /* found a shorter one */ 314 break; 315 316 /* despite initial appearances, there is no match here */ 317 NOTE("false alarm"); 318 start = m->coldp + 1; /* recycle starting later */ 319 assert(start <= stop); 320 } 321 322 /* fill in the details if requested */ 323 if (nmatch > 0) { 324 pmatch[0].rm_so = m->coldp - m->offp; 325 pmatch[0].rm_eo = endp - m->offp; 326 } 327 if (nmatch > 1) { 328 assert(m->pmatch != NULL); 329 for (i = 1; i < nmatch; i++) 330 if (i <= m->g->nsub) 331 pmatch[i] = m->pmatch[i]; 332 else { 333 pmatch[i].rm_so = -1; 334 pmatch[i].rm_eo = -1; 335 } 336 } 337 338 if (m->pmatch != NULL) 339 free((char *)m->pmatch); 340 if (m->lastpos != NULL) 341 free((char *)m->lastpos); 342 STATETEARDOWN(m); 343 return(0); 344} 345 346/* 347 - dissect - figure out what matched what, no back references 348 == static char *dissect(register struct match *m, char *start, \ 349 == char *stop, sopno startst, sopno stopst); 350 */ 351static char * /* == stop (success) always */ 352dissect(m, start, stop, startst, stopst) 353register struct match *m; 354char *start; 355char *stop; 356sopno startst; 357sopno stopst; 358{ 359 register int i; 360 register sopno ss; /* start sop of current subRE */ 361 register sopno es; /* end sop of current subRE */ 362 register char *sp; /* start of string matched by it */ 363 register char *stp; /* string matched by it cannot pass here */ 364 register char *rest; /* start of rest of string */ 365 register char *tail; /* string unmatched by rest of RE */ 366 register sopno ssub; /* start sop of subsubRE */ 367 register sopno esub; /* end sop of subsubRE */ 368 register char *ssp; /* start of string matched by subsubRE */ 369 register char *sep; /* end of string matched by subsubRE */ 370 register char *oldssp; /* previous ssp */ 371 register char *dp; 372 373 AT("diss", start, stop, startst, stopst); 374 sp = start; 375 for (ss = startst; ss < stopst; ss = es) { 376 /* identify end of subRE */ 377 es = ss; 378 switch (OP(m->g->strip[es])) { 379 case OPLUS_: 380 case OQUEST_: 381 es += OPND(m->g->strip[es]); 382 break; 383 case OCH_: 384 while (OP(m->g->strip[es]) != O_CH) 385 es += OPND(m->g->strip[es]); 386 break; 387 } 388 es++; 389 390 /* figure out what it matched */ 391 switch (OP(m->g->strip[ss])) { 392 case OEND: 393 assert(nope); 394 break; 395 case OCHAR: 396 sp++; 397 break; 398 case OBOL: 399 case OEOL: 400 case OBOW: 401 case OEOW: 402 break; 403 case OANY: 404 case OANYOF: 405 sp++; 406 break; 407 case OBACK_: 408 case O_BACK: 409 assert(nope); 410 break; 411 /* cases where length of match is hard to find */ 412 case OQUEST_: 413 stp = stop; 414 for (;;) { 415 /* how long could this one be? */ 416 rest = slow(m, sp, stp, ss, es); 417 assert(rest != NULL); /* it did match */ 418 /* could the rest match the rest? */ 419 tail = slow(m, rest, stop, es, stopst); 420 if (tail == stop) 421 break; /* yes! */ 422 /* no -- try a shorter match for this one */ 423 stp = rest - 1; 424 assert(stp >= sp); /* it did work */ 425 } 426 ssub = ss + 1; 427 esub = es - 1; 428 /* did innards match? */ 429 if (slow(m, sp, rest, ssub, esub) != NULL) { 430 dp = dissect(m, sp, rest, ssub, esub); 431 assert(dp == rest); 432 } else /* no */ 433 assert(sp == rest); 434 sp = rest; 435 break; 436 case OPLUS_: 437 stp = stop; 438 for (;;) { 439 /* how long could this one be? */ 440 rest = slow(m, sp, stp, ss, es); 441 assert(rest != NULL); /* it did match */ 442 /* could the rest match the rest? */ 443 tail = slow(m, rest, stop, es, stopst); 444 if (tail == stop) 445 break; /* yes! */ 446 /* no -- try a shorter match for this one */ 447 stp = rest - 1; 448 assert(stp >= sp); /* it did work */ 449 } 450 ssub = ss + 1; 451 esub = es - 1; 452 ssp = sp; 453 oldssp = ssp; 454 for (;;) { /* find last match of innards */ 455 sep = slow(m, ssp, rest, ssub, esub); 456 if (sep == NULL || sep == ssp) 457 break; /* failed or matched null */ 458 oldssp = ssp; /* on to next try */ 459 ssp = sep; 460 } 461 if (sep == NULL) { 462 /* last successful match */ 463 sep = ssp; 464 ssp = oldssp; 465 } 466 assert(sep == rest); /* must exhaust substring */ 467 assert(slow(m, ssp, sep, ssub, esub) == rest); 468 dp = dissect(m, ssp, sep, ssub, esub); 469 assert(dp == sep); 470 sp = rest; 471 break; 472 case OCH_: 473 stp = stop; 474 for (;;) { 475 /* how long could this one be? */ 476 rest = slow(m, sp, stp, ss, es); 477 assert(rest != NULL); /* it did match */ 478 /* could the rest match the rest? */ 479 tail = slow(m, rest, stop, es, stopst); 480 if (tail == stop) 481 break; /* yes! */ 482 /* no -- try a shorter match for this one */ 483 stp = rest - 1; 484 assert(stp >= sp); /* it did work */ 485 } 486 ssub = ss + 1; 487 esub = ss + OPND(m->g->strip[ss]) - 1; 488 assert(OP(m->g->strip[esub]) == OOR1); 489 for (;;) { /* find first matching branch */ 490 if (slow(m, sp, rest, ssub, esub) == rest) 491 break; /* it matched all of it */ 492 /* that one missed, try next one */ 493 assert(OP(m->g->strip[esub]) == OOR1); 494 esub++; 495 assert(OP(m->g->strip[esub]) == OOR2); 496 ssub = esub + 1; 497 esub += OPND(m->g->strip[esub]); 498 if (OP(m->g->strip[esub]) == OOR2) 499 esub--; 500 else 501 assert(OP(m->g->strip[esub]) == O_CH); 502 } 503 dp = dissect(m, sp, rest, ssub, esub); 504 assert(dp == rest); 505 sp = rest; 506 break; 507 case O_PLUS: 508 case O_QUEST: 509 case OOR1: 510 case OOR2: 511 case O_CH: 512 assert(nope); 513 break; 514 case OLPAREN: 515 i = OPND(m->g->strip[ss]); 516 assert(0 < i && i <= m->g->nsub); 517 m->pmatch[i].rm_so = sp - m->offp; 518 break; 519 case ORPAREN: 520 i = OPND(m->g->strip[ss]); 521 assert(0 < i && i <= m->g->nsub); 522 m->pmatch[i].rm_eo = sp - m->offp; 523 break; 524 default: /* uh oh */ 525 assert(nope); 526 break; 527 } 528 } 529 530 assert(sp == stop); 531 return(sp); 532} 533 534/* 535 - backref - figure out what matched what, figuring in back references 536 == static char *backref(register struct match *m, char *start, \ 537 == char *stop, sopno startst, sopno stopst, sopno lev); 538 */ 539static char * /* == stop (success) or NULL (failure) */ 540backref(m, start, stop, startst, stopst, lev) 541register struct match *m; 542char *start; 543char *stop; 544sopno startst; 545sopno stopst; 546sopno lev; /* PLUS nesting level */ 547{ 548 register int i; 549 register sopno ss; /* start sop of current subRE */ 550 register char *sp; /* start of string matched by it */ 551 register sopno ssub; /* start sop of subsubRE */ 552 register sopno esub; /* end sop of subsubRE */ 553 register char *ssp; /* start of string matched by subsubRE */ 554 register char *dp; 555 register size_t len; 556 register int hard; 557 register sop s; 558 register regoff_t offsave; 559 register cset *cs; 560 561 AT("back", start, stop, startst, stopst); 562 sp = start; 563 564 /* get as far as we can with easy stuff */ 565 hard = 0; 566 for (ss = startst; !hard && ss < stopst; ss++) 567 switch (OP(s = m->g->strip[ss])) { 568 case OCHAR: 569 if (sp == stop || *sp++ != (char)OPND(s)) 570 return(NULL); 571 break; 572 case OANY: 573 if (sp == stop) 574 return(NULL); 575 sp++; 576 break; 577 case OANYOF: 578 cs = &m->g->sets[OPND(s)]; 579 if (sp == stop || !CHIN(cs, *sp++)) 580 return(NULL); 581 break; 582 case OBOL: 583 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 584 (sp < m->endp && *(sp-1) == '\n' && 585 (m->g->cflags®_NEWLINE)) ) 586 { /* yes */ } 587 else 588 return(NULL); 589 break; 590 case OEOL: 591 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 592 (sp < m->endp && *sp == '\n' && 593 (m->g->cflags®_NEWLINE)) ) 594 { /* yes */ } 595 else 596 return(NULL); 597 break; 598 case OBOW: 599 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 600 (sp < m->endp && *(sp-1) == '\n' && 601 (m->g->cflags®_NEWLINE)) || 602 (sp > m->beginp && 603 !ISWORD(*(sp-1))) ) && 604 (sp < m->endp && ISWORD(*sp)) ) 605 { /* yes */ } 606 else 607 return(NULL); 608 break; 609 case OEOW: 610 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 611 (sp < m->endp && *sp == '\n' && 612 (m->g->cflags®_NEWLINE)) || 613 (sp < m->endp && !ISWORD(*sp)) ) && 614 (sp > m->beginp && ISWORD(*(sp-1))) ) 615 { /* yes */ } 616 else 617 return(NULL); 618 break; 619 case O_QUEST: 620 break; 621 case OOR1: /* matches null but needs to skip */ 622 ss++; 623 s = m->g->strip[ss]; 624 do { 625 assert(OP(s) == OOR2); 626 ss += OPND(s); 627 } while (OP(s = m->g->strip[ss]) != O_CH); 628 /* note that the ss++ gets us past the O_CH */ 629 break; 630 default: /* have to make a choice */ 631 hard = 1; 632 break; 633 } 634 if (!hard) { /* that was it! */ 635 if (sp != stop) 636 return(NULL); 637 return(sp); 638 } 639 ss--; /* adjust for the for's final increment */ 640 641 /* the hard stuff */ 642 AT("hard", sp, stop, ss, stopst); 643 s = m->g->strip[ss]; 644 switch (OP(s)) { 645 case OBACK_: /* the vilest depths */ 646 i = OPND(s); 647 assert(0 < i && i <= m->g->nsub); 648 if (m->pmatch[i].rm_eo == -1) 649 return(NULL); 650 assert(m->pmatch[i].rm_so != -1); 651 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 652 assert(stop - m->beginp >= len); 653 if (sp > stop - len) 654 return(NULL); /* not enough left to match */ 655 ssp = m->offp + m->pmatch[i].rm_so; 656 if (memcmp(sp, ssp, len) != 0) 657 return(NULL); 658 while (m->g->strip[ss] != SOP(O_BACK, i)) 659 ss++; 660 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 661 break; 662 case OQUEST_: /* to null or not */ 663 dp = backref(m, sp, stop, ss+1, stopst, lev); 664 if (dp != NULL) 665 return(dp); /* not */ 666 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 667 break; 668 case OPLUS_: 669 assert(m->lastpos != NULL); 670 assert(lev+1 <= m->g->nplus); 671 m->lastpos[lev+1] = sp; 672 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 673 break; 674 case O_PLUS: 675 if (sp == m->lastpos[lev]) /* last pass matched null */ 676 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 677 /* try another pass */ 678 m->lastpos[lev] = sp; 679 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 680 if (dp == NULL) 681 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 682 else 683 return(dp); 684 break; 685 case OCH_: /* find the right one, if any */ 686 ssub = ss + 1; 687 esub = ss + OPND(s) - 1; 688 assert(OP(m->g->strip[esub]) == OOR1); 689 for (;;) { /* find first matching branch */ 690 dp = backref(m, sp, stop, ssub, esub, lev); 691 if (dp != NULL) 692 return(dp); 693 /* that one missed, try next one */ 694 if (OP(m->g->strip[esub]) == O_CH) 695 return(NULL); /* there is none */ 696 esub++; 697 assert(OP(m->g->strip[esub]) == OOR2); 698 ssub = esub + 1; 699 esub += OPND(m->g->strip[esub]); 700 if (OP(m->g->strip[esub]) == OOR2) 701 esub--; 702 else 703 assert(OP(m->g->strip[esub]) == O_CH); 704 } 705 break; 706 case OLPAREN: /* must undo assignment if rest fails */ 707 i = OPND(s); 708 assert(0 < i && i <= m->g->nsub); 709 offsave = m->pmatch[i].rm_so; 710 m->pmatch[i].rm_so = sp - m->offp; 711 dp = backref(m, sp, stop, ss+1, stopst, lev); 712 if (dp != NULL) 713 return(dp); 714 m->pmatch[i].rm_so = offsave; 715 return(NULL); 716 break; 717 case ORPAREN: /* must undo assignment if rest fails */ 718 i = OPND(s); 719 assert(0 < i && i <= m->g->nsub); 720 offsave = m->pmatch[i].rm_eo; 721 m->pmatch[i].rm_eo = sp - m->offp; 722 dp = backref(m, sp, stop, ss+1, stopst, lev); 723 if (dp != NULL) 724 return(dp); 725 m->pmatch[i].rm_eo = offsave; 726 return(NULL); 727 break; 728 default: /* uh oh */ 729 assert(nope); 730 break; 731 } 732 733 /* "can't happen" */ 734 assert(nope); 735 /* NOTREACHED */ 736 return "shut up gcc"; 737} 738 739/* 740 - fast - step through the string at top speed 741 == static char *fast(register struct match *m, char *start, \ 742 == char *stop, sopno startst, sopno stopst); 743 */ 744static char * /* where tentative match ended, or NULL */ 745fast(m, start, stop, startst, stopst) 746register struct match *m; 747char *start; 748char *stop; 749sopno startst; 750sopno stopst; 751{ 752 register states st = m->st; 753 register states fresh = m->fresh; 754 register states tmp = m->tmp; 755 register char *p = start; 756 register int c = (start == m->beginp) ? OUT : *(start-1); 757 register int lastc; /* previous c */ 758 register int flagch; 759 register int i; 760 register char *coldp; /* last p after which no match was underway */ 761 762 CLEAR(st); 763 SET1(st, startst); 764 st = step(m->g, startst, stopst, st, NOTHING, st); 765 ASSIGN(fresh, st); 766 SP("start", st, *p); 767 coldp = NULL; 768 for (;;) { 769 /* next character */ 770 lastc = c; 771 c = (p == m->endp) ? OUT : *p; 772 if (EQ(st, fresh)) 773 coldp = p; 774 775 /* is there an EOL and/or BOL between lastc and c? */ 776 flagch = '\0'; 777 i = 0; 778 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 779 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 780 flagch = BOL; 781 i = m->g->nbol; 782 } 783 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 784 (c == OUT && !(m->eflags®_NOTEOL)) ) { 785 flagch = (flagch == BOL) ? BOLEOL : EOL; 786 i += m->g->neol; 787 } 788 if (i != 0) { 789 for (; i > 0; i--) 790 st = step(m->g, startst, stopst, st, flagch, st); 791 SP("boleol", st, c); 792 } 793 794 /* how about a word boundary? */ 795 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 796 (c != OUT && ISWORD(c)) ) { 797 flagch = BOW; 798 } 799 if ( (lastc != OUT && ISWORD(lastc)) && 800 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 801 flagch = EOW; 802 } 803 if (flagch == BOW || flagch == EOW) { 804 st = step(m->g, startst, stopst, st, flagch, st); 805 SP("boweow", st, c); 806 } 807 808 /* are we done? */ 809 if (ISSET(st, stopst) || p == stop) 810 break; /* NOTE BREAK OUT */ 811 812 /* no, we must deal with this character */ 813 ASSIGN(tmp, st); 814 ASSIGN(st, fresh); 815 assert(c != OUT); 816 st = step(m->g, startst, stopst, tmp, c, st); 817 SP("aft", st, c); 818 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 819 p++; 820 } 821 822 assert(coldp != NULL); 823 m->coldp = coldp; 824 if (ISSET(st, stopst)) 825 return(p+1); 826 else 827 return(NULL); 828} 829 830/* 831 - slow - step through the string more deliberately 832 == static char *slow(register struct match *m, char *start, \ 833 == char *stop, sopno startst, sopno stopst); 834 */ 835static char * /* where it ended */ 836slow(m, start, stop, startst, stopst) 837register struct match *m; 838char *start; 839char *stop; 840sopno startst; 841sopno stopst; 842{ 843 register states st = m->st; 844 register states empty = m->empty; 845 register states tmp = m->tmp; 846 register char *p = start; 847 register int c = (start == m->beginp) ? OUT : *(start-1); 848 register int lastc; /* previous c */ 849 register int flagch; 850 register int i; 851 register char *matchp; /* last p at which a match ended */ 852 853 AT("slow", start, stop, startst, stopst); 854 CLEAR(st); 855 SET1(st, startst); 856 SP("sstart", st, *p); 857 st = step(m->g, startst, stopst, st, NOTHING, st); 858 matchp = NULL; 859 for (;;) { 860 /* next character */ 861 lastc = c; 862 c = (p == m->endp) ? OUT : *p; 863 864 /* is there an EOL and/or BOL between lastc and c? */ 865 flagch = '\0'; 866 i = 0; 867 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 868 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 869 flagch = BOL; 870 i = m->g->nbol; 871 } 872 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 873 (c == OUT && !(m->eflags®_NOTEOL)) ) { 874 flagch = (flagch == BOL) ? BOLEOL : EOL; 875 i += m->g->neol; 876 } 877 if (i != 0) { 878 for (; i > 0; i--) 879 st = step(m->g, startst, stopst, st, flagch, st); 880 SP("sboleol", st, c); 881 } 882 883 /* how about a word boundary? */ 884 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 885 (c != OUT && ISWORD(c)) ) { 886 flagch = BOW; 887 } 888 if ( (lastc != OUT && ISWORD(lastc)) && 889 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 890 flagch = EOW; 891 } 892 if (flagch == BOW || flagch == EOW) { 893 st = step(m->g, startst, stopst, st, flagch, st); 894 SP("sboweow", st, c); 895 } 896 897 /* are we done? */ 898 if (ISSET(st, stopst)) 899 matchp = p; 900 if (EQ(st, empty) || p == stop) 901 break; /* NOTE BREAK OUT */ 902 903 /* no, we must deal with this character */ 904 ASSIGN(tmp, st); 905 ASSIGN(st, empty); 906 assert(c != OUT); 907 st = step(m->g, startst, stopst, tmp, c, st); 908 SP("saft", st, c); 909 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 910 p++; 911 } 912 913 return(matchp); 914} 915 916 917/* 918 - step - map set of states reachable before char to set reachable after 919 == static states step(register struct re_guts *g, sopno start, sopno stop, \ 920 == register states bef, int ch, register states aft); 921 == #define BOL (OUT+1) 922 == #define EOL (BOL+1) 923 == #define BOLEOL (BOL+2) 924 == #define NOTHING (BOL+3) 925 == #define BOW (BOL+4) 926 == #define EOW (BOL+5) 927 == #define CODEMAX (BOL+5) // highest code used 928 == #define NONCHAR(c) ((c) > CHAR_MAX) 929 == #define NNONCHAR (CODEMAX-CHAR_MAX) 930 */ 931static states 932step(g, start, stop, bef, ch, aft) 933register struct re_guts *g; 934sopno start; /* start state within strip */ 935sopno stop; /* state after stop state within strip */ 936register states bef; /* states reachable before */ 937int ch; /* character or NONCHAR code */ 938register states aft; /* states already known reachable after */ 939{ 940 register cset *cs; 941 register sop s; 942 register sopno pc; 943 register onestate here; /* note, macros know this name */ 944 register sopno look; 945 register int i; 946 947 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 948 s = g->strip[pc]; 949 switch (OP(s)) { 950 case OEND: 951 assert(pc == stop-1); 952 break; 953 case OCHAR: 954 /* only characters can match */ 955 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 956 if (ch == (char)OPND(s)) 957 FWD(aft, bef, 1); 958 break; 959 case OBOL: 960 if (ch == BOL || ch == BOLEOL) 961 FWD(aft, bef, 1); 962 break; 963 case OEOL: 964 if (ch == EOL || ch == BOLEOL) 965 FWD(aft, bef, 1); 966 break; 967 case OBOW: 968 if (ch == BOW) 969 FWD(aft, bef, 1); 970 break; 971 case OEOW: 972 if (ch == EOW) 973 FWD(aft, bef, 1); 974 break; 975 case OANY: 976 if (!NONCHAR(ch)) 977 FWD(aft, bef, 1); 978 break; 979 case OANYOF: 980 cs = &g->sets[OPND(s)]; 981 if (!NONCHAR(ch) && CHIN(cs, ch)) 982 FWD(aft, bef, 1); 983 break; 984 case OBACK_: /* ignored here */ 985 case O_BACK: 986 FWD(aft, aft, 1); 987 break; 988 case OPLUS_: /* forward, this is just an empty */ 989 FWD(aft, aft, 1); 990 break; 991 case O_PLUS: /* both forward and back */ 992 FWD(aft, aft, 1); 993 i = ISSETBACK(aft, OPND(s)); 994 BACK(aft, aft, OPND(s)); 995 if (!i && ISSETBACK(aft, OPND(s))) { 996 /* oho, must reconsider loop body */ 997 pc -= OPND(s) + 1; 998 INIT(here, pc); 999 } 1000 break; 1001 case OQUEST_: /* two branches, both forward */ 1002 FWD(aft, aft, 1); 1003 FWD(aft, aft, OPND(s)); 1004 break; 1005 case O_QUEST: /* just an empty */ 1006 FWD(aft, aft, 1); 1007 break; 1008 case OLPAREN: /* not significant here */ 1009 case ORPAREN: 1010 FWD(aft, aft, 1); 1011 break; 1012 case OCH_: /* mark the first two branches */ 1013 FWD(aft, aft, 1); 1014 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1015 FWD(aft, aft, OPND(s)); 1016 break; 1017 case OOR1: /* done a branch, find the O_CH */ 1018 if (ISSTATEIN(aft, here)) { 1019 for (look = 1; 1020 OP(s = g->strip[pc+look]) != O_CH; 1021 look += OPND(s)) 1022 assert(OP(s) == OOR2); 1023 FWD(aft, aft, look); 1024 } 1025 break; 1026 case OOR2: /* propagate OCH_'s marking */ 1027 FWD(aft, aft, 1); 1028 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1029 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1030 FWD(aft, aft, OPND(s)); 1031 } 1032 break; 1033 case O_CH: /* just empty */ 1034 FWD(aft, aft, 1); 1035 break; 1036 default: /* ooooops... */ 1037 assert(nope); 1038 break; 1039 } 1040 } 1041 1042 return(aft); 1043} 1044 1045#ifdef REDEBUG 1046/* 1047 - print - print a set of states 1048 == #ifdef REDEBUG 1049 == static void print(struct match *m, char *caption, states st, \ 1050 == int ch, FILE *d); 1051 == #endif 1052 */ 1053static void 1054print(m, caption, st, ch, d) 1055struct match *m; 1056char *caption; 1057states st; 1058int ch; 1059FILE *d; 1060{ 1061 register struct re_guts *g = m->g; 1062 register int i; 1063 register int first = 1; 1064 1065 if (!(m->eflags®_TRACE)) 1066 return; 1067 1068 fprintf(d, "%s", caption); 1069 if (ch != '\0') 1070 fprintf(d, " %s", pchar(ch)); 1071 for (i = 0; i < g->nstates; i++) 1072 if (ISSET(st, i)) { 1073 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1074 first = 0; 1075 } 1076 fprintf(d, "\n"); 1077} 1078 1079/* 1080 - at - print current situation 1081 == #ifdef REDEBUG 1082 == static void at(struct match *m, char *title, char *start, char *stop, \ 1083 == sopno startst, sopno stopst); 1084 == #endif 1085 */ 1086static void 1087at(m, title, start, stop, startst, stopst) 1088struct match *m; 1089char *title; 1090char *start; 1091char *stop; 1092sopno startst; 1093sopno stopst; 1094{ 1095 if (!(m->eflags®_TRACE)) 1096 return; 1097 1098 printf("%s %s-", title, pchar(*start)); 1099 printf("%s ", pchar(*stop)); 1100 printf("%ld-%ld\n", (long)startst, (long)stopst); 1101} 1102 1103#ifndef PCHARDONE 1104#define PCHARDONE /* never again */ 1105/* 1106 - pchar - make a character printable 1107 == #ifdef REDEBUG 1108 == static char *pchar(int ch); 1109 == #endif 1110 * 1111 * Is this identical to regchar() over in debug.c? Well, yes. But a 1112 * duplicate here avoids having a debugging-capable regexec.o tied to 1113 * a matching debug.o, and this is convenient. It all disappears in 1114 * the non-debug compilation anyway, so it doesn't matter much. 1115 */ 1116static char * /* -> representation */ 1117pchar(ch) 1118int ch; 1119{ 1120 static char pbuf[10]; 1121 1122 if (isprint((uch)ch) || ch == ' ') 1123 sprintf(pbuf, "%c", ch); 1124 else 1125 sprintf(pbuf, "\\%o", ch); 1126 return(pbuf); 1127} 1128#endif 1129#endif 1130 1131#undef matcher 1132#undef fast 1133#undef slow 1134#undef dissect 1135#undef backref 1136#undef step 1137#undef print 1138#undef at 1139#undef match 1140