engine.c revision 62391
1356290Sjkim/*- 2238405Sjkim * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3238405Sjkim * Copyright (c) 1992, 1993, 1994 4238405Sjkim * The Regents of the University of California. All rights reserved. 5238405Sjkim * 6238405Sjkim * This code is derived from software contributed to Berkeley by 7238405Sjkim * Henry Spencer. 8238405Sjkim * 9238405Sjkim * Redistribution and use in source and binary forms, with or without 10238405Sjkim * modification, are permitted provided that the following conditions 11238405Sjkim * are met: 12238405Sjkim * 1. Redistributions of source code must retain the above copyright 13238405Sjkim * notice, this list of conditions and the following disclaimer. 14238405Sjkim * 2. Redistributions in binary form must reproduce the above copyright 15238405Sjkim * notice, this list of conditions and the following disclaimer in the 16238405Sjkim * documentation and/or other materials provided with the distribution. 17238405Sjkim * 3. All advertising materials mentioning features or use of this software 18238405Sjkim * must display the following acknowledgement: 19238405Sjkim * This product includes software developed by the University of 20238405Sjkim * California, Berkeley and its contributors. 21238405Sjkim * 4. Neither the name of the University nor the names of its contributors 22238405Sjkim * may be used to endorse or promote products derived from this software 23238405Sjkim * without specific prior written permission. 24238405Sjkim * 25238405Sjkim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26238405Sjkim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27238405Sjkim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28238405Sjkim * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29238405Sjkim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30238405Sjkim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31238405Sjkim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32238405Sjkim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33238405Sjkim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34238405Sjkim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35238405Sjkim * SUCH DAMAGE. 36238405Sjkim * 37238405Sjkim * @(#)engine.c 8.5 (Berkeley) 3/20/94 38238405Sjkim * 39238405Sjkim * $FreeBSD: head/lib/libc/regex/engine.c 62391 2000-07-02 10:58:07Z dcs $ 40238405Sjkim */ 41276861Sjkim 42276861Sjkim/* 43238405Sjkim * The matching engine and friends. This file is #included by regexec.c 44238405Sjkim * after suitable #defines of a variety of macros used herein, so that 45238405Sjkim * different state representations can be used without duplicating masses 46238405Sjkim * of code. 47238405Sjkim */ 48238405Sjkim 49312826Sjkim#ifdef SNAMES 50238405Sjkim#define matcher smatcher 51238405Sjkim#define fast sfast 52238405Sjkim#define slow sslow 53276861Sjkim#define dissect sdissect 54276861Sjkim#define backref sbackref 55276861Sjkim#define step sstep 56238405Sjkim#define print sprint 57344604Sjkim#define at sat 58344604Sjkim#define match smat 59344604Sjkim#endif 60344604Sjkim#ifdef LNAMES 61344604Sjkim#define matcher lmatcher 62344604Sjkim#define fast lfast 63238405Sjkim#define slow lslow 64344604Sjkim#define dissect ldissect 65344604Sjkim#define backref lbackref 66344604Sjkim#define step lstep 67344604Sjkim#define print lprint 68276861Sjkim#define at lat 69238405Sjkim#define match lmat 70344604Sjkim#endif 71238405Sjkim 72238405Sjkim/* another structure passed up and down to avoid zillions of parameters */ 73238405Sjkimstruct match { 74238405Sjkim struct re_guts *g; 75238405Sjkim int eflags; 76238405Sjkim regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77238405Sjkim char *offp; /* offsets work from here */ 78238405Sjkim char *beginp; /* start of string -- virtual NUL precedes */ 79238405Sjkim char *endp; /* end of string -- virtual NUL here */ 80238405Sjkim char *coldp; /* can be no match starting before here */ 81238405Sjkim char **lastpos; /* [nplus+1] */ 82238405Sjkim STATEVARS; 83238405Sjkim states st; /* current states */ 84238405Sjkim states fresh; /* states for a fresh start */ 85238405Sjkim states tmp; /* temporary */ 86238405Sjkim states empty; /* empty set of states */ 87238405Sjkim}; 88238405Sjkim 89238405Sjkim/* ========= begin header generated by ./mkh ========= */ 90238405Sjkim#ifdef __cplusplus 91238405Sjkimextern "C" { 92238405Sjkim#endif 93238405Sjkim 94238405Sjkim/* === engine.c === */ 95238405Sjkimstatic int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 96238405Sjkimstatic char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97238405Sjkimstatic char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 98238405Sjkimstatic char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 99238405Sjkimstatic char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 100238405Sjkimstatic states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 101238405Sjkim#define BOL (OUT+1) 102238405Sjkim#define EOL (BOL+1) 103238405Sjkim#define BOLEOL (BOL+2) 104238405Sjkim#define NOTHING (BOL+3) 105238405Sjkim#define BOW (BOL+4) 106238405Sjkim#define EOW (BOL+5) 107238405Sjkim#define CODEMAX (BOL+5) /* highest code used */ 108238405Sjkim#define NONCHAR(c) ((c) > CHAR_MAX) 109238405Sjkim#define NNONCHAR (CODEMAX-CHAR_MAX) 110238405Sjkim#ifdef REDEBUG 111238405Sjkimstatic void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 112238405Sjkim#endif 113238405Sjkim#ifdef REDEBUG 114238405Sjkimstatic void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 115238405Sjkim#endif 116238405Sjkim#ifdef REDEBUG 117238405Sjkimstatic char *pchar __P((int ch)); 118238405Sjkim#endif 119238405Sjkim 120238405Sjkim#ifdef __cplusplus 121238405Sjkim} 122238405Sjkim#endif 123238405Sjkim/* ========= end header generated by ./mkh ========= */ 124238405Sjkim 125238405Sjkim#ifdef REDEBUG 126238405Sjkim#define SP(t, s, c) print(m, t, s, c, stdout) 127238405Sjkim#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 128238405Sjkim#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 129238405Sjkim#else 130238405Sjkim#define SP(t, s, c) /* nothing */ 131238405Sjkim#define AT(t, p1, p2, s1, s2) /* nothing */ 132238405Sjkim#define NOTE(s) /* nothing */ 133238405Sjkim#endif 134238405Sjkim 135238405Sjkim/* 136356290Sjkim - matcher - the actual matching engine 137238405Sjkim == static int matcher(register struct re_guts *g, char *string, \ 138238405Sjkim == size_t nmatch, regmatch_t pmatch[], int eflags); 139238405Sjkim */ 140238405Sjkimstatic int /* 0 success, REG_NOMATCH failure */ 141238405Sjkimmatcher(g, string, nmatch, pmatch, eflags) 142238405Sjkimregister struct re_guts *g; 143238405Sjkimchar *string; 144238405Sjkimsize_t nmatch; 145238405Sjkimregmatch_t pmatch[]; 146238405Sjkimint eflags; 147238405Sjkim{ 148238405Sjkim register char *endp; 149238405Sjkim register int i; 150238405Sjkim struct match mv; 151238405Sjkim register struct match *m = &mv; 152238405Sjkim register char *dp; 153238405Sjkim register const sopno gf = g->firststate+1; /* +1 for OEND */ 154238405Sjkim register const sopno gl = g->laststate; 155344604Sjkim char *start; 156238405Sjkim char *stop; 157238405Sjkim /* Boyer-Moore algorithms variables */ 158344604Sjkim register unsigned char *pp; 159238405Sjkim int cj, mj; 160238405Sjkim register unsigned char *mustfirst; 161238405Sjkim register unsigned char *mustlast; 162238405Sjkim register int mustlen; 163238405Sjkim register int *matchjump; 164238405Sjkim register int *charjump; 165238405Sjkim register unsigned char *bmp; 166238405Sjkim register unsigned char *bmps; 167344604Sjkim 168238405Sjkim /* simplify the situation where possible */ 169238405Sjkim if (g->cflags®_NOSUB) 170238405Sjkim nmatch = 0; 171344604Sjkim if (eflags®_STARTEND) { 172238405Sjkim start = string + pmatch[0].rm_so; 173238405Sjkim stop = string + pmatch[0].rm_eo; 174238405Sjkim } else { 175344604Sjkim start = string; 176238405Sjkim stop = start + strlen(start); 177238405Sjkim } 178238405Sjkim if (stop < start) 179238405Sjkim return(REG_INVARG); 180337982Sjkim 181238405Sjkim /* prescreening; this does wonders for this rather slow code */ 182238405Sjkim if (g->must != NULL) { 183238405Sjkim if (g->charjump != NULL && g->matchjump != NULL) { 184238405Sjkim mustlen = -g->mlen; 185238405Sjkim mustfirst = g->must; 186238405Sjkim mustlast = g->must + g->mlen - 1; 187238405Sjkim charjump = g->charjump; 188238405Sjkim matchjump = g->matchjump; 189238405Sjkim bmps = stop; 190238405Sjkim pp = mustlast; 191238405Sjkim for (bmp = start+g->mlen-1; bmp < bmps;) { 192238405Sjkim /* Fast skip non-matches */ 193238405Sjkim while (bmp < bmps && charjump[*bmp]) 194238405Sjkim bmp += charjump[*bmp]; 195238405Sjkim 196238405Sjkim if (bmp >= bmps) 197238405Sjkim break; 198238405Sjkim 199238405Sjkim /* Greedy matcher */ 200238405Sjkim /* We depend on not being used for 201238405Sjkim * for strings of length 1 202238405Sjkim */ 203238405Sjkim while (*--bmp == *--pp && pp != mustfirst); 204238405Sjkim 205238405Sjkim if (*bmp == *pp) 206238405Sjkim break; 207238405Sjkim 208238405Sjkim /* Jump to next possible match */ 209238405Sjkim mj = matchjump[pp - mustfirst]; 210238405Sjkim cj = charjump[*bmp]; 211238405Sjkim bmp += (cj < mj ? mj : cj); 212238405Sjkim pp = mustlast; 213238405Sjkim } 214238405Sjkim if (pp != mustfirst) 215238405Sjkim return(REG_NOMATCH); 216238405Sjkim dp = bmp; 217344604Sjkim } else { 218344604Sjkim for (dp = start; dp < stop; dp++) 219344604Sjkim if (*dp == g->must[0] && 220344604Sjkim stop - dp >= g->mlen && 221344604Sjkim memcmp(dp, g->must, (size_t)g->mlen) == 0) 222344604Sjkim break; 223238405Sjkim if (dp == stop) /* we didn't find g->must */ 224238405Sjkim return(REG_NOMATCH); 225238405Sjkim } 226 } 227 228 /* match struct setup */ 229 m->g = g; 230 m->eflags = eflags; 231 m->pmatch = NULL; 232 m->lastpos = NULL; 233 m->offp = string; 234 m->beginp = start; 235 m->endp = stop; 236 STATESETUP(m, 4); 237 SETUP(m->st); 238 SETUP(m->fresh); 239 SETUP(m->tmp); 240 SETUP(m->empty); 241 CLEAR(m->empty); 242 243 /* Adjust start according to moffset, to speed things up */ 244 if (g->moffset > -1) 245 start = dp - g->moffset; 246 247 /* this loop does only one repetition except for backrefs */ 248 for (;;) { 249 endp = fast(m, start, stop, gf, gl); 250 if (endp == NULL) { /* a miss */ 251 STATETEARDOWN(m); 252 return(REG_NOMATCH); 253 } 254 if (nmatch == 0 && !g->backrefs) 255 break; /* no further info needed */ 256 257 /* where? */ 258 assert(m->coldp != NULL); 259 for (;;) { 260 NOTE("finding start"); 261 endp = slow(m, m->coldp, stop, gf, gl); 262 if (endp != NULL) 263 break; 264 assert(m->coldp < m->endp); 265 m->coldp++; 266 } 267 if (nmatch == 1 && !g->backrefs) 268 break; /* no further info needed */ 269 270 /* oh my, he wants the subexpressions... */ 271 if (m->pmatch == NULL) 272 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 273 sizeof(regmatch_t)); 274 if (m->pmatch == NULL) { 275 STATETEARDOWN(m); 276 return(REG_ESPACE); 277 } 278 for (i = 1; i <= m->g->nsub; i++) 279 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 280 if (!g->backrefs && !(m->eflags®_BACKR)) { 281 NOTE("dissecting"); 282 dp = dissect(m, m->coldp, endp, gf, gl); 283 } else { 284 if (g->nplus > 0 && m->lastpos == NULL) 285 m->lastpos = (char **)malloc((g->nplus+1) * 286 sizeof(char *)); 287 if (g->nplus > 0 && m->lastpos == NULL) { 288 free(m->pmatch); 289 STATETEARDOWN(m); 290 return(REG_ESPACE); 291 } 292 NOTE("backref dissect"); 293 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 294 } 295 if (dp != NULL) 296 break; 297 298 /* uh-oh... we couldn't find a subexpression-level match */ 299 assert(g->backrefs); /* must be back references doing it */ 300 assert(g->nplus == 0 || m->lastpos != NULL); 301 for (;;) { 302 if (dp != NULL || endp <= m->coldp) 303 break; /* defeat */ 304 NOTE("backoff"); 305 endp = slow(m, m->coldp, endp-1, gf, gl); 306 if (endp == NULL) 307 break; /* defeat */ 308 /* try it on a shorter possibility */ 309#ifndef NDEBUG 310 for (i = 1; i <= m->g->nsub; i++) { 311 assert(m->pmatch[i].rm_so == -1); 312 assert(m->pmatch[i].rm_eo == -1); 313 } 314#endif 315 NOTE("backoff dissect"); 316 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 317 } 318 assert(dp == NULL || dp == endp); 319 if (dp != NULL) /* found a shorter one */ 320 break; 321 322 /* despite initial appearances, there is no match here */ 323 NOTE("false alarm"); 324 start = m->coldp + 1; /* recycle starting later */ 325 assert(start <= stop); 326 } 327 328 /* fill in the details if requested */ 329 if (nmatch > 0) { 330 pmatch[0].rm_so = m->coldp - m->offp; 331 pmatch[0].rm_eo = endp - m->offp; 332 } 333 if (nmatch > 1) { 334 assert(m->pmatch != NULL); 335 for (i = 1; i < nmatch; i++) 336 if (i <= m->g->nsub) 337 pmatch[i] = m->pmatch[i]; 338 else { 339 pmatch[i].rm_so = -1; 340 pmatch[i].rm_eo = -1; 341 } 342 } 343 344 if (m->pmatch != NULL) 345 free((char *)m->pmatch); 346 if (m->lastpos != NULL) 347 free((char *)m->lastpos); 348 STATETEARDOWN(m); 349 return(0); 350} 351 352/* 353 - dissect - figure out what matched what, no back references 354 == static char *dissect(register struct match *m, char *start, \ 355 == char *stop, sopno startst, sopno stopst); 356 */ 357static char * /* == stop (success) always */ 358dissect(m, start, stop, startst, stopst) 359register struct match *m; 360char *start; 361char *stop; 362sopno startst; 363sopno stopst; 364{ 365 register int i; 366 register sopno ss; /* start sop of current subRE */ 367 register sopno es; /* end sop of current subRE */ 368 register char *sp; /* start of string matched by it */ 369 register char *stp; /* string matched by it cannot pass here */ 370 register char *rest; /* start of rest of string */ 371 register char *tail; /* string unmatched by rest of RE */ 372 register sopno ssub; /* start sop of subsubRE */ 373 register sopno esub; /* end sop of subsubRE */ 374 register char *ssp; /* start of string matched by subsubRE */ 375 register char *sep; /* end of string matched by subsubRE */ 376 register char *oldssp; /* previous ssp */ 377 register char *dp; 378 379 AT("diss", start, stop, startst, stopst); 380 sp = start; 381 for (ss = startst; ss < stopst; ss = es) { 382 /* identify end of subRE */ 383 es = ss; 384 switch (OP(m->g->strip[es])) { 385 case OPLUS_: 386 case OQUEST_: 387 es += OPND(m->g->strip[es]); 388 break; 389 case OCH_: 390 while (OP(m->g->strip[es]) != O_CH) 391 es += OPND(m->g->strip[es]); 392 break; 393 } 394 es++; 395 396 /* figure out what it matched */ 397 switch (OP(m->g->strip[ss])) { 398 case OEND: 399 assert(nope); 400 break; 401 case OCHAR: 402 sp++; 403 break; 404 case OBOL: 405 case OEOL: 406 case OBOW: 407 case OEOW: 408 break; 409 case OANY: 410 case OANYOF: 411 sp++; 412 break; 413 case OBACK_: 414 case O_BACK: 415 assert(nope); 416 break; 417 /* cases where length of match is hard to find */ 418 case OQUEST_: 419 stp = stop; 420 for (;;) { 421 /* how long could this one be? */ 422 rest = slow(m, sp, stp, ss, es); 423 assert(rest != NULL); /* it did match */ 424 /* could the rest match the rest? */ 425 tail = slow(m, rest, stop, es, stopst); 426 if (tail == stop) 427 break; /* yes! */ 428 /* no -- try a shorter match for this one */ 429 stp = rest - 1; 430 assert(stp >= sp); /* it did work */ 431 } 432 ssub = ss + 1; 433 esub = es - 1; 434 /* did innards match? */ 435 if (slow(m, sp, rest, ssub, esub) != NULL) { 436 dp = dissect(m, sp, rest, ssub, esub); 437 assert(dp == rest); 438 } else /* no */ 439 assert(sp == rest); 440 sp = rest; 441 break; 442 case OPLUS_: 443 stp = stop; 444 for (;;) { 445 /* how long could this one be? */ 446 rest = slow(m, sp, stp, ss, es); 447 assert(rest != NULL); /* it did match */ 448 /* could the rest match the rest? */ 449 tail = slow(m, rest, stop, es, stopst); 450 if (tail == stop) 451 break; /* yes! */ 452 /* no -- try a shorter match for this one */ 453 stp = rest - 1; 454 assert(stp >= sp); /* it did work */ 455 } 456 ssub = ss + 1; 457 esub = es - 1; 458 ssp = sp; 459 oldssp = ssp; 460 for (;;) { /* find last match of innards */ 461 sep = slow(m, ssp, rest, ssub, esub); 462 if (sep == NULL || sep == ssp) 463 break; /* failed or matched null */ 464 oldssp = ssp; /* on to next try */ 465 ssp = sep; 466 } 467 if (sep == NULL) { 468 /* last successful match */ 469 sep = ssp; 470 ssp = oldssp; 471 } 472 assert(sep == rest); /* must exhaust substring */ 473 assert(slow(m, ssp, sep, ssub, esub) == rest); 474 dp = dissect(m, ssp, sep, ssub, esub); 475 assert(dp == sep); 476 sp = rest; 477 break; 478 case OCH_: 479 stp = stop; 480 for (;;) { 481 /* how long could this one be? */ 482 rest = slow(m, sp, stp, ss, es); 483 assert(rest != NULL); /* it did match */ 484 /* could the rest match the rest? */ 485 tail = slow(m, rest, stop, es, stopst); 486 if (tail == stop) 487 break; /* yes! */ 488 /* no -- try a shorter match for this one */ 489 stp = rest - 1; 490 assert(stp >= sp); /* it did work */ 491 } 492 ssub = ss + 1; 493 esub = ss + OPND(m->g->strip[ss]) - 1; 494 assert(OP(m->g->strip[esub]) == OOR1); 495 for (;;) { /* find first matching branch */ 496 if (slow(m, sp, rest, ssub, esub) == rest) 497 break; /* it matched all of it */ 498 /* that one missed, try next one */ 499 assert(OP(m->g->strip[esub]) == OOR1); 500 esub++; 501 assert(OP(m->g->strip[esub]) == OOR2); 502 ssub = esub + 1; 503 esub += OPND(m->g->strip[esub]); 504 if (OP(m->g->strip[esub]) == OOR2) 505 esub--; 506 else 507 assert(OP(m->g->strip[esub]) == O_CH); 508 } 509 dp = dissect(m, sp, rest, ssub, esub); 510 assert(dp == rest); 511 sp = rest; 512 break; 513 case O_PLUS: 514 case O_QUEST: 515 case OOR1: 516 case OOR2: 517 case O_CH: 518 assert(nope); 519 break; 520 case OLPAREN: 521 i = OPND(m->g->strip[ss]); 522 assert(0 < i && i <= m->g->nsub); 523 m->pmatch[i].rm_so = sp - m->offp; 524 break; 525 case ORPAREN: 526 i = OPND(m->g->strip[ss]); 527 assert(0 < i && i <= m->g->nsub); 528 m->pmatch[i].rm_eo = sp - m->offp; 529 break; 530 default: /* uh oh */ 531 assert(nope); 532 break; 533 } 534 } 535 536 assert(sp == stop); 537 return(sp); 538} 539 540/* 541 - backref - figure out what matched what, figuring in back references 542 == static char *backref(register struct match *m, char *start, \ 543 == char *stop, sopno startst, sopno stopst, sopno lev); 544 */ 545static char * /* == stop (success) or NULL (failure) */ 546backref(m, start, stop, startst, stopst, lev) 547register struct match *m; 548char *start; 549char *stop; 550sopno startst; 551sopno stopst; 552sopno lev; /* PLUS nesting level */ 553{ 554 register int i; 555 register sopno ss; /* start sop of current subRE */ 556 register char *sp; /* start of string matched by it */ 557 register sopno ssub; /* start sop of subsubRE */ 558 register sopno esub; /* end sop of subsubRE */ 559 register char *ssp; /* start of string matched by subsubRE */ 560 register char *dp; 561 register size_t len; 562 register int hard; 563 register sop s; 564 register regoff_t offsave; 565 register cset *cs; 566 567 AT("back", start, stop, startst, stopst); 568 sp = start; 569 570 /* get as far as we can with easy stuff */ 571 hard = 0; 572 for (ss = startst; !hard && ss < stopst; ss++) 573 switch (OP(s = m->g->strip[ss])) { 574 case OCHAR: 575 if (sp == stop || *sp++ != (char)OPND(s)) 576 return(NULL); 577 break; 578 case OANY: 579 if (sp == stop) 580 return(NULL); 581 sp++; 582 break; 583 case OANYOF: 584 cs = &m->g->sets[OPND(s)]; 585 if (sp == stop || !CHIN(cs, *sp++)) 586 return(NULL); 587 break; 588 case OBOL: 589 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 590 (sp < m->endp && *(sp-1) == '\n' && 591 (m->g->cflags®_NEWLINE)) ) 592 { /* yes */ } 593 else 594 return(NULL); 595 break; 596 case OEOL: 597 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 598 (sp < m->endp && *sp == '\n' && 599 (m->g->cflags®_NEWLINE)) ) 600 { /* yes */ } 601 else 602 return(NULL); 603 break; 604 case OBOW: 605 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 606 (sp < m->endp && *(sp-1) == '\n' && 607 (m->g->cflags®_NEWLINE)) || 608 (sp > m->beginp && 609 !ISWORD(*(sp-1))) ) && 610 (sp < m->endp && ISWORD(*sp)) ) 611 { /* yes */ } 612 else 613 return(NULL); 614 break; 615 case OEOW: 616 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 617 (sp < m->endp && *sp == '\n' && 618 (m->g->cflags®_NEWLINE)) || 619 (sp < m->endp && !ISWORD(*sp)) ) && 620 (sp > m->beginp && ISWORD(*(sp-1))) ) 621 { /* yes */ } 622 else 623 return(NULL); 624 break; 625 case O_QUEST: 626 break; 627 case OOR1: /* matches null but needs to skip */ 628 ss++; 629 s = m->g->strip[ss]; 630 do { 631 assert(OP(s) == OOR2); 632 ss += OPND(s); 633 } while (OP(s = m->g->strip[ss]) != O_CH); 634 /* note that the ss++ gets us past the O_CH */ 635 break; 636 default: /* have to make a choice */ 637 hard = 1; 638 break; 639 } 640 if (!hard) { /* that was it! */ 641 if (sp != stop) 642 return(NULL); 643 return(sp); 644 } 645 ss--; /* adjust for the for's final increment */ 646 647 /* the hard stuff */ 648 AT("hard", sp, stop, ss, stopst); 649 s = m->g->strip[ss]; 650 switch (OP(s)) { 651 case OBACK_: /* the vilest depths */ 652 i = OPND(s); 653 assert(0 < i && i <= m->g->nsub); 654 if (m->pmatch[i].rm_eo == -1) 655 return(NULL); 656 assert(m->pmatch[i].rm_so != -1); 657 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 658 assert(stop - m->beginp >= len); 659 if (sp > stop - len) 660 return(NULL); /* not enough left to match */ 661 ssp = m->offp + m->pmatch[i].rm_so; 662 if (memcmp(sp, ssp, len) != 0) 663 return(NULL); 664 while (m->g->strip[ss] != SOP(O_BACK, i)) 665 ss++; 666 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 667 break; 668 case OQUEST_: /* to null or not */ 669 dp = backref(m, sp, stop, ss+1, stopst, lev); 670 if (dp != NULL) 671 return(dp); /* not */ 672 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 673 break; 674 case OPLUS_: 675 assert(m->lastpos != NULL); 676 assert(lev+1 <= m->g->nplus); 677 m->lastpos[lev+1] = sp; 678 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 679 break; 680 case O_PLUS: 681 if (sp == m->lastpos[lev]) /* last pass matched null */ 682 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 683 /* try another pass */ 684 m->lastpos[lev] = sp; 685 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 686 if (dp == NULL) 687 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 688 else 689 return(dp); 690 break; 691 case OCH_: /* find the right one, if any */ 692 ssub = ss + 1; 693 esub = ss + OPND(s) - 1; 694 assert(OP(m->g->strip[esub]) == OOR1); 695 for (;;) { /* find first matching branch */ 696 dp = backref(m, sp, stop, ssub, esub, lev); 697 if (dp != NULL) 698 return(dp); 699 /* that one missed, try next one */ 700 if (OP(m->g->strip[esub]) == O_CH) 701 return(NULL); /* there is none */ 702 esub++; 703 assert(OP(m->g->strip[esub]) == OOR2); 704 ssub = esub + 1; 705 esub += OPND(m->g->strip[esub]); 706 if (OP(m->g->strip[esub]) == OOR2) 707 esub--; 708 else 709 assert(OP(m->g->strip[esub]) == O_CH); 710 } 711 break; 712 case OLPAREN: /* must undo assignment if rest fails */ 713 i = OPND(s); 714 assert(0 < i && i <= m->g->nsub); 715 offsave = m->pmatch[i].rm_so; 716 m->pmatch[i].rm_so = sp - m->offp; 717 dp = backref(m, sp, stop, ss+1, stopst, lev); 718 if (dp != NULL) 719 return(dp); 720 m->pmatch[i].rm_so = offsave; 721 return(NULL); 722 break; 723 case ORPAREN: /* must undo assignment if rest fails */ 724 i = OPND(s); 725 assert(0 < i && i <= m->g->nsub); 726 offsave = m->pmatch[i].rm_eo; 727 m->pmatch[i].rm_eo = sp - m->offp; 728 dp = backref(m, sp, stop, ss+1, stopst, lev); 729 if (dp != NULL) 730 return(dp); 731 m->pmatch[i].rm_eo = offsave; 732 return(NULL); 733 break; 734 default: /* uh oh */ 735 assert(nope); 736 break; 737 } 738 739 /* "can't happen" */ 740 assert(nope); 741 /* NOTREACHED */ 742 return "shut up gcc"; 743} 744 745/* 746 - fast - step through the string at top speed 747 == static char *fast(register struct match *m, char *start, \ 748 == char *stop, sopno startst, sopno stopst); 749 */ 750static char * /* where tentative match ended, or NULL */ 751fast(m, start, stop, startst, stopst) 752register struct match *m; 753char *start; 754char *stop; 755sopno startst; 756sopno stopst; 757{ 758 register states st = m->st; 759 register states fresh = m->fresh; 760 register states tmp = m->tmp; 761 register char *p = start; 762 register int c = (start == m->beginp) ? OUT : *(start-1); 763 register int lastc; /* previous c */ 764 register int flagch; 765 register int i; 766 register char *coldp; /* last p after which no match was underway */ 767 768 CLEAR(st); 769 SET1(st, startst); 770 st = step(m->g, startst, stopst, st, NOTHING, st); 771 ASSIGN(fresh, st); 772 SP("start", st, *p); 773 coldp = NULL; 774 for (;;) { 775 /* next character */ 776 lastc = c; 777 c = (p == m->endp) ? OUT : *p; 778 if (EQ(st, fresh)) 779 coldp = p; 780 781 /* is there an EOL and/or BOL between lastc and c? */ 782 flagch = '\0'; 783 i = 0; 784 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 785 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 786 flagch = BOL; 787 i = m->g->nbol; 788 } 789 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 790 (c == OUT && !(m->eflags®_NOTEOL)) ) { 791 flagch = (flagch == BOL) ? BOLEOL : EOL; 792 i += m->g->neol; 793 } 794 if (i != 0) { 795 for (; i > 0; i--) 796 st = step(m->g, startst, stopst, st, flagch, st); 797 SP("boleol", st, c); 798 } 799 800 /* how about a word boundary? */ 801 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 802 (c != OUT && ISWORD(c)) ) { 803 flagch = BOW; 804 } 805 if ( (lastc != OUT && ISWORD(lastc)) && 806 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 807 flagch = EOW; 808 } 809 if (flagch == BOW || flagch == EOW) { 810 st = step(m->g, startst, stopst, st, flagch, st); 811 SP("boweow", st, c); 812 } 813 814 /* are we done? */ 815 if (ISSET(st, stopst) || p == stop) 816 break; /* NOTE BREAK OUT */ 817 818 /* no, we must deal with this character */ 819 ASSIGN(tmp, st); 820 ASSIGN(st, fresh); 821 assert(c != OUT); 822 st = step(m->g, startst, stopst, tmp, c, st); 823 SP("aft", st, c); 824 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 825 p++; 826 } 827 828 assert(coldp != NULL); 829 m->coldp = coldp; 830 if (ISSET(st, stopst)) 831 return(p+1); 832 else 833 return(NULL); 834} 835 836/* 837 - slow - step through the string more deliberately 838 == static char *slow(register struct match *m, char *start, \ 839 == char *stop, sopno startst, sopno stopst); 840 */ 841static char * /* where it ended */ 842slow(m, start, stop, startst, stopst) 843register struct match *m; 844char *start; 845char *stop; 846sopno startst; 847sopno stopst; 848{ 849 register states st = m->st; 850 register states empty = m->empty; 851 register states tmp = m->tmp; 852 register char *p = start; 853 register int c = (start == m->beginp) ? OUT : *(start-1); 854 register int lastc; /* previous c */ 855 register int flagch; 856 register int i; 857 register char *matchp; /* last p at which a match ended */ 858 859 AT("slow", start, stop, startst, stopst); 860 CLEAR(st); 861 SET1(st, startst); 862 SP("sstart", st, *p); 863 st = step(m->g, startst, stopst, st, NOTHING, st); 864 matchp = NULL; 865 for (;;) { 866 /* next character */ 867 lastc = c; 868 c = (p == m->endp) ? OUT : *p; 869 870 /* is there an EOL and/or BOL between lastc and c? */ 871 flagch = '\0'; 872 i = 0; 873 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 874 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 875 flagch = BOL; 876 i = m->g->nbol; 877 } 878 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 879 (c == OUT && !(m->eflags®_NOTEOL)) ) { 880 flagch = (flagch == BOL) ? BOLEOL : EOL; 881 i += m->g->neol; 882 } 883 if (i != 0) { 884 for (; i > 0; i--) 885 st = step(m->g, startst, stopst, st, flagch, st); 886 SP("sboleol", st, c); 887 } 888 889 /* how about a word boundary? */ 890 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 891 (c != OUT && ISWORD(c)) ) { 892 flagch = BOW; 893 } 894 if ( (lastc != OUT && ISWORD(lastc)) && 895 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 896 flagch = EOW; 897 } 898 if (flagch == BOW || flagch == EOW) { 899 st = step(m->g, startst, stopst, st, flagch, st); 900 SP("sboweow", st, c); 901 } 902 903 /* are we done? */ 904 if (ISSET(st, stopst)) 905 matchp = p; 906 if (EQ(st, empty) || p == stop) 907 break; /* NOTE BREAK OUT */ 908 909 /* no, we must deal with this character */ 910 ASSIGN(tmp, st); 911 ASSIGN(st, empty); 912 assert(c != OUT); 913 st = step(m->g, startst, stopst, tmp, c, st); 914 SP("saft", st, c); 915 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 916 p++; 917 } 918 919 return(matchp); 920} 921 922 923/* 924 - step - map set of states reachable before char to set reachable after 925 == static states step(register struct re_guts *g, sopno start, sopno stop, \ 926 == register states bef, int ch, register states aft); 927 == #define BOL (OUT+1) 928 == #define EOL (BOL+1) 929 == #define BOLEOL (BOL+2) 930 == #define NOTHING (BOL+3) 931 == #define BOW (BOL+4) 932 == #define EOW (BOL+5) 933 == #define CODEMAX (BOL+5) // highest code used 934 == #define NONCHAR(c) ((c) > CHAR_MAX) 935 == #define NNONCHAR (CODEMAX-CHAR_MAX) 936 */ 937static states 938step(g, start, stop, bef, ch, aft) 939register struct re_guts *g; 940sopno start; /* start state within strip */ 941sopno stop; /* state after stop state within strip */ 942register states bef; /* states reachable before */ 943int ch; /* character or NONCHAR code */ 944register states aft; /* states already known reachable after */ 945{ 946 register cset *cs; 947 register sop s; 948 register sopno pc; 949 register onestate here; /* note, macros know this name */ 950 register sopno look; 951 register int i; 952 953 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 954 s = g->strip[pc]; 955 switch (OP(s)) { 956 case OEND: 957 assert(pc == stop-1); 958 break; 959 case OCHAR: 960 /* only characters can match */ 961 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 962 if (ch == (char)OPND(s)) 963 FWD(aft, bef, 1); 964 break; 965 case OBOL: 966 if (ch == BOL || ch == BOLEOL) 967 FWD(aft, bef, 1); 968 break; 969 case OEOL: 970 if (ch == EOL || ch == BOLEOL) 971 FWD(aft, bef, 1); 972 break; 973 case OBOW: 974 if (ch == BOW) 975 FWD(aft, bef, 1); 976 break; 977 case OEOW: 978 if (ch == EOW) 979 FWD(aft, bef, 1); 980 break; 981 case OANY: 982 if (!NONCHAR(ch)) 983 FWD(aft, bef, 1); 984 break; 985 case OANYOF: 986 cs = &g->sets[OPND(s)]; 987 if (!NONCHAR(ch) && CHIN(cs, ch)) 988 FWD(aft, bef, 1); 989 break; 990 case OBACK_: /* ignored here */ 991 case O_BACK: 992 FWD(aft, aft, 1); 993 break; 994 case OPLUS_: /* forward, this is just an empty */ 995 FWD(aft, aft, 1); 996 break; 997 case O_PLUS: /* both forward and back */ 998 FWD(aft, aft, 1); 999 i = ISSETBACK(aft, OPND(s)); 1000 BACK(aft, aft, OPND(s)); 1001 if (!i && ISSETBACK(aft, OPND(s))) { 1002 /* oho, must reconsider loop body */ 1003 pc -= OPND(s) + 1; 1004 INIT(here, pc); 1005 } 1006 break; 1007 case OQUEST_: /* two branches, both forward */ 1008 FWD(aft, aft, 1); 1009 FWD(aft, aft, OPND(s)); 1010 break; 1011 case O_QUEST: /* just an empty */ 1012 FWD(aft, aft, 1); 1013 break; 1014 case OLPAREN: /* not significant here */ 1015 case ORPAREN: 1016 FWD(aft, aft, 1); 1017 break; 1018 case OCH_: /* mark the first two branches */ 1019 FWD(aft, aft, 1); 1020 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1021 FWD(aft, aft, OPND(s)); 1022 break; 1023 case OOR1: /* done a branch, find the O_CH */ 1024 if (ISSTATEIN(aft, here)) { 1025 for (look = 1; 1026 OP(s = g->strip[pc+look]) != O_CH; 1027 look += OPND(s)) 1028 assert(OP(s) == OOR2); 1029 FWD(aft, aft, look); 1030 } 1031 break; 1032 case OOR2: /* propagate OCH_'s marking */ 1033 FWD(aft, aft, 1); 1034 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1035 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1036 FWD(aft, aft, OPND(s)); 1037 } 1038 break; 1039 case O_CH: /* just empty */ 1040 FWD(aft, aft, 1); 1041 break; 1042 default: /* ooooops... */ 1043 assert(nope); 1044 break; 1045 } 1046 } 1047 1048 return(aft); 1049} 1050 1051#ifdef REDEBUG 1052/* 1053 - print - print a set of states 1054 == #ifdef REDEBUG 1055 == static void print(struct match *m, char *caption, states st, \ 1056 == int ch, FILE *d); 1057 == #endif 1058 */ 1059static void 1060print(m, caption, st, ch, d) 1061struct match *m; 1062char *caption; 1063states st; 1064int ch; 1065FILE *d; 1066{ 1067 register struct re_guts *g = m->g; 1068 register int i; 1069 register int first = 1; 1070 1071 if (!(m->eflags®_TRACE)) 1072 return; 1073 1074 fprintf(d, "%s", caption); 1075 if (ch != '\0') 1076 fprintf(d, " %s", pchar(ch)); 1077 for (i = 0; i < g->nstates; i++) 1078 if (ISSET(st, i)) { 1079 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1080 first = 0; 1081 } 1082 fprintf(d, "\n"); 1083} 1084 1085/* 1086 - at - print current situation 1087 == #ifdef REDEBUG 1088 == static void at(struct match *m, char *title, char *start, char *stop, \ 1089 == sopno startst, sopno stopst); 1090 == #endif 1091 */ 1092static void 1093at(m, title, start, stop, startst, stopst) 1094struct match *m; 1095char *title; 1096char *start; 1097char *stop; 1098sopno startst; 1099sopno stopst; 1100{ 1101 if (!(m->eflags®_TRACE)) 1102 return; 1103 1104 printf("%s %s-", title, pchar(*start)); 1105 printf("%s ", pchar(*stop)); 1106 printf("%ld-%ld\n", (long)startst, (long)stopst); 1107} 1108 1109#ifndef PCHARDONE 1110#define PCHARDONE /* never again */ 1111/* 1112 - pchar - make a character printable 1113 == #ifdef REDEBUG 1114 == static char *pchar(int ch); 1115 == #endif 1116 * 1117 * Is this identical to regchar() over in debug.c? Well, yes. But a 1118 * duplicate here avoids having a debugging-capable regexec.o tied to 1119 * a matching debug.o, and this is convenient. It all disappears in 1120 * the non-debug compilation anyway, so it doesn't matter much. 1121 */ 1122static char * /* -> representation */ 1123pchar(ch) 1124int ch; 1125{ 1126 static char pbuf[10]; 1127 1128 if (isprint((uch)ch) || ch == ' ') 1129 sprintf(pbuf, "%c", ch); 1130 else 1131 sprintf(pbuf, "\\%o", ch); 1132 return(pbuf); 1133} 1134#endif 1135#endif 1136 1137#undef matcher 1138#undef fast 1139#undef slow 1140#undef dissect 1141#undef backref 1142#undef step 1143#undef print 1144#undef at 1145#undef match 1146