1/* $NetBSD: engine.c,v 1.30 2024/01/23 15:32:54 christos Exp $ */ 2 3/*- 4 * SPDX-License-Identifier: BSD-3-Clause 5 * 6 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 * Copyright (c) 1992, 1993, 1994 8 * The Regents of the University of California. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Henry Spencer. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)engine.c 8.5 (Berkeley) 3/20/94 38 */ 39 40#include <sys/cdefs.h> 41#ifdef __FBSDID 42__FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 368358 2020-12-05 03:16:05Z kevans $"); 43#endif 44__RCSID("$NetBSD: engine.c,v 1.30 2024/01/23 15:32:54 christos Exp $"); 45 46#include <stdbool.h> 47 48/* 49 * The matching engine and friends. This file is #included by regexec.c 50 * after suitable #defines of a variety of macros used herein, so that 51 * different state representations can be used without duplicating masses 52 * of code. 53 */ 54 55#ifdef SNAMES 56#define stepback sstepback 57#define matcher smatcher 58#define walk swalk 59#define dissect sdissect 60#define backref sbackref 61#define step sstep 62#define print sprint 63#define at sat 64#define match smat 65#endif 66#ifdef LNAMES 67#define stepback lstepback 68#define matcher lmatcher 69#define walk lwalk 70#define dissect ldissect 71#define backref lbackref 72#define step lstep 73#define print lprint 74#define at lat 75#define match lmat 76#endif 77#ifdef MNAMES 78#define stepback mstepback 79#define matcher mmatcher 80#define walk mwalk 81#define dissect mdissect 82#define backref mbackref 83#define step mstep 84#define print mprint 85#define at mat 86#define match mmat 87#endif 88 89/* another structure passed up and down to avoid zillions of parameters */ 90struct match { 91 struct re_guts *g; 92 int eflags; 93 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 94 const char *offp; /* offsets work from here */ 95 const char *beginp; /* start of string -- virtual NUL precedes */ 96 const char *endp; /* end of string -- virtual NUL here */ 97 const char *coldp; /* can be no match starting before here */ 98 const char **lastpos; /* [nplus+1] */ 99 STATEVARS; 100 states st; /* current states */ 101 states fresh; /* states for a fresh start */ 102 states tmp; /* temporary */ 103 states empty; /* empty set of states */ 104 mbstate_t mbs; /* multibyte conversion state */ 105}; 106 107/* ========= begin header generated by ./mkh ========= */ 108#ifdef __cplusplus 109extern "C" { 110#endif 111 112/* === engine.c === */ 113static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 114static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 115static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 116static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 117static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 118#define MAX_RECURSION 100 119#define BOL (OUT-1) 120#define EOL (BOL-1) 121#define BOLEOL (BOL-2) 122#define NOTHING (BOL-3) 123#define BOW (BOL-4) 124#define EOW (BOL-5) 125#define BADCHAR (BOL-6) 126#define NWBND (BOL-7) 127#define NONCHAR(c) ((c) <= OUT) 128/* sflags */ 129#define SBOS 0x0001 130#define SEOS 0x0002 131 132#ifdef REDEBUG 133static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 134#endif 135#ifdef REDEBUG 136static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 137#endif 138#ifdef REDEBUG 139static const char *pchar(int ch); 140#endif 141 142#ifdef __cplusplus 143} 144#endif 145/* ========= end header generated by ./mkh ========= */ 146 147#ifdef REDEBUG 148#define SP(t, s, c) print(m, t, s, c, stdout) 149#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 150#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 151#else 152#define SP(t, s, c) /* nothing */ 153#define AT(t, p1, p2, s1, s2) /* nothing */ 154#define NOTE(s) /* nothing */ 155#endif 156 157/* 158 * Given a multibyte string pointed to by start, step back nchar characters 159 * from current position pointed to by cur. 160 */ 161static const char * 162stepback(const char *start, const char *cur, int nchar) 163{ 164#ifdef NLS 165 const char *ret; 166 size_t wc, mbc; 167 mbstate_t mbs; 168 size_t clen; 169 170 if (MB_CUR_MAX == 1) 171 goto out; 172 173 ret = cur; 174 for (wc = nchar; wc > 0; wc--) { 175 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 176 if ((ret - mbc) < start) 177 return (NULL); 178 memset(&mbs, 0, sizeof(mbs)); 179 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 180 if (clen != (size_t)-1 && clen != (size_t)-2) 181 break; 182 } 183 if (mbc > MB_CUR_MAX) 184 return (NULL); 185 ret -= mbc; 186 } 187 188 return (ret); 189out: 190#endif 191 return (cur - nchar) > start ? cur - nchar : NULL; 192} 193 194/* 195 - matcher - the actual matching engine 196 == static int matcher(struct re_guts *g, const char *string, \ 197 == size_t nmatch, regmatch_t pmatch[], int eflags); 198 */ 199static int /* 0 success, REG_NOMATCH failure */ 200matcher(struct re_guts *g, 201 const char *string, 202 size_t nmatch, 203 regmatch_t pmatch[], 204 int eflags) 205{ 206 const char *endp; 207 size_t i; 208 struct match mv; 209 struct match *m = &mv; 210 const char *dp = NULL; 211 const sopno gf = g->firststate+1; /* +1 for OEND */ 212 const sopno gl = g->laststate; 213 const char *start; 214 const char *stop; 215 /* Boyer-Moore algorithms variables */ 216 const char *pp; 217 size_t cj, mj; 218 const char *mustfirst; 219 const char *mustlast; 220 size_t *matchjump; 221 size_t *charjump; 222 int error = 0; 223 224 _DIAGASSERT(g != NULL); 225 _DIAGASSERT(string != NULL); 226 /* pmatch checked below */ 227 228 /* simplify the situation where possible */ 229 if (g->cflags®_NOSUB) 230 nmatch = 0; 231 if (eflags®_STARTEND) { 232 _DIAGASSERT(pmatch != NULL); 233 start = string + (size_t)pmatch[0].rm_so; 234 stop = string + (size_t)pmatch[0].rm_eo; 235 } else { 236 start = string; 237 stop = start + strlen(start); 238 } 239 if (stop < start) 240 return(REG_INVARG); 241 242 /* prescreening; this does wonders for this rather slow code */ 243 if (g->must != NULL) { 244 if (g->charjump != NULL && g->matchjump != NULL) { 245 mustfirst = g->must; 246 mustlast = g->must + g->mlen - 1; 247 charjump = g->charjump; 248 matchjump = g->matchjump; 249 pp = mustlast; 250 for (dp = start+g->mlen-1; dp < stop;) { 251 /* Fast skip non-matches */ 252 while (dp < stop && charjump[(int)*dp]) 253 dp += charjump[(int)*dp]; 254 255 if (dp >= stop) 256 break; 257 258 /* Greedy matcher */ 259 /* We depend on not being used for 260 * for strings of length 1 261 */ 262 while (*--dp == *--pp && pp != mustfirst); 263 264 if (*dp == *pp) 265 break; 266 267 /* Jump to next possible match */ 268 mj = matchjump[pp - mustfirst]; 269 cj = charjump[(int)*dp]; 270 dp += (cj < mj ? mj : cj); 271 pp = mustlast; 272 } 273 if (pp != mustfirst) 274 return(REG_NOMATCH); 275 } else { 276 for (dp = start; dp < stop; dp++) 277 if (*dp == g->must[0] && 278 (size_t)(stop - dp) >= g->mlen && 279 memcmp(dp, g->must, (size_t)g->mlen) == 0) 280 break; 281 if (dp == stop) /* we didn't find g->must */ 282 return(REG_NOMATCH); 283 } 284 } 285 286 /* match struct setup */ 287 m->g = g; 288 m->eflags = eflags; 289 m->pmatch = NULL; 290 m->lastpos = NULL; 291 m->offp = string; 292 m->beginp = start; 293 m->endp = stop; 294 STATESETUP(m, 4); 295 SETUP(m->st); 296 SETUP(m->fresh); 297 SETUP(m->tmp); 298 SETUP(m->empty); 299 CLEAR(m->empty); 300 ZAPSTATE(&m->mbs); 301 302 /* Adjust start according to moffset, to speed things up */ 303 if (dp != NULL && g->moffset > -1) { 304 const char *nstart; 305 306 nstart = stepback(start, dp, g->moffset); 307 if (nstart != NULL) 308 start = nstart; 309 } 310 311 SP("mloop", m->st, *start); 312 313 /* this loop does only one repetition except for backrefs */ 314 for (;;) { 315 endp = walk(m, start, stop, gf, gl, true); 316 if (endp == NULL) { /* a miss */ 317 error = REG_NOMATCH; 318 goto done; 319 } 320 if (nmatch == 0 && !g->backrefs) 321 break; /* no further info needed */ 322 323 /* where? */ 324 assert(m->coldp != NULL); 325 for (;;) { 326 NOTE("finding start"); 327 endp = walk(m, m->coldp, stop, gf, gl, false); 328 if (endp != NULL) 329 break; 330 assert(m->coldp < m->endp); 331 m->coldp += XMBRTOWC(NULL, m->coldp, 332 (size_t)(m->endp - m->coldp), &m->mbs, 0); 333 } 334 if (nmatch == 1 && !g->backrefs) 335 break; /* no further info needed */ 336 337 /* oh my, he wants the subexpressions... */ 338 if (m->pmatch == NULL) 339 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 340 sizeof(regmatch_t)); 341 if (m->pmatch == NULL) { 342 error = REG_ESPACE; 343 goto done; 344 } 345 for (i = 1; i <= m->g->nsub; i++) 346 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 347 if (!g->backrefs && !(m->eflags®_BACKR)) { 348 NOTE("dissecting"); 349 dp = dissect(m, m->coldp, endp, gf, gl); 350 } else { 351 if (g->nplus > 0 && m->lastpos == NULL) 352 m->lastpos = malloc((g->nplus+1) * 353 sizeof(const char *)); 354 if (g->nplus > 0 && m->lastpos == NULL) { 355 error = REG_ESPACE; 356 goto done; 357 } 358 NOTE("backref dissect"); 359 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 360 } 361 if (dp != NULL) 362 break; 363 364 /* uh-oh... we couldn't find a subexpression-level match */ 365 assert(g->backrefs); /* must be back references doing it */ 366 assert(g->nplus == 0 || m->lastpos != NULL); 367 for (;;) { 368 if (dp != NULL || endp <= m->coldp) 369 break; /* defeat */ 370 NOTE("backoff"); 371 endp = walk(m, m->coldp, endp-1, gf, gl, false); 372 if (endp == NULL) 373 break; /* defeat */ 374 /* try it on a shorter possibility */ 375#ifndef NDEBUG 376 for (i = 1; i <= m->g->nsub; i++) { 377 assert(m->pmatch[i].rm_so == (regoff_t)-1); 378 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 379 } 380#endif 381 NOTE("backoff dissect"); 382 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 383 } 384 assert(dp == NULL || dp == endp); 385 if (dp != NULL) /* found a shorter one */ 386 break; 387 388 /* despite initial appearances, there is no match here */ 389 NOTE("false alarm"); 390 /* recycle starting later */ 391 start = m->coldp + XMBRTOWC(NULL, m->coldp, 392 (size_t)(stop - m->coldp), &m->mbs, 0); 393 assert(start <= stop); 394 } 395 396 /* fill in the details if requested */ 397 if (nmatch > 0) { 398 _DIAGASSERT(pmatch != NULL); 399 pmatch[0].rm_so = m->coldp - m->offp; 400 pmatch[0].rm_eo = endp - m->offp; 401 } 402 if (nmatch > 1) { 403 assert(m->pmatch != NULL); 404 for (i = 1; i < nmatch; i++) 405 if (i <= m->g->nsub) 406 pmatch[i] = m->pmatch[i]; 407 else { 408 pmatch[i].rm_so = (regoff_t)-1; 409 pmatch[i].rm_eo = (regoff_t)-1; 410 } 411 } 412 413done: 414 if (m->pmatch != NULL) { 415 free(m->pmatch); 416 m->pmatch = NULL; 417 } 418 if (m->lastpos != NULL) { 419 free(__UNCONST(m->lastpos)); 420 m->lastpos = NULL; 421 } 422 STATETEARDOWN(m); 423 return error; 424} 425 426/* 427 - dissect - figure out what matched what, no back references 428 == static const char *dissect(struct match *m, const char *start, \ 429 == const char *stop, sopno startst, sopno stopst); 430 */ 431static const char * /* == stop (success) always */ 432dissect( 433 struct match *m, 434 const char *start, 435 const char *stop, 436 sopno startst, 437 sopno stopst) 438{ 439 int i; 440 sopno ss; /* start sop of current subRE */ 441 sopno es; /* end sop of current subRE */ 442 const char *sp; /* start of string matched by it */ 443 const char *stp; /* string matched by it cannot pass here */ 444 const char *rest; /* start of rest of string */ 445 const char *tail; /* string unmatched by rest of RE */ 446 sopno ssub; /* start sop of subsubRE */ 447 sopno esub; /* end sop of subsubRE */ 448 const char *ssp; /* start of string matched by subsubRE */ 449 const char *sep; /* end of string matched by subsubRE */ 450 const char *oldssp; /* previous ssp */ 451 const char *dp __unused; 452 453 _DIAGASSERT(m != NULL); 454 _DIAGASSERT(start != NULL); 455 _DIAGASSERT(stop != NULL); 456 457 AT("diss", start, stop, startst, stopst); 458 sp = start; 459 for (ss = startst; ss < stopst; ss = es) { 460 /* identify end of subRE */ 461 es = ss; 462 switch (OP(m->g->strip[es])) { 463 case OPLUS_: 464 case OQUEST_: 465 es += OPND(m->g->strip[es]); 466 break; 467 case OCH_: 468 while (OP(m->g->strip[es]) != O_CH) 469 es += OPND(m->g->strip[es]); 470 break; 471 } 472 es++; 473 474 /* figure out what it matched */ 475 switch (OP(m->g->strip[ss])) { 476 case OEND: 477 assert(nope); 478 break; 479 case OCHAR: 480 sp += XMBRTOWC(NULL, sp, (size_t)(stop - start), 481 &m->mbs, 0); 482 break; 483 case OBOL: 484 case OEOL: 485 case OBOW: 486 case OEOW: 487 case OBOS: 488 case OEOS: 489 case OWBND: 490 case ONWBND: 491 break; 492 case OANY: 493 case OANYOF: 494 sp += XMBRTOWC(NULL, sp, (size_t)(stop - start), 495 &m->mbs, 0); 496 break; 497 case OBACK_: 498 case O_BACK: 499 assert(nope); 500 break; 501 /* cases where length of match is hard to find */ 502 case OQUEST_: 503 stp = stop; 504 for (;;) { 505 /* how long could this one be? */ 506 rest = walk(m, sp, stp, ss, es, false); 507 assert(rest != NULL); /* it did match */ 508 /* could the rest match the rest? */ 509 tail = walk(m, rest, stop, es, stopst, false); 510 if (tail == stop) 511 break; /* yes! */ 512 /* no -- try a shorter match for this one */ 513 stp = rest - 1; 514 assert(stp >= sp); /* it did work */ 515 } 516 ssub = ss + 1; 517 esub = es - 1; 518 /* did innards match? */ 519 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 520 dp = dissect(m, sp, rest, ssub, esub); 521 assert(dp == rest); 522 } else /* no */ 523 assert(sp == rest); 524 sp = rest; 525 break; 526 case OPLUS_: 527 stp = stop; 528 for (;;) { 529 /* how long could this one be? */ 530 rest = walk(m, sp, stp, ss, es, false); 531 assert(rest != NULL); /* it did match */ 532 /* could the rest match the rest? */ 533 tail = walk(m, rest, stop, es, stopst, false); 534 if (tail == stop) 535 break; /* yes! */ 536 /* no -- try a shorter match for this one */ 537 stp = rest - 1; 538 assert(stp >= sp); /* it did work */ 539 } 540 ssub = ss + 1; 541 esub = es - 1; 542 ssp = sp; 543 oldssp = ssp; 544 for (;;) { /* find last match of innards */ 545 sep = walk(m, ssp, rest, ssub, esub, false); 546 if (sep == NULL || sep == ssp) 547 break; /* failed or matched null */ 548 oldssp = ssp; /* on to next try */ 549 ssp = sep; 550 } 551 if (sep == NULL) { 552 /* last successful match */ 553 sep = ssp; 554 ssp = oldssp; 555 } 556 assert(sep == rest); /* must exhaust substring */ 557 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 558 dp = dissect(m, ssp, sep, ssub, esub); 559 assert(dp == sep); 560 sp = rest; 561 break; 562 case OCH_: 563 stp = stop; 564 for (;;) { 565 /* how long could this one be? */ 566 rest = walk(m, sp, stp, ss, es, false); 567 assert(rest != NULL); /* it did match */ 568 /* could the rest match the rest? */ 569 tail = walk(m, rest, stop, es, stopst, false); 570 if (tail == stop) 571 break; /* yes! */ 572 /* no -- try a shorter match for this one */ 573 stp = rest - 1; 574 assert(stp >= sp); /* it did work */ 575 } 576 ssub = ss + 1; 577 esub = ss + OPND(m->g->strip[ss]) - 1; 578 assert(OP(m->g->strip[esub]) == OOR1); 579 for (;;) { /* find first matching branch */ 580 if (walk(m, sp, rest, ssub, esub, false) == rest) 581 break; /* it matched all of it */ 582 /* that one missed, try next one */ 583 assert(OP(m->g->strip[esub]) == OOR1); 584 esub++; 585 assert(OP(m->g->strip[esub]) == OOR2); 586 ssub = esub + 1; 587 esub += OPND(m->g->strip[esub]); 588 if (OP(m->g->strip[esub]) == OOR2) 589 esub--; 590 else 591 assert(OP(m->g->strip[esub]) == O_CH); 592 } 593 dp = dissect(m, sp, rest, ssub, esub); 594 assert(dp == rest); 595 sp = rest; 596 break; 597 case O_PLUS: 598 case O_QUEST: 599 case OOR1: 600 case OOR2: 601 case O_CH: 602 assert(nope); 603 break; 604 case OLPAREN: 605 i = OPND(m->g->strip[ss]); 606 assert(0 < i && i <= m->g->nsub); 607 m->pmatch[i].rm_so = sp - m->offp; 608 break; 609 case ORPAREN: 610 i = OPND(m->g->strip[ss]); 611 assert(0 < i && i <= m->g->nsub); 612 m->pmatch[i].rm_eo = sp - m->offp; 613 break; 614 default: /* uh oh */ 615 assert(nope); 616 break; 617 } 618 } 619 620 assert(sp == stop); 621 return(sp); 622} 623 624#define ISBOW(m, sp) \ 625 (sp < m->endp && ISWORD(*sp) && \ 626 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 627 (sp > m->offp && !ISWORD(*(sp-1))))) 628#define ISEOW(m, sp) \ 629 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 630 (sp < m->endp && *sp == '\n' && \ 631 (m->g->cflags®_NEWLINE)) || \ 632 (sp < m->endp && !ISWORD(*sp)) ) && \ 633 (sp > m->beginp && ISWORD(*(sp-1)))) \ 634 635/* 636 - backref - figure out what matched what, figuring in back references 637 == static const char *backref(struct match *m, const char *start, \ 638 == const char *stop, sopno startst, sopno stopst, sopno lev); 639 */ 640static const char * /* == stop (success) or NULL (failure) */ 641backref( 642 struct match *m, 643 const char *start, 644 const char *stop, 645 sopno startst, 646 sopno stopst, 647 sopno lev, /* PLUS nesting level */ 648 int rec) 649{ 650 int i; 651 sopno ss; /* start sop of current subRE */ 652 const char *sp; /* start of string matched by it */ 653 sopno ssub; /* start sop of subsubRE */ 654 sopno esub; /* end sop of subsubRE */ 655 const char *ssp; /* start of string matched by subsubRE */ 656 const char *dp; 657 size_t len; 658 int hard; 659 sop s; 660 regoff_t offsave; 661 cset *cs; 662 wint_t wc; 663 664 _DIAGASSERT(m != NULL); 665 _DIAGASSERT(start != NULL); 666 _DIAGASSERT(stop != NULL); 667 668 AT("back", start, stop, startst, stopst); 669 sp = start; 670 671 /* get as far as we can with easy stuff */ 672 hard = 0; 673 for (ss = startst; !hard && ss < stopst; ss++) 674 switch (OP(s = m->g->strip[ss])) { 675 case OCHAR: 676 if (sp == stop) 677 return(NULL); 678 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 679 &m->mbs, BADCHAR); 680 if (wc != (wint_t)OPND(s)) 681 return(NULL); 682 break; 683 case OANY: 684 if (sp == stop) 685 return(NULL); 686 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 687 &m->mbs, BADCHAR); 688 if (wc == BADCHAR) 689 return (NULL); 690 break; 691 case OANYOF: 692 if (sp == stop) 693 return (NULL); 694 cs = &m->g->sets[OPND(s)]; 695 sp += XMBRTOWC(&wc, sp, (size_t)(stop - sp), 696 &m->mbs, BADCHAR); 697 if (wc == BADCHAR || !CHIN(cs, wc)) 698 return(NULL); 699 break; 700 case OBOS: 701 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 702 { /* yes */ } 703 else 704 return(NULL); 705 break; 706 case OEOS: 707 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 708 { /* yes */ } 709 else 710 return(NULL); 711 break; 712 case OBOL: 713 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 714 (sp > m->offp && sp < m->endp && 715 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 716 { /* yes */ } 717 else 718 return(NULL); 719 break; 720 case OEOL: 721 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 722 (sp < m->endp && *sp == '\n' && 723 (m->g->cflags®_NEWLINE)) ) 724 { /* yes */ } 725 else 726 return(NULL); 727 break; 728 case OWBND: 729 if (ISBOW(m, sp) || ISEOW(m, sp)) 730 { /* yes */ } 731 else 732 return(NULL); 733 break; 734 case ONWBND: 735 if (((sp == m->beginp) && !ISWORD(*sp)) || 736 (sp == m->endp && !ISWORD(*(sp - 1)))) 737 { /* yes, beginning/end of subject */ } 738 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 739 { /* yes, beginning/end of subject */ } 740 else 741 return(NULL); 742 break; 743 case OBOW: 744 if (ISBOW(m, sp)) 745 { /* yes */ } 746 else 747 return(NULL); 748 break; 749 case OEOW: 750 if (ISEOW(m, sp)) 751 { /* yes */ } 752 else 753 return(NULL); 754 break; 755 case O_QUEST: 756 break; 757 case OOR1: /* matches null but needs to skip */ 758 ss++; 759 s = m->g->strip[ss]; 760 do { 761 assert(OP(s) == OOR2); 762 ss += OPND(s); 763 } while (OP(s = m->g->strip[ss]) != O_CH); 764 /* note that the ss++ gets us past the O_CH */ 765 break; 766 default: /* have to make a choice */ 767 hard = 1; 768 break; 769 } 770 if (!hard) { /* that was it! */ 771 if (sp != stop) 772 return(NULL); 773 return(sp); 774 } 775 ss--; /* adjust for the for's final increment */ 776 777 /* the hard stuff */ 778 AT("hard", sp, stop, ss, stopst); 779 s = m->g->strip[ss]; 780 switch (OP(s)) { 781 case OBACK_: /* the vilest depths */ 782 i = OPND(s); 783 assert(0 < i && i <= m->g->nsub); 784 if (m->pmatch[i].rm_eo == -1) 785 return(NULL); 786 assert(m->pmatch[i].rm_so != -1); 787 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so); 788 if (len == 0 && rec++ > MAX_RECURSION) 789 return(NULL); 790 assert(stop - m->beginp >= len); 791 if (sp > stop - len) 792 return(NULL); /* not enough left to match */ 793 ssp = m->offp + (size_t)m->pmatch[i].rm_so; 794 if (memcmp(sp, ssp, len) != 0) 795 return(NULL); 796 while (m->g->strip[ss] != SOP(O_BACK, i)) 797 ss++; 798 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 799 case OQUEST_: /* to null or not */ 800 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 801 if (dp != NULL) 802 return(dp); /* not */ 803 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 804 case OPLUS_: 805 assert(m->lastpos != NULL); 806 assert(lev+1 <= m->g->nplus); 807 m->lastpos[lev+1] = sp; 808 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 809 case O_PLUS: 810 if (sp == m->lastpos[lev]) /* last pass matched null */ 811 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 812 /* try another pass */ 813 m->lastpos[lev] = sp; 814 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 815 if (dp == NULL) 816 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 817 else 818 return(dp); 819 case OCH_: /* find the right one, if any */ 820 ssub = ss + 1; 821 esub = ss + OPND(s) - 1; 822 assert(OP(m->g->strip[esub]) == OOR1); 823 for (;;) { /* find first matching branch */ 824 dp = backref(m, sp, stop, ssub, esub, lev, rec); 825 if (dp != NULL) 826 return(dp); 827 /* that one missed, try next one */ 828 if (OP(m->g->strip[esub]) == O_CH) 829 return(NULL); /* there is none */ 830 esub++; 831 assert(OP(m->g->strip[esub]) == OOR2); 832 ssub = esub + 1; 833 esub += OPND(m->g->strip[esub]); 834 if (OP(m->g->strip[esub]) == OOR2) 835 esub--; 836 else 837 assert(OP(m->g->strip[esub]) == O_CH); 838 } 839 /* NOTREACHED */ 840 break; 841 case OLPAREN: /* must undo assignment if rest fails */ 842 i = OPND(s); 843 assert(0 < i && i <= m->g->nsub); 844 offsave = m->pmatch[i].rm_so; 845 m->pmatch[i].rm_so = sp - m->offp; 846 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 847 if (dp != NULL) 848 return(dp); 849 m->pmatch[i].rm_so = offsave; 850 return(NULL); 851 case ORPAREN: /* must undo assignment if rest fails */ 852 i = OPND(s); 853 assert(0 < i && i <= m->g->nsub); 854 offsave = m->pmatch[i].rm_eo; 855 m->pmatch[i].rm_eo = sp - m->offp; 856 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 857 if (dp != NULL) 858 return(dp); 859 m->pmatch[i].rm_eo = offsave; 860 return(NULL); 861 default: /* uh oh */ 862 assert(nope); 863 break; 864 } 865 866 /* "can't happen" */ 867 assert(nope); 868 /* NOTREACHED */ 869 return "shut up gcc"; 870} 871 872/* 873 - walk - step through the string either quickly or slowly 874 == static const char *walk(struct match *m, const char *start, \ 875 == const char *stop, sopno startst, sopno stopst, bool fast); 876 */ 877static const char * /* where it ended, or NULL */ 878walk(struct match *m, const char *start, const char *stop, sopno startst, 879 sopno stopst, bool fast) 880{ 881 states st = m->st; 882 states fresh = m->fresh; 883 states empty = m->empty; 884 states tmp = m->tmp; 885 const char *p = start; 886 wint_t c; 887 wint_t lastc; /* previous c */ 888 wint_t flagch; 889 int sflags; 890 const char *matchp; /* last p at which a match ended */ 891 size_t i, clen; 892 893 _DIAGASSERT(m != NULL); 894 _DIAGASSERT(start != NULL); 895 _DIAGASSERT(stop != NULL); 896 897 sflags = 0; 898 AT("walk", start, stop, startst, stopst); 899 CLEAR(st); 900 SET1(st, startst); 901 SP("sstart", st, *p); 902 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 903 if (fast) 904 ASSIGN(fresh, st); 905 matchp = NULL; 906 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 907 c = OUT; 908 else { 909 /* 910 * XXX Wrong if the previous character was multi-byte. 911 * Newline never is (in encodings supported by FreeBSD), 912 * so this only breaks the ISWORD tests below. 913 */ 914 c = (uch)*(start - 1); 915 } 916 for (;;) { 917 /* next character */ 918 lastc = c; 919 sflags = 0; 920 if (p == m->endp) { 921 c = OUT; 922 clen = 0; 923 } else 924 clen = XMBRTOWC(&c, p, (size_t)(m->endp - p), 925 &m->mbs, BADCHAR); 926 927 if (fast && EQ(st, fresh)) 928 matchp = p; 929 930 /* is there an EOL and/or BOL between lastc and c? */ 931 flagch = '\0'; 932 i = 0; 933 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 934 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 935 flagch = BOL; 936 i = m->g->nbol; 937 } 938 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 939 (c == OUT && !(m->eflags®_NOTEOL)) ) { 940 flagch = (flagch == BOL) ? BOLEOL : EOL; 941 i += m->g->neol; 942 } 943 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 944 sflags |= SBOS; 945 /* Step one more for BOS. */ 946 i++; 947 } 948 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 949 sflags |= SEOS; 950 /* Step one more for EOS. */ 951 i++; 952 } 953 if (i != 0) { 954 for (; i > 0; i--) 955 st = step(m->g, startst, stopst, st, flagch, st, 956 sflags); 957 SP("sboleol", st, c); 958 } 959 960 /* how about a word boundary? */ 961 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 962 (c != OUT && ISWORD(c)) ) { 963 flagch = BOW; 964 } 965 if ( (lastc != OUT && ISWORD(lastc)) && 966 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 967 flagch = EOW; 968 } 969 if (flagch == BOW || flagch == EOW) { 970 st = step(m->g, startst, stopst, st, flagch, st, sflags); 971 SP("sboweow", st, c); 972 } 973 if (lastc != OUT && c != OUT && 974 ISWORD(lastc) == ISWORD(c)) { 975 flagch = NWBND; 976 } else if ((lastc == OUT && !ISWORD(c)) || 977 (c == OUT && !ISWORD(lastc))) { 978 flagch = NWBND; 979 } 980 if (flagch == NWBND) { 981 st = step(m->g, startst, stopst, st, flagch, st, sflags); 982 SP("snwbnd", st, c); 983 } 984 985 /* are we done? */ 986 if (ISSET(st, stopst)) { 987 if (fast) 988 break; 989 else 990 matchp = p; 991 } 992 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 993 break; /* NOTE BREAK OUT */ 994 995 /* no, we must deal with this character */ 996 ASSIGN(tmp, st); 997 if (fast) 998 ASSIGN(st, fresh); 999 else 1000 ASSIGN(st, empty); 1001 assert(c != OUT); 1002 st = step(m->g, startst, stopst, tmp, c, st, sflags); 1003 SP("saft", st, c); 1004 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 1005 st)); 1006 p += clen; 1007 } 1008 1009 if (fast) { 1010 assert(matchp != NULL); 1011 m->coldp = matchp; 1012 if (ISSET(st, stopst)) 1013 return (p + XMBRTOWC(NULL, p, (size_t)(stop - p), 1014 &m->mbs, 0)); 1015 else 1016 return (NULL); 1017 } else 1018 return (matchp); 1019} 1020 1021/* 1022 - step - map set of states reachable before char to set reachable after 1023 == static states step(struct re_guts *g, sopno start, sopno stop, \ 1024 == states bef, int ch, states aft); 1025 == #define BOL (OUT-1) 1026 == #define EOL (BOL-1) 1027 == #define BOLEOL (BOL-2) 1028 == #define NOTHING (BOL-3) 1029 == #define BOW (BOL-4) 1030 == #define EOW (BOL-5) 1031 == #define BADCHAR (BOL-6) 1032 == #define NONCHAR(c) ((c) <= OUT) 1033 */ 1034static states 1035step(struct re_guts *g, 1036 sopno start, /* start state within strip */ 1037 sopno stop, /* state after stop state within strip */ 1038 states bef, /* states reachable before */ 1039 wint_t ch, /* character or NONCHAR code */ 1040 states aft, /* states already known reachable after */ 1041 int sflags) /* state flags */ 1042{ 1043 cset *cs; 1044 sop s; 1045 sopno pc; 1046 onestate here; /* note, macros know this name */ 1047 sopno look; 1048 int i; 1049 1050 _DIAGASSERT(g != NULL); 1051 1052 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1053 s = g->strip[pc]; 1054 switch (OP(s)) { 1055 case OEND: 1056 assert(pc == stop-1); 1057 break; 1058 case OCHAR: 1059 /* only characters can match */ 1060 assert(!NONCHAR(ch) || ch != OPND(s)); 1061 if (ch == (wint_t)OPND(s)) 1062 FWD(aft, bef, 1); 1063 break; 1064 case OBOS: 1065 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1066 FWD(aft, bef, 1); 1067 break; 1068 case OEOS: 1069 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1070 FWD(aft, bef, 1); 1071 break; 1072 case OBOL: 1073 if (ch == BOL || ch == BOLEOL) 1074 FWD(aft, bef, 1); 1075 break; 1076 case OEOL: 1077 if (ch == EOL || ch == BOLEOL) 1078 FWD(aft, bef, 1); 1079 break; 1080 case OBOW: 1081 if (ch == BOW) 1082 FWD(aft, bef, 1); 1083 break; 1084 case OEOW: 1085 if (ch == EOW) 1086 FWD(aft, bef, 1); 1087 break; 1088 case OWBND: 1089 if (ch == BOW || ch == EOW) 1090 FWD(aft, bef, 1); 1091 break; 1092 case ONWBND: 1093 if (ch == NWBND) 1094 FWD(aft, aft, 1); 1095 break; 1096 case OANY: 1097 if (!NONCHAR(ch)) 1098 FWD(aft, bef, 1); 1099 break; 1100 case OANYOF: 1101 cs = &g->sets[OPND(s)]; 1102 if (!NONCHAR(ch) && CHIN(cs, ch)) 1103 FWD(aft, bef, 1); 1104 break; 1105 case OBACK_: /* ignored here */ 1106 case O_BACK: 1107 FWD(aft, aft, 1); 1108 break; 1109 case OPLUS_: /* forward, this is just an empty */ 1110 FWD(aft, aft, 1); 1111 break; 1112 case O_PLUS: /* both forward and back */ 1113 FWD(aft, aft, 1); 1114 i = ISSETBACK(aft, OPND(s)); 1115 BACK(aft, aft, OPND(s)); 1116 if (!i && ISSETBACK(aft, OPND(s))) { 1117 /* oho, must reconsider loop body */ 1118 pc -= OPND(s) + 1; 1119 INIT(here, pc); 1120 } 1121 break; 1122 case OQUEST_: /* two branches, both forward */ 1123 FWD(aft, aft, 1); 1124 FWD(aft, aft, OPND(s)); 1125 break; 1126 case O_QUEST: /* just an empty */ 1127 FWD(aft, aft, 1); 1128 break; 1129 case OLPAREN: /* not significant here */ 1130 case ORPAREN: 1131 FWD(aft, aft, 1); 1132 break; 1133 case OCH_: /* mark the first two branches */ 1134 FWD(aft, aft, 1); 1135 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1136 FWD(aft, aft, OPND(s)); 1137 break; 1138 case OOR1: /* done a branch, find the O_CH */ 1139 if (ISSTATEIN(aft, here)) { 1140 for (look = 1; 1141 OP(s = g->strip[pc+look]) != O_CH; 1142 look += OPND(s)) 1143 assert(OP(s) == OOR2); 1144 FWD(aft, aft, look + 1); 1145 } 1146 break; 1147 case OOR2: /* propagate OCH_'s marking */ 1148 FWD(aft, aft, 1); 1149 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1150 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1151 FWD(aft, aft, OPND(s)); 1152 } 1153 break; 1154 case O_CH: /* just empty */ 1155 FWD(aft, aft, 1); 1156 break; 1157 default: /* ooooops... */ 1158 assert(nope); 1159 break; 1160 } 1161 } 1162 1163 return(aft); 1164} 1165 1166#ifdef REDEBUG 1167/* 1168 - print - print a set of states 1169 == #ifdef REDEBUG 1170 == static void print(struct match *m, const char *caption, states st, \ 1171 == int ch, FILE *d); 1172 == #endif 1173 */ 1174static void 1175print(struct match *m, 1176 const char *caption, 1177 states st, 1178 int ch, 1179 FILE *d) 1180{ 1181 struct re_guts *g = m->g; 1182 sopno i; 1183 int first = 1; 1184 1185 _DIAGASSERT(m != NULL); 1186 _DIAGASSERT(caption != NULL); 1187 1188 if (!(m->eflags®_TRACE)) 1189 return; 1190 1191 _DIAGASSERT(d != NULL); 1192 1193 fprintf(d, "%s", caption); 1194 if (ch != '\0') 1195 fprintf(d, " %s", pchar(ch)); 1196 for (i = 0; i < g->nstates; i++) 1197 if (ISSET(st, i)) { 1198 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1199 first = 0; 1200 } 1201 fprintf(d, "\n"); 1202} 1203 1204/* 1205 - at - print current situation 1206 == #ifdef REDEBUG 1207 == static void at(struct match *m, const char *title, const char *start, \ 1208 == const char *stop, sopno startst, sopno stopst); 1209 == #endif 1210 */ 1211static void 1212at( struct match *m, 1213 const char *title, 1214 const char *start, 1215 const char *stop, 1216 sopno startst, 1217 sopno stopst) 1218{ 1219 1220 _DIAGASSERT(m != NULL); 1221 _DIAGASSERT(title != NULL); 1222 _DIAGASSERT(start != NULL); 1223 _DIAGASSERT(stop != NULL); 1224 1225 if (!(m->eflags®_TRACE)) 1226 return; 1227 1228 printf("%s %s-", title, pchar(*start)); 1229 printf("%s ", pchar(*stop)); 1230 printf("%ld-%ld\n", (long)startst, (long)stopst); 1231} 1232 1233#ifndef PCHARDONE 1234#define PCHARDONE /* never again */ 1235/* 1236 - pchar - make a character printable 1237 == #ifdef REDEBUG 1238 == static const char *pchar(int ch); 1239 == #endif 1240 * 1241 * Is this identical to regchar() over in debug.c? Well, yes. But a 1242 * duplicate here avoids having a debugging-capable regexec.o tied to 1243 * a matching debug.o, and this is convenient. It all disappears in 1244 * the non-debug compilation anyway, so it doesn't matter much. 1245 */ 1246static const char * /* -> representation */ 1247pchar(int ch) 1248{ 1249 static char pbuf[10]; 1250 1251 if (isprint((uch)ch) || ch == ' ') 1252 snprintf(pbuf, sizeof(pbuf), "%c", ch); 1253 else 1254 snprintf(pbuf, sizeof(pbuf), "\\%o", ch); 1255 return(pbuf); 1256} 1257#endif 1258#endif 1259 1260#undef stepback 1261#undef matcher 1262#undef walk 1263#undef dissect 1264#undef backref 1265#undef step 1266#undef print 1267#undef at 1268#undef match 1269