1/* 2 * re_*comp and friends - compile REs 3 * This file #includes several others (see the bottom). 4 * 5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 6 * 7 * Development of this software was funded, in part, by Cray Research Inc., 8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 9 * Corporation, none of whom are responsible for the results. The author 10 * thanks all of them. 11 * 12 * Redistribution and use in source and binary forms -- with or without 13 * modification -- are permitted for any purpose, provided that 14 * redistributions in source form retain this entire copyright notice and 15 * indicate the origin and nature of any modifications. 16 * 17 * I'd appreciate being given credit for this package in the documentation of 18 * software which uses it, but that is not a requirement. 19 * 20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33#include "regguts.h" 34 35/* 36 * forward declarations, up here so forward datatypes etc. are defined early 37 */ 38/* =====^!^===== begin forwards =====^!^===== */ 39/* automatically gathered by fwd; do not hand-edit */ 40/* === regcomp.c === */ 41int compile(regex_t *, const chr *, size_t, int); 42static void moresubs(struct vars *, int); 43static int freev(struct vars *, int); 44static void makesearch(struct vars *, struct nfa *); 45static struct subre *parse(struct vars *, int, int, struct state *, struct state *); 46static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int); 47static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); 48static void nonword(struct vars *, int, struct state *, struct state *); 49static void word(struct vars *, int, struct state *, struct state *); 50static int scannum(struct vars *); 51static void repeat(struct vars *, struct state *, struct state *, int, int); 52static void bracket(struct vars *, struct state *, struct state *); 53static void cbracket(struct vars *, struct state *, struct state *); 54static void brackpart(struct vars *, struct state *, struct state *); 55static const chr *scanplain(struct vars *); 56static void onechr(struct vars *, pchr, struct state *, struct state *); 57static void dovec(struct vars *, struct cvec *, struct state *, struct state *); 58static void wordchrs(struct vars *); 59static struct subre *subre(struct vars *, int, int, struct state *, struct state *); 60static void freesubre(struct vars *, struct subre *); 61static void freesrnode(struct vars *, struct subre *); 62static void optst(struct vars *, struct subre *); 63static int numst(struct subre *, int); 64static void markst(struct subre *); 65static void cleanst(struct vars *); 66static long nfatree(struct vars *, struct subre *, FILE *); 67static long nfanode(struct vars *, struct subre *, FILE *); 68static int newlacon(struct vars *, struct state *, struct state *, int); 69static void freelacons(struct subre *, int); 70static void rfree(regex_t *); 71static void dump(regex_t *, FILE *); 72static void dumpst(struct subre *, FILE *, int); 73static void stdump(struct subre *, FILE *, int); 74static const char *stid(struct subre *, char *, size_t); 75/* === regc_lex.c === */ 76static void lexstart(struct vars *); 77static void prefixes(struct vars *); 78static void lexnest(struct vars *, const chr *, const chr *); 79static void lexword(struct vars *); 80static int next(struct vars *); 81static int lexescape(struct vars *); 82static chr lexdigits(struct vars *, int, int, int); 83static int brenext(struct vars *, pchr); 84static void skip(struct vars *); 85static chr newline(NOPARMS); 86#ifdef REG_DEBUG 87static const chr *ch(NOPARMS); 88#endif 89static chr chrnamed(struct vars *, const chr *, const chr *, pchr); 90/* === regc_color.c === */ 91static void initcm(struct vars *, struct colormap *); 92static void freecm(struct colormap *); 93static void cmtreefree(struct colormap *, union tree *, int); 94static color setcolor(struct colormap *, pchr, pcolor); 95static color maxcolor(struct colormap *); 96static color newcolor(struct colormap *); 97static void freecolor(struct colormap *, pcolor); 98static color pseudocolor(struct colormap *); 99static color subcolor(struct colormap *, pchr c); 100static color newsub(struct colormap *, pcolor); 101static void subrange(struct vars *, pchr, pchr, struct state *, struct state *); 102static void subblock(struct vars *, pchr, struct state *, struct state *); 103static void okcolors(struct nfa *, struct colormap *); 104static void colorchain(struct colormap *, struct arc *); 105static void uncolorchain(struct colormap *, struct arc *); 106static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *); 107static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *); 108#ifdef REG_DEBUG 109static void dumpcolors(struct colormap *, FILE *); 110static void fillcheck(struct colormap *, union tree *, int, FILE *); 111static void dumpchr(pchr, FILE *); 112#endif 113/* === regc_nfa.c === */ 114static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *); 115static void freenfa(struct nfa *); 116static struct state *newstate(struct nfa *); 117static struct state *newfstate(struct nfa *, int flag); 118static void dropstate(struct nfa *, struct state *); 119static void freestate(struct nfa *, struct state *); 120static void destroystate(struct nfa *, struct state *); 121static void newarc(struct nfa *, int, pcolor, struct state *, struct state *); 122static struct arc *allocarc(struct nfa *, struct state *); 123static void freearc(struct nfa *, struct arc *); 124static struct arc *findarc(struct state *, int, pcolor); 125static void cparc(struct nfa *, struct arc *, struct state *, struct state *); 126static void moveins(struct nfa *, struct state *, struct state *); 127static void copyins(struct nfa *, struct state *, struct state *); 128static void moveouts(struct nfa *, struct state *, struct state *); 129static void copyouts(struct nfa *, struct state *, struct state *); 130static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); 131static void delsub(struct nfa *, struct state *, struct state *); 132static void deltraverse(struct nfa *, struct state *, struct state *); 133static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); 134static void duptraverse(struct nfa *, struct state *, struct state *); 135static void cleartraverse(struct nfa *, struct state *); 136static void specialcolors(struct nfa *); 137static long optimize(struct nfa *, FILE *); 138static void pullback(struct nfa *, FILE *); 139static int pull(struct nfa *, struct arc *); 140static void pushfwd(struct nfa *, FILE *); 141static int push(struct nfa *, struct arc *); 142#define INCOMPATIBLE 1 /* destroys arc */ 143#define SATISFIED 2 /* constraint satisfied */ 144#define COMPATIBLE 3 /* compatible but not satisfied yet */ 145static int combine(struct arc *, struct arc *); 146static void fixempties(struct nfa *, FILE *); 147static int unempty(struct nfa *, struct arc *); 148static void cleanup(struct nfa *); 149static void markreachable(struct nfa *, struct state *, struct state *, struct state *); 150static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); 151static long analyze(struct nfa *); 152static void compact(struct nfa *, struct cnfa *); 153static void carcsort(struct carc *, struct carc *); 154static void freecnfa(struct cnfa *); 155static void dumpnfa(struct nfa *, FILE *); 156#ifdef REG_DEBUG 157static void dumpstate(struct state *, FILE *); 158static void dumparcs(struct state *, FILE *); 159static int dumprarcs(struct arc *, struct state *, FILE *, int); 160static void dumparc(struct arc *, struct state *, FILE *); 161#endif 162static void dumpcnfa(struct cnfa *, FILE *); 163#ifdef REG_DEBUG 164static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); 165#endif 166/* === regc_cvec.c === */ 167static struct cvec *clearcvec(struct cvec *); 168static void addchr(struct cvec *, pchr); 169static void addrange(struct cvec *, pchr, pchr); 170static struct cvec *newcvec(int, int); 171static struct cvec *getcvec(struct vars *, int, int); 172static void freecvec(struct cvec *); 173/* === regc_locale.c === */ 174static celt element(struct vars *, const chr *, const chr *); 175static struct cvec *range(struct vars *, celt, celt, int); 176static int before(celt, celt); 177static struct cvec *eclass(struct vars *, celt, int); 178static struct cvec *cclass(struct vars *, const chr *, const chr *, int); 179static struct cvec *allcases(struct vars *, pchr); 180static int cmp(const chr *, const chr *, size_t); 181static int casecmp(const chr *, const chr *, size_t); 182/* automatically gathered by fwd; do not hand-edit */ 183/* =====^!^===== end forwards =====^!^===== */ 184 185/* internal variables, bundled for easy passing around */ 186struct vars { 187 regex_t *re; 188 const chr *now; /* scan pointer into string */ 189 const chr *stop; /* end of string */ 190 const chr *savenow; /* saved now and stop for "subroutine call" */ 191 const chr *savestop; 192 int err; /* error code (0 if none) */ 193 int cflags; /* copy of compile flags */ 194 int lasttype; /* type of previous token */ 195 int nexttype; /* type of next token */ 196 chr nextvalue; /* value (if any) of next token */ 197 int lexcon; /* lexical context type (see lex.c) */ 198 int nsubexp; /* subexpression count */ 199 struct subre **subs; /* subRE pointer vector */ 200 size_t nsubs; /* length of vector */ 201 struct subre *sub10[10]; /* initial vector, enough for most */ 202 struct nfa *nfa; /* the NFA */ 203 struct colormap *cm; /* character color map */ 204 color nlcolor; /* color of newline */ 205 struct state *wordchrs; /* state in nfa holding word-char outarcs */ 206 struct subre *tree; /* subexpression tree */ 207 struct subre *treechain; /* all tree nodes allocated */ 208 struct subre *treefree; /* any free tree nodes */ 209 int ntree; /* number of tree nodes */ 210 struct cvec *cv; /* interface cvec */ 211 struct cvec *cv2; /* utility cvec */ 212 struct subre *lacons; /* lookahead-constraint vector */ 213 int nlacons; /* size of lacons */ 214}; 215 216/* parsing macros; most know that `v' is the struct vars pointer */ 217#define NEXT() (next(v)) /* advance by one token */ 218#define SEE(t) (v->nexttype == (t)) /* is next token this? */ 219#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ 220#define VISERR(vv) ((vv)->err != 0)/* have we seen an error yet? */ 221#define ISERR() VISERR(v) 222#define VERR(vv,e) \ 223 ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e))) 224#define ERR(e) VERR(v, e) /* record an error */ 225#define NOERR() {if (ISERR()) return;} /* if error seen, return */ 226#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ 227#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ 228#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */ 229#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ 230#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) 231 232/* token type codes, some also used as NFA arc types */ 233#define EMPTY 'n' /* no token present */ 234#define EOS 'e' /* end of string */ 235#define PLAIN 'p' /* ordinary character */ 236#define DIGIT 'd' /* digit (in bound) */ 237#define BACKREF 'b' /* back reference */ 238#define COLLEL 'I' /* start of [. */ 239#define ECLASS 'E' /* start of [= */ 240#define CCLASS 'C' /* start of [: */ 241#define END 'X' /* end of [. [= [: */ 242#define RANGE 'R' /* - within [] which might be range delim. */ 243#define LACON 'L' /* lookahead constraint subRE */ 244#define AHEAD 'a' /* color-lookahead arc */ 245#define BEHIND 'r' /* color-lookbehind arc */ 246#define WBDRY 'w' /* word boundary constraint */ 247#define NWBDRY 'W' /* non-word-boundary constraint */ 248#define SBEGIN 'A' /* beginning of string (even if not BOL) */ 249#define SEND 'Z' /* end of string (even if not EOL) */ 250#define PREFER 'P' /* length preference */ 251 252/* is an arc colored, and hence on a color chain? */ 253#define COLORED(a) \ 254 ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) 255 256/* static function list */ 257static struct fns functions = { 258 rfree, /* regfree insides */ 259}; 260 261/* 262 - compile - compile regular expression 263 ^ int compile(regex_t *, const chr *, size_t, int); 264 */ 265int 266compile( 267 regex_t *re, 268 const chr *string, 269 size_t len, 270 int flags) 271{ 272 AllocVars(v); 273 struct guts *g; 274 int i; 275 size_t j; 276 FILE *debug = (flags®_PROGRESS) ? stdout : NULL; 277#define CNOERR() { if (ISERR()) return freev(v, v->err); } 278 279 /* 280 * Sanity checks. 281 */ 282 283 if (re == NULL || string == NULL) { 284 FreeVars(v); 285 return REG_INVARG; 286 } 287 if ((flags®_QUOTE) && (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))) { 288 FreeVars(v); 289 return REG_INVARG; 290 } 291 if (!(flags®_EXTENDED) && (flags®_ADVF)) { 292 FreeVars(v); 293 return REG_INVARG; 294 } 295 296 /* 297 * Initial setup (after which freev() is callable). 298 */ 299 300 v->re = re; 301 v->now = string; 302 v->stop = v->now + len; 303 v->savenow = v->savestop = NULL; 304 v->err = 0; 305 v->cflags = flags; 306 v->nsubexp = 0; 307 v->subs = v->sub10; 308 v->nsubs = 10; 309 for (j = 0; j < v->nsubs; j++) { 310 v->subs[j] = NULL; 311 } 312 v->nfa = NULL; 313 v->cm = NULL; 314 v->nlcolor = COLORLESS; 315 v->wordchrs = NULL; 316 v->tree = NULL; 317 v->treechain = NULL; 318 v->treefree = NULL; 319 v->cv = NULL; 320 v->cv2 = NULL; 321 v->lacons = NULL; 322 v->nlacons = 0; 323 re->re_magic = REMAGIC; 324 re->re_info = 0; /* bits get set during parse */ 325 re->re_csize = sizeof(chr); 326 re->re_guts = NULL; 327 re->re_fns = VS(&functions); 328 329 /* 330 * More complex setup, malloced things. 331 */ 332 333 re->re_guts = VS(MALLOC(sizeof(struct guts))); 334 if (re->re_guts == NULL) { 335 return freev(v, REG_ESPACE); 336 } 337 g = (struct guts *) re->re_guts; 338 g->tree = NULL; 339 initcm(v, &g->cmap); 340 v->cm = &g->cmap; 341 g->lacons = NULL; 342 g->nlacons = 0; 343 ZAPCNFA(g->search); 344 v->nfa = newnfa(v, v->cm, NULL); 345 CNOERR(); 346 v->cv = newcvec(100, 20); 347 if (v->cv == NULL) { 348 return freev(v, REG_ESPACE); 349 } 350 351 /* 352 * Parsing. 353 */ 354 355 lexstart(v); /* also handles prefixes */ 356 if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) { 357 /* 358 * Assign newline a unique color. 359 */ 360 361 v->nlcolor = subcolor(v->cm, newline()); 362 okcolors(v->nfa, v->cm); 363 } 364 CNOERR(); 365 v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final); 366 assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ 367 CNOERR(); 368 assert(v->tree != NULL); 369 370 /* 371 * Finish setup of nfa and its subre tree. 372 */ 373 374 specialcolors(v->nfa); 375 CNOERR(); 376 if (debug != NULL) { 377 fprintf(debug, "\n\n\n========= RAW ==========\n"); 378 dumpnfa(v->nfa, debug); 379 dumpst(v->tree, debug, 1); 380 } 381 optst(v, v->tree); 382 v->ntree = numst(v->tree, 1); 383 markst(v->tree); 384 cleanst(v); 385 if (debug != NULL) { 386 fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); 387 dumpst(v->tree, debug, 1); 388 } 389 390 /* 391 * Build compacted NFAs for tree and lacons. 392 */ 393 394 re->re_info |= nfatree(v, v->tree, debug); 395 CNOERR(); 396 assert(v->nlacons == 0 || v->lacons != NULL); 397 for (i = 1; i < v->nlacons; i++) { 398 if (debug != NULL) { 399 fprintf(debug, "\n\n\n========= LA%d ==========\n", i); 400 } 401 nfanode(v, &v->lacons[i], debug); 402 } 403 CNOERR(); 404 if (v->tree->flags&SHORTER) { 405 NOTE(REG_USHORTEST); 406 } 407 408 /* 409 * Build compacted NFAs for tree, lacons, fast search. 410 */ 411 412 if (debug != NULL) { 413 fprintf(debug, "\n\n\n========= SEARCH ==========\n"); 414 } 415 416 /* 417 * Can sacrifice main NFA now, so use it as work area. 418 */ 419 420 (DISCARD) optimize(v->nfa, debug); 421 CNOERR(); 422 makesearch(v, v->nfa); 423 CNOERR(); 424 compact(v->nfa, &g->search); 425 CNOERR(); 426 427 /* 428 * Looks okay, package it up. 429 */ 430 431 re->re_nsub = v->nsubexp; 432 v->re = NULL; /* freev no longer frees re */ 433 g->magic = GUTSMAGIC; 434 g->cflags = v->cflags; 435 g->info = re->re_info; 436 g->nsub = re->re_nsub; 437 g->tree = v->tree; 438 v->tree = NULL; 439 g->ntree = v->ntree; 440 g->compare = (v->cflags®_ICASE) ? casecmp : cmp; 441 g->lacons = v->lacons; 442 v->lacons = NULL; 443 g->nlacons = v->nlacons; 444 445 if (flags®_DUMP) { 446 dump(re, stdout); 447 } 448 449 assert(v->err == 0); 450 return freev(v, 0); 451} 452 453/* 454 - moresubs - enlarge subRE vector 455 ^ static void moresubs(struct vars *, int); 456 */ 457static void 458moresubs( 459 struct vars *v, 460 int wanted) /* want enough room for this one */ 461{ 462 struct subre **p; 463 size_t n; 464 465 assert(wanted > 0 && (size_t)wanted >= v->nsubs); 466 n = (size_t)wanted * 3 / 2 + 1; 467 if (v->subs == v->sub10) { 468 p = (struct subre **) MALLOC(n * sizeof(struct subre *)); 469 if (p != NULL) { 470 memcpy(p, v->subs, v->nsubs * sizeof(struct subre *)); 471 } 472 } else { 473 p = (struct subre **) REALLOC(v->subs, n*sizeof(struct subre *)); 474 } 475 if (p == NULL) { 476 ERR(REG_ESPACE); 477 return; 478 } 479 480 v->subs = p; 481 for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) { 482 *p = NULL; 483 } 484 assert(v->nsubs == n); 485 assert((size_t)wanted < v->nsubs); 486} 487 488/* 489 - freev - free vars struct's substructures where necessary 490 * Optionally does error-number setting, and always returns error code (if 491 * any), to make error-handling code terser. 492 ^ static int freev(struct vars *, int); 493 */ 494static int 495freev( 496 struct vars *v, 497 int err) 498{ 499 register int ret; 500 501 if (v->re != NULL) { 502 rfree(v->re); 503 } 504 if (v->subs != v->sub10) { 505 FREE(v->subs); 506 } 507 if (v->nfa != NULL) { 508 freenfa(v->nfa); 509 } 510 if (v->tree != NULL) { 511 freesubre(v, v->tree); 512 } 513 if (v->treechain != NULL) { 514 cleanst(v); 515 } 516 if (v->cv != NULL) { 517 freecvec(v->cv); 518 } 519 if (v->cv2 != NULL) { 520 freecvec(v->cv2); 521 } 522 if (v->lacons != NULL) { 523 freelacons(v->lacons, v->nlacons); 524 } 525 ERR(err); /* nop if err==0 */ 526 527 ret = v->err; 528 FreeVars(v); 529 return ret; 530} 531 532/* 533 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?) 534 * NFA must have been optimize()d already. 535 ^ static void makesearch(struct vars *, struct nfa *); 536 */ 537static void 538makesearch( 539 struct vars *v, 540 struct nfa *nfa) 541{ 542 struct arc *a, *b; 543 struct state *pre = nfa->pre; 544 struct state *s, *s2, *slist; 545 546 /* 547 * No loops are needed if it's anchored. 548 */ 549 550 for (a = pre->outs; a != NULL; a = a->outchain) { 551 assert(a->type == PLAIN); 552 if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) { 553 break; 554 } 555 } 556 if (a != NULL) { 557 /* 558 * Add implicit .* in front. 559 */ 560 561 rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); 562 563 /* 564 * And ^* and \A* too -- not always necessary, but harmless. 565 */ 566 567 newarc(nfa, PLAIN, nfa->bos[0], pre, pre); 568 newarc(nfa, PLAIN, nfa->bos[1], pre, pre); 569 } 570 571 /* 572 * Now here's the subtle part. Because many REs have no lookback 573 * constraints, often knowing when you were in the pre state tells you 574 * little; it's the next state(s) that are informative. But some of them 575 * may have other inarcs, i.e. it may be possible to make actual progress 576 * and then return to one of them. We must de-optimize such cases, 577 * splitting each such state into progress and no-progress states. 578 */ 579 580 /* 581 * First, make a list of the states. 582 */ 583 584 slist = NULL; 585 for (a=pre->outs ; a!=NULL ; a=a->outchain) { 586 s = a->to; 587 for (b=s->ins ; b!=NULL ; b=b->inchain) { 588 if (b->from != pre) { 589 break; 590 } 591 } 592 if (b != NULL && s->tmp == NULL) { 593 /* 594 * Must be split if not already in the list (fixes bugs 505048, 595 * 230589, 840258, 504785). 596 */ 597 598 s->tmp = slist; 599 slist = s; 600 } 601 } 602 603 /* 604 * Do the splits. 605 */ 606 607 for (s=slist ; s!=NULL ; s=s2) { 608 s2 = newstate(nfa); 609 610 copyouts(nfa, s, s2); 611 for (a=s->ins ; a!=NULL ; a=b) { 612 b = a->inchain; 613 614 if (a->from != pre) { 615 cparc(nfa, a, a->from, s2); 616 freearc(nfa, a); 617 } 618 } 619 s2 = s->tmp; 620 s->tmp = NULL; /* clean up while we're at it */ 621 } 622} 623 624/* 625 - parse - parse an RE 626 * This is actually just the top level, which parses a bunch of branches tied 627 * together with '|'. They appear in the tree as the left children of a chain 628 * of '|' subres. 629 ^ static struct subre *parse(struct vars *, int, int, struct state *, 630 ^ struct state *); 631 */ 632static struct subre * 633parse( 634 struct vars *v, 635 int stopper, /* EOS or ')' */ 636 int type, /* LACON (lookahead subRE) or PLAIN */ 637 struct state *init, /* initial state */ 638 struct state *final) /* final state */ 639{ 640 struct state *left, *right; /* scaffolding for branch */ 641 struct subre *branches; /* top level */ 642 struct subre *branch; /* current branch */ 643 struct subre *t; /* temporary */ 644 int firstbranch; /* is this the first branch? */ 645 646 assert(stopper == ')' || stopper == EOS); 647 648 branches = subre(v, '|', LONGER, init, final); 649 NOERRN(); 650 branch = branches; 651 firstbranch = 1; 652 do { /* a branch */ 653 if (!firstbranch) { 654 /* 655 * Need a place to hang the branch. 656 */ 657 658 branch->right = subre(v, '|', LONGER, init, final); 659 NOERRN(); 660 branch = branch->right; 661 } 662 firstbranch = 0; 663 left = newstate(v->nfa); 664 right = newstate(v->nfa); 665 NOERRN(); 666 EMPTYARC(init, left); 667 EMPTYARC(right, final); 668 NOERRN(); 669 branch->left = parsebranch(v, stopper, type, left, right, 0); 670 NOERRN(); 671 branch->flags |= UP(branch->flags | branch->left->flags); 672 if ((branch->flags &~ branches->flags) != 0) { /* new flags */ 673 for (t = branches; t != branch; t = t->right) { 674 t->flags |= branch->flags; 675 } 676 } 677 } while (EAT('|')); 678 assert(SEE(stopper) || SEE(EOS)); 679 680 if (!SEE(stopper)) { 681 assert(stopper == ')' && SEE(EOS)); 682 ERR(REG_EPAREN); 683 } 684 685 /* 686 * Optimize out simple cases. 687 */ 688 689 if (branch == branches) { /* only one branch */ 690 assert(branch->right == NULL); 691 t = branch->left; 692 branch->left = NULL; 693 freesubre(v, branches); 694 branches = t; 695 } else if (!MESSY(branches->flags)) { /* no interesting innards */ 696 freesubre(v, branches->left); 697 branches->left = NULL; 698 freesubre(v, branches->right); 699 branches->right = NULL; 700 branches->op = '='; 701 } 702 703 return branches; 704} 705 706/* 707 - parsebranch - parse one branch of an RE 708 * This mostly manages concatenation, working closely with parseqatom(). 709 * Concatenated things are bundled up as much as possible, with separate 710 * ',' nodes introduced only when necessary due to substructure. 711 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *, 712 ^ struct state *, int); 713 */ 714static struct subre * 715parsebranch( 716 struct vars *v, 717 int stopper, /* EOS or ')' */ 718 int type, /* LACON (lookahead subRE) or PLAIN */ 719 struct state *left, /* leftmost state */ 720 struct state *right, /* rightmost state */ 721 int partial) /* is this only part of a branch? */ 722{ 723 struct state *lp; /* left end of current construct */ 724 int seencontent; /* is there anything in this branch yet? */ 725 struct subre *t; 726 727 lp = left; 728 seencontent = 0; 729 t = subre(v, '=', 0, left, right); /* op '=' is tentative */ 730 NOERRN(); 731 while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) { 732 if (seencontent) { /* implicit concat operator */ 733 lp = newstate(v->nfa); 734 NOERRN(); 735 moveins(v->nfa, right, lp); 736 } 737 seencontent = 1; 738 739 /* NB, recursion in parseqatom() may swallow rest of branch */ 740 parseqatom(v, stopper, type, lp, right, t); 741 } 742 743 if (!seencontent) { /* empty branch */ 744 if (!partial) { 745 NOTE(REG_UUNSPEC); 746 } 747 assert(lp == left); 748 EMPTYARC(left, right); 749 } 750 751 return t; 752} 753 754/* 755 - parseqatom - parse one quantified atom or constraint of an RE 756 * The bookkeeping near the end cooperates very closely with parsebranch(); in 757 * particular, it contains a recursion that can involve parsing the rest of 758 * the branch, making this function's name somewhat inaccurate. 759 ^ static void parseqatom(struct vars *, int, int, struct state *, 760 ^ struct state *, struct subre *); 761 */ 762static void 763parseqatom( 764 struct vars *v, 765 int stopper, /* EOS or ')' */ 766 int type, /* LACON (lookahead subRE) or PLAIN */ 767 struct state *lp, /* left state to hang it on */ 768 struct state *rp, /* right state to hang it on */ 769 struct subre *top) /* subtree top */ 770{ 771 struct state *s; /* temporaries for new states */ 772 struct state *s2; 773#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) 774 int m, n; 775 struct subre *atom; /* atom's subtree */ 776 struct subre *t; 777 int cap; /* capturing parens? */ 778 int pos; /* positive lookahead? */ 779 int subno; /* capturing-parens or backref number */ 780 int atomtype; 781 int qprefer; /* quantifier short/long preference */ 782 int f; 783 struct subre **atomp; /* where the pointer to atom is */ 784 785 /* 786 * Initial bookkeeping. 787 */ 788 789 atom = NULL; 790 assert(lp->nouts == 0); /* must string new code */ 791 assert(rp->nins == 0); /* between lp and rp */ 792 subno = 0; /* just to shut lint up */ 793 794 /* 795 * An atom or constraint... 796 */ 797 798 atomtype = v->nexttype; 799 switch (atomtype) { 800 /* first, constraints, which end by returning */ 801 case '^': 802 ARCV('^', 1); 803 if (v->cflags®_NLANCH) { 804 ARCV(BEHIND, v->nlcolor); 805 } 806 NEXT(); 807 return; 808 case '$': 809 ARCV('$', 1); 810 if (v->cflags®_NLANCH) { 811 ARCV(AHEAD, v->nlcolor); 812 } 813 NEXT(); 814 return; 815 case SBEGIN: 816 ARCV('^', 1); /* BOL */ 817 ARCV('^', 0); /* or BOS */ 818 NEXT(); 819 return; 820 case SEND: 821 ARCV('$', 1); /* EOL */ 822 ARCV('$', 0); /* or EOS */ 823 NEXT(); 824 return; 825 case '<': 826 wordchrs(v); /* does NEXT() */ 827 s = newstate(v->nfa); 828 NOERR(); 829 nonword(v, BEHIND, lp, s); 830 word(v, AHEAD, s, rp); 831 return; 832 case '>': 833 wordchrs(v); /* does NEXT() */ 834 s = newstate(v->nfa); 835 NOERR(); 836 word(v, BEHIND, lp, s); 837 nonword(v, AHEAD, s, rp); 838 return; 839 case WBDRY: 840 wordchrs(v); /* does NEXT() */ 841 s = newstate(v->nfa); 842 NOERR(); 843 nonword(v, BEHIND, lp, s); 844 word(v, AHEAD, s, rp); 845 s = newstate(v->nfa); 846 NOERR(); 847 word(v, BEHIND, lp, s); 848 nonword(v, AHEAD, s, rp); 849 return; 850 case NWBDRY: 851 wordchrs(v); /* does NEXT() */ 852 s = newstate(v->nfa); 853 NOERR(); 854 word(v, BEHIND, lp, s); 855 word(v, AHEAD, s, rp); 856 s = newstate(v->nfa); 857 NOERR(); 858 nonword(v, BEHIND, lp, s); 859 nonword(v, AHEAD, s, rp); 860 return; 861 case LACON: /* lookahead constraint */ 862 pos = v->nextvalue; 863 NEXT(); 864 s = newstate(v->nfa); 865 s2 = newstate(v->nfa); 866 NOERR(); 867 t = parse(v, ')', LACON, s, s2); 868 freesubre(v, t); /* internal structure irrelevant */ 869 assert(SEE(')') || ISERR()); 870 NEXT(); 871 n = newlacon(v, s, s2, pos); 872 NOERR(); 873 ARCV(LACON, n); 874 return; 875 876 /* 877 * Then errors, to get them out of the way. 878 */ 879 880 case '*': 881 case '+': 882 case '?': 883 case '{': 884 ERR(REG_BADRPT); 885 return; 886 default: 887 ERR(REG_ASSERT); 888 return; 889 890 /* 891 * Then plain characters, and minor variants on that theme. 892 */ 893 894 case ')': /* unbalanced paren */ 895 if ((v->cflags®_ADVANCED) != REG_EXTENDED) { 896 ERR(REG_EPAREN); 897 return; 898 } 899 900 /* 901 * Legal in EREs due to specification botch. 902 */ 903 904 NOTE(REG_UPBOTCH); 905 /* fallthrough into case PLAIN */ 906 case PLAIN: 907 onechr(v, v->nextvalue, lp, rp); 908 okcolors(v->nfa, v->cm); 909 NOERR(); 910 NEXT(); 911 break; 912 case '[': 913 if (v->nextvalue == 1) { 914 bracket(v, lp, rp); 915 } else { 916 cbracket(v, lp, rp); 917 } 918 assert(SEE(']') || ISERR()); 919 NEXT(); 920 break; 921 case '.': 922 rainbow(v->nfa, v->cm, PLAIN, 923 (v->cflags®_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp); 924 NEXT(); 925 break; 926 927 /* 928 * And finally the ugly stuff. 929 */ 930 931 case '(': /* value flags as capturing or non */ 932 cap = (type == LACON) ? 0 : v->nextvalue; 933 if (cap) { 934 v->nsubexp++; 935 subno = v->nsubexp; 936 if ((size_t)subno >= v->nsubs) { 937 moresubs(v, subno); 938 } 939 assert((size_t)subno < v->nsubs); 940 } else { 941 atomtype = PLAIN; /* something that's not '(' */ 942 } 943 NEXT(); 944 945 /* 946 * Need new endpoints because tree will contain pointers. 947 */ 948 949 s = newstate(v->nfa); 950 s2 = newstate(v->nfa); 951 NOERR(); 952 EMPTYARC(lp, s); 953 EMPTYARC(s2, rp); 954 NOERR(); 955 atom = parse(v, ')', PLAIN, s, s2); 956 assert(SEE(')') || ISERR()); 957 NEXT(); 958 NOERR(); 959 if (cap) { 960 v->subs[subno] = atom; 961 t = subre(v, '(', atom->flags|CAP, lp, rp); 962 NOERR(); 963 t->subno = subno; 964 t->left = atom; 965 atom = t; 966 } 967 968 /* 969 * Postpone everything else pending possible {0}. 970 */ 971 972 break; 973 case BACKREF: /* the Feature From The Black Lagoon */ 974 INSIST(type != LACON, REG_ESUBREG); 975 INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); 976 INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); 977 NOERR(); 978 assert(v->nextvalue > 0); 979 atom = subre(v, 'b', BACKR, lp, rp); 980 subno = v->nextvalue; 981 atom->subno = subno; 982 EMPTYARC(lp, rp); /* temporarily, so there's something */ 983 NEXT(); 984 break; 985 } 986 987 /* 988 * ...and an atom may be followed by a quantifier. 989 */ 990 991 switch (v->nexttype) { 992 case '*': 993 m = 0; 994 n = INFINITY; 995 qprefer = (v->nextvalue) ? LONGER : SHORTER; 996 NEXT(); 997 break; 998 case '+': 999 m = 1; 1000 n = INFINITY; 1001 qprefer = (v->nextvalue) ? LONGER : SHORTER; 1002 NEXT(); 1003 break; 1004 case '?': 1005 m = 0; 1006 n = 1; 1007 qprefer = (v->nextvalue) ? LONGER : SHORTER; 1008 NEXT(); 1009 break; 1010 case '{': 1011 NEXT(); 1012 m = scannum(v); 1013 if (EAT(',')) { 1014 if (SEE(DIGIT)) { 1015 n = scannum(v); 1016 } else { 1017 n = INFINITY; 1018 } 1019 if (m > n) { 1020 ERR(REG_BADBR); 1021 return; 1022 } 1023 1024 /* 1025 * {m,n} exercises preference, even if it's {m,m} 1026 */ 1027 1028 qprefer = (v->nextvalue) ? LONGER : SHORTER; 1029 } else { 1030 n = m; 1031 /* 1032 * {m} passes operand's preference through. 1033 */ 1034 1035 qprefer = 0; 1036 } 1037 if (!SEE('}')) { /* catches errors too */ 1038 ERR(REG_BADBR); 1039 return; 1040 } 1041 NEXT(); 1042 break; 1043 default: /* no quantifier */ 1044 m = n = 1; 1045 qprefer = 0; 1046 break; 1047 } 1048 1049 /* 1050 * Annoying special case: {0} or {0,0} cancels everything. 1051 */ 1052 1053 if (m == 0 && n == 0) { 1054 if (atom != NULL) { 1055 freesubre(v, atom); 1056 } 1057 if (atomtype == '(') { 1058 v->subs[subno] = NULL; 1059 } 1060 delsub(v->nfa, lp, rp); 1061 EMPTYARC(lp, rp); 1062 return; 1063 } 1064 1065 /* 1066 * If not a messy case, avoid hard part. 1067 */ 1068 1069 assert(!MESSY(top->flags)); 1070 f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); 1071 if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) { 1072 if (!(m == 1 && n == 1)) { 1073 repeat(v, lp, rp, m, n); 1074 } 1075 if (atom != NULL) { 1076 freesubre(v, atom); 1077 } 1078 top->flags = f; 1079 return; 1080 } 1081 1082 /* 1083 * hard part: something messy 1084 * That is, capturing parens, back reference, short/long clash, or an atom 1085 * with substructure containing one of those. 1086 */ 1087 1088 /* 1089 * Now we'll need a subre for the contents even if they're boring. 1090 */ 1091 1092 if (atom == NULL) { 1093 atom = subre(v, '=', 0, lp, rp); 1094 NOERR(); 1095 } 1096 1097 /* 1098 * Prepare a general-purpose state skeleton. 1099 * 1100 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] 1101 * / / 1102 * [lp] ----> [s2] ----bypass--------------------- 1103 * 1104 * where bypass is an empty, and prefix is some repetitions of atom 1105 */ 1106 1107 s = newstate(v->nfa); /* first, new endpoints for the atom */ 1108 s2 = newstate(v->nfa); 1109 NOERR(); 1110 moveouts(v->nfa, lp, s); 1111 moveins(v->nfa, rp, s2); 1112 NOERR(); 1113 atom->begin = s; 1114 atom->end = s2; 1115 s = newstate(v->nfa); /* and spots for prefix and bypass */ 1116 s2 = newstate(v->nfa); 1117 NOERR(); 1118 EMPTYARC(lp, s); 1119 EMPTYARC(lp, s2); 1120 NOERR(); 1121 1122 /* 1123 * Break remaining subRE into x{...} and what follows. 1124 */ 1125 1126 t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); 1127 t->left = atom; 1128 atomp = &t->left; 1129 1130 /* 1131 * Here we should recurse... but we must postpone that to the end. 1132 */ 1133 1134 /* 1135 * Split top into prefix and remaining. 1136 */ 1137 1138 assert(top->op == '=' && top->left == NULL && top->right == NULL); 1139 top->left = subre(v, '=', top->flags, top->begin, lp); 1140 top->op = '.'; 1141 top->right = t; 1142 1143 /* 1144 * If it's a backref, now is the time to replicate the subNFA. 1145 */ 1146 1147 if (atomtype == BACKREF) { 1148 assert(atom->begin->nouts == 1); /* just the EMPTY */ 1149 delsub(v->nfa, atom->begin, atom->end); 1150 assert(v->subs[subno] != NULL); 1151 1152 /* 1153 * And here's why the recursion got postponed: it must wait until the 1154 * skeleton is filled in, because it may hit a backref that wants to 1155 * copy the filled-in skeleton. 1156 */ 1157 1158 dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, 1159 atom->begin, atom->end); 1160 NOERR(); 1161 } 1162 1163 /* 1164 * It's quantifier time; first, turn x{0,...} into x{1,...}|empty 1165 */ 1166 1167 if (m == 0) { 1168 EMPTYARC(s2, atom->end);/* the bypass */ 1169 assert(PREF(qprefer) != 0); 1170 f = COMBINE(qprefer, atom->flags); 1171 t = subre(v, '|', f, lp, atom->end); 1172 NOERR(); 1173 t->left = atom; 1174 t->right = subre(v, '|', PREF(f), s2, atom->end); 1175 NOERR(); 1176 t->right->left = subre(v, '=', 0, s2, atom->end); 1177 NOERR(); 1178 *atomp = t; 1179 atomp = &t->left; 1180 m = 1; 1181 } 1182 1183 /* 1184 * Deal with the rest of the quantifier. 1185 */ 1186 1187 if (atomtype == BACKREF) { 1188 /* 1189 * Special case: backrefs have internal quantifiers. 1190 */ 1191 1192 EMPTYARC(s, atom->begin); /* empty prefix */ 1193 1194 /* 1195 * Just stuff everything into atom. 1196 */ 1197 1198 repeat(v, atom->begin, atom->end, m, n); 1199 atom->min = (short) m; 1200 atom->max = (short) n; 1201 atom->flags |= COMBINE(qprefer, atom->flags); 1202 } else if (m == 1 && n == 1) { 1203 /* 1204 * No/vacuous quantifier: done. 1205 */ 1206 1207 EMPTYARC(s, atom->begin); /* empty prefix */ 1208 } else { 1209 /* 1210 * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second 1211 * x 1212 */ 1213 1214 dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); 1215 assert(m >= 1 && m != INFINITY && n >= 1); 1216 repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1); 1217 f = COMBINE(qprefer, atom->flags); 1218 t = subre(v, '.', f, s, atom->end); /* prefix and atom */ 1219 NOERR(); 1220 t->left = subre(v, '=', PREF(f), s, atom->begin); 1221 NOERR(); 1222 t->right = atom; 1223 *atomp = t; 1224 } 1225 1226 /* 1227 * And finally, look after that postponed recursion. 1228 */ 1229 1230 t = top->right; 1231 if (!(SEE('|') || SEE(stopper) || SEE(EOS))) { 1232 t->right = parsebranch(v, stopper, type, atom->end, rp, 1); 1233 } else { 1234 EMPTYARC(atom->end, rp); 1235 t->right = subre(v, '=', 0, atom->end, rp); 1236 } 1237 assert(SEE('|') || SEE(stopper) || SEE(EOS)); 1238 t->flags |= COMBINE(t->flags, t->right->flags); 1239 top->flags |= COMBINE(top->flags, t->flags); 1240} 1241 1242/* 1243 - nonword - generate arcs for non-word-character ahead or behind 1244 ^ static void nonword(struct vars *, int, struct state *, struct state *); 1245 */ 1246static void 1247nonword( 1248 struct vars *v, 1249 int dir, /* AHEAD or BEHIND */ 1250 struct state *lp, 1251 struct state *rp) 1252{ 1253 int anchor = (dir == AHEAD) ? '$' : '^'; 1254 1255 assert(dir == AHEAD || dir == BEHIND); 1256 newarc(v->nfa, anchor, 1, lp, rp); 1257 newarc(v->nfa, anchor, 0, lp, rp); 1258 colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp); 1259 /* (no need for special attention to \n) */ 1260} 1261 1262/* 1263 - word - generate arcs for word character ahead or behind 1264 ^ static void word(struct vars *, int, struct state *, struct state *); 1265 */ 1266static void 1267word( 1268 struct vars *v, 1269 int dir, /* AHEAD or BEHIND */ 1270 struct state *lp, 1271 struct state *rp) 1272{ 1273 assert(dir == AHEAD || dir == BEHIND); 1274 cloneouts(v->nfa, v->wordchrs, lp, rp, dir); 1275 /* (no need for special attention to \n) */ 1276} 1277 1278/* 1279 - scannum - scan a number 1280 ^ static int scannum(struct vars *); 1281 */ 1282static int /* value, <= DUPMAX */ 1283scannum( 1284 struct vars *v) 1285{ 1286 int n = 0; 1287 1288 while (SEE(DIGIT) && n < DUPMAX) { 1289 n = n*10 + v->nextvalue; 1290 NEXT(); 1291 } 1292 if (SEE(DIGIT) || n > DUPMAX) { 1293 ERR(REG_BADBR); 1294 return 0; 1295 } 1296 return n; 1297} 1298 1299/* 1300 - repeat - replicate subNFA for quantifiers 1301 * The duplication sequences used here are chosen carefully so that any 1302 * pointers starting out pointing into the subexpression end up pointing into 1303 * the last occurrence. (Note that it may not be strung between the same left 1304 * and right end states, however!) This used to be important for the subRE 1305 * tree, although the important bits are now handled by the in-line code in 1306 * parse(), and when this is called, it doesn't matter any more. 1307 ^ static void repeat(struct vars *, struct state *, struct state *, int, int); 1308 */ 1309static void 1310repeat( 1311 struct vars *v, 1312 struct state *lp, 1313 struct state *rp, 1314 int m, 1315 int n) 1316{ 1317#define SOME 2 1318#define INF 3 1319#define PAIR(x, y) ((x)*4 + (y)) 1320#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) 1321 const int rm = REDUCE(m); 1322 const int rn = REDUCE(n); 1323 struct state *s, *s2; 1324 1325 switch (PAIR(rm, rn)) { 1326 case PAIR(0, 0): /* empty string */ 1327 delsub(v->nfa, lp, rp); 1328 EMPTYARC(lp, rp); 1329 break; 1330 case PAIR(0, 1): /* do as x| */ 1331 EMPTYARC(lp, rp); 1332 break; 1333 case PAIR(0, SOME): /* do as x{1,n}| */ 1334 repeat(v, lp, rp, 1, n); 1335 NOERR(); 1336 EMPTYARC(lp, rp); 1337 break; 1338 case PAIR(0, INF): /* loop x around */ 1339 s = newstate(v->nfa); 1340 NOERR(); 1341 moveouts(v->nfa, lp, s); 1342 moveins(v->nfa, rp, s); 1343 EMPTYARC(lp, s); 1344 EMPTYARC(s, rp); 1345 break; 1346 case PAIR(1, 1): /* no action required */ 1347 break; 1348 case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */ 1349 s = newstate(v->nfa); 1350 NOERR(); 1351 moveouts(v->nfa, lp, s); 1352 dupnfa(v->nfa, s, rp, lp, s); 1353 NOERR(); 1354 repeat(v, lp, s, 1, n-1); 1355 NOERR(); 1356 EMPTYARC(lp, s); 1357 break; 1358 case PAIR(1, INF): /* add loopback arc */ 1359 s = newstate(v->nfa); 1360 s2 = newstate(v->nfa); 1361 NOERR(); 1362 moveouts(v->nfa, lp, s); 1363 moveins(v->nfa, rp, s2); 1364 EMPTYARC(lp, s); 1365 EMPTYARC(s2, rp); 1366 EMPTYARC(s2, s); 1367 break; 1368 case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */ 1369 s = newstate(v->nfa); 1370 NOERR(); 1371 moveouts(v->nfa, lp, s); 1372 dupnfa(v->nfa, s, rp, lp, s); 1373 NOERR(); 1374 repeat(v, lp, s, m-1, n-1); 1375 break; 1376 case PAIR(SOME, INF): /* do as x{m-1,}x */ 1377 s = newstate(v->nfa); 1378 NOERR(); 1379 moveouts(v->nfa, lp, s); 1380 dupnfa(v->nfa, s, rp, lp, s); 1381 NOERR(); 1382 repeat(v, lp, s, m-1, n); 1383 break; 1384 default: 1385 ERR(REG_ASSERT); 1386 break; 1387 } 1388} 1389 1390/* 1391 - bracket - handle non-complemented bracket expression 1392 * Also called from cbracket for complemented bracket expressions. 1393 ^ static void bracket(struct vars *, struct state *, struct state *); 1394 */ 1395static void 1396bracket( 1397 struct vars *v, 1398 struct state *lp, 1399 struct state *rp) 1400{ 1401 assert(SEE('[')); 1402 NEXT(); 1403 while (!SEE(']') && !SEE(EOS)) { 1404 brackpart(v, lp, rp); 1405 } 1406 assert(SEE(']') || ISERR()); 1407 okcolors(v->nfa, v->cm); 1408} 1409 1410/* 1411 - cbracket - handle complemented bracket expression 1412 * We do it by calling bracket() with dummy endpoints, and then complementing 1413 * the result. The alternative would be to invoke rainbow(), and then delete 1414 * arcs as the b.e. is seen... but that gets messy. 1415 ^ static void cbracket(struct vars *, struct state *, struct state *); 1416 */ 1417static void 1418cbracket( 1419 struct vars *v, 1420 struct state *lp, 1421 struct state *rp) 1422{ 1423 struct state *left = newstate(v->nfa); 1424 struct state *right = newstate(v->nfa); 1425 1426 NOERR(); 1427 bracket(v, left, right); 1428 if (v->cflags®_NLSTOP) { 1429 newarc(v->nfa, PLAIN, v->nlcolor, left, right); 1430 } 1431 NOERR(); 1432 1433 assert(lp->nouts == 0); /* all outarcs will be ours */ 1434 1435 /* 1436 * Easy part of complementing, and all there is to do since the MCCE code 1437 * was removed. 1438 */ 1439 1440 colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); 1441 NOERR(); 1442 dropstate(v->nfa, left); 1443 assert(right->nins == 0); 1444 freestate(v->nfa, right); 1445 return; 1446} 1447 1448/* 1449 - brackpart - handle one item (or range) within a bracket expression 1450 ^ static void brackpart(struct vars *, struct state *, struct state *); 1451 */ 1452static void 1453brackpart( 1454 struct vars *v, 1455 struct state *lp, 1456 struct state *rp) 1457{ 1458 celt startc, endc; 1459 struct cvec *cv; 1460 const chr *startp, *endp; 1461 chr c[1]; 1462 1463 /* 1464 * Parse something, get rid of special cases, take shortcuts. 1465 */ 1466 1467 switch (v->nexttype) { 1468 case RANGE: /* a-b-c or other botch */ 1469 ERR(REG_ERANGE); 1470 return; 1471 break; 1472 case PLAIN: 1473 c[0] = v->nextvalue; 1474 NEXT(); 1475 1476 /* 1477 * Shortcut for ordinary chr (not range). 1478 */ 1479 1480 if (!SEE(RANGE)) { 1481 onechr(v, c[0], lp, rp); 1482 return; 1483 } 1484 startc = element(v, c, c+1); 1485 NOERR(); 1486 break; 1487 case COLLEL: 1488 startp = v->now; 1489 endp = scanplain(v); 1490 INSIST(startp < endp, REG_ECOLLATE); 1491 NOERR(); 1492 startc = element(v, startp, endp); 1493 NOERR(); 1494 break; 1495 case ECLASS: 1496 startp = v->now; 1497 endp = scanplain(v); 1498 INSIST(startp < endp, REG_ECOLLATE); 1499 NOERR(); 1500 startc = element(v, startp, endp); 1501 NOERR(); 1502 cv = eclass(v, startc, (v->cflags®_ICASE)); 1503 NOERR(); 1504 dovec(v, cv, lp, rp); 1505 return; 1506 break; 1507 case CCLASS: 1508 startp = v->now; 1509 endp = scanplain(v); 1510 INSIST(startp < endp, REG_ECTYPE); 1511 NOERR(); 1512 cv = cclass(v, startp, endp, (v->cflags®_ICASE)); 1513 NOERR(); 1514 dovec(v, cv, lp, rp); 1515 return; 1516 break; 1517 default: 1518 ERR(REG_ASSERT); 1519 return; 1520 break; 1521 } 1522 1523 if (SEE(RANGE)) { 1524 NEXT(); 1525 switch (v->nexttype) { 1526 case PLAIN: 1527 case RANGE: 1528 c[0] = v->nextvalue; 1529 NEXT(); 1530 endc = element(v, c, c+1); 1531 NOERR(); 1532 break; 1533 case COLLEL: 1534 startp = v->now; 1535 endp = scanplain(v); 1536 INSIST(startp < endp, REG_ECOLLATE); 1537 NOERR(); 1538 endc = element(v, startp, endp); 1539 NOERR(); 1540 break; 1541 default: 1542 ERR(REG_ERANGE); 1543 return; 1544 break; 1545 } 1546 } else { 1547 endc = startc; 1548 } 1549 1550 /* 1551 * Ranges are unportable. Actually, standard C does guarantee that digits 1552 * are contiguous, but making that an exception is just too complicated. 1553 */ 1554 1555 if (startc != endc) { 1556 NOTE(REG_UUNPORT); 1557 } 1558 cv = range(v, startc, endc, (v->cflags®_ICASE)); 1559 NOERR(); 1560 dovec(v, cv, lp, rp); 1561} 1562 1563/* 1564 - scanplain - scan PLAIN contents of [. etc. 1565 * Certain bits of trickery in lex.c know that this code does not try to look 1566 * past the final bracket of the [. etc. 1567 ^ static const chr *scanplain(struct vars *); 1568 */ 1569static const chr * /* just after end of sequence */ 1570scanplain( 1571 struct vars *v) 1572{ 1573 const chr *endp; 1574 1575 assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS)); 1576 NEXT(); 1577 1578 endp = v->now; 1579 while (SEE(PLAIN)) { 1580 endp = v->now; 1581 NEXT(); 1582 } 1583 1584 assert(SEE(END) || ISERR()); 1585 NEXT(); 1586 1587 return endp; 1588} 1589 1590/* 1591 - onechr - fill in arcs for a plain character, and possible case complements 1592 * This is mostly a shortcut for efficient handling of the common case. 1593 ^ static void onechr(struct vars *, pchr, struct state *, struct state *); 1594 */ 1595static void 1596onechr( 1597 struct vars *v, 1598 pchr c, 1599 struct state *lp, 1600 struct state *rp) 1601{ 1602 if (!(v->cflags®_ICASE)) { 1603 newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp); 1604 return; 1605 } 1606 1607 /* 1608 * Rats, need general case anyway... 1609 */ 1610 1611 dovec(v, allcases(v, c), lp, rp); 1612} 1613 1614/* 1615 - dovec - fill in arcs for each element of a cvec 1616 ^ static void dovec(struct vars *, struct cvec *, struct state *, 1617 ^ struct state *); 1618 */ 1619static void 1620dovec( 1621 struct vars *v, 1622 struct cvec *cv, 1623 struct state *lp, 1624 struct state *rp) 1625{ 1626 chr ch, from, to; 1627 const chr *p; 1628 int i; 1629 1630 for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) { 1631 ch = *p; 1632 newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp); 1633 } 1634 1635 for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) { 1636 from = *p; 1637 to = *(p+1); 1638 if (from <= to) { 1639 subrange(v, from, to, lp, rp); 1640 } 1641 } 1642 1643} 1644 1645/* 1646 - wordchrs - set up word-chr list for word-boundary stuff, if needed 1647 * The list is kept as a bunch of arcs between two dummy states; it's disposed 1648 * of by the unreachable-states sweep in NFA optimization. Does NEXT(). Must 1649 * not be called from any unusual lexical context. This should be reconciled 1650 * with the \w etc. handling in lex.c, and should be cleaned up to reduce 1651 * dependencies on input scanning. 1652 ^ static void wordchrs(struct vars *); 1653 */ 1654static void 1655wordchrs( 1656 struct vars *v) 1657{ 1658 struct state *left, *right; 1659 1660 if (v->wordchrs != NULL) { 1661 NEXT(); /* for consistency */ 1662 return; 1663 } 1664 1665 left = newstate(v->nfa); 1666 right = newstate(v->nfa); 1667 NOERR(); 1668 1669 /* 1670 * Fine point: implemented with [::], and lexer will set REG_ULOCALE. 1671 */ 1672 1673 lexword(v); 1674 NEXT(); 1675 assert(v->savenow != NULL && SEE('[')); 1676 bracket(v, left, right); 1677 assert((v->savenow != NULL && SEE(']')) || ISERR()); 1678 NEXT(); 1679 NOERR(); 1680 v->wordchrs = left; 1681} 1682 1683/* 1684 - subre - allocate a subre 1685 ^ static struct subre *subre(struct vars *, int, int, struct state *, 1686 ^ struct state *); 1687 */ 1688static struct subre * 1689subre( 1690 struct vars *v, 1691 int op, 1692 int flags, 1693 struct state *begin, 1694 struct state *end) 1695{ 1696 struct subre *ret = v->treefree; 1697 1698 if (ret != NULL) { 1699 v->treefree = ret->left; 1700 } else { 1701 ret = (struct subre *) MALLOC(sizeof(struct subre)); 1702 if (ret == NULL) { 1703 ERR(REG_ESPACE); 1704 return NULL; 1705 } 1706 ret->chain = v->treechain; 1707 v->treechain = ret; 1708 } 1709 1710 assert(strchr("|.b(=", op) != NULL); 1711 1712 ret->op = op; 1713 ret->flags = flags; 1714 ret->retry = 0; 1715 ret->subno = 0; 1716 ret->min = ret->max = 1; 1717 ret->left = NULL; 1718 ret->right = NULL; 1719 ret->begin = begin; 1720 ret->end = end; 1721 ZAPCNFA(ret->cnfa); 1722 1723 return ret; 1724} 1725 1726/* 1727 - freesubre - free a subRE subtree 1728 ^ static void freesubre(struct vars *, struct subre *); 1729 */ 1730static void 1731freesubre( 1732 struct vars *v, /* might be NULL */ 1733 struct subre *sr) 1734{ 1735 if (sr == NULL) { 1736 return; 1737 } 1738 1739 if (sr->left != NULL) { 1740 freesubre(v, sr->left); 1741 } 1742 if (sr->right != NULL) { 1743 freesubre(v, sr->right); 1744 } 1745 1746 freesrnode(v, sr); 1747} 1748 1749/* 1750 - freesrnode - free one node in a subRE subtree 1751 ^ static void freesrnode(struct vars *, struct subre *); 1752 */ 1753static void 1754freesrnode( 1755 struct vars *v, /* might be NULL */ 1756 struct subre *sr) 1757{ 1758 if (sr == NULL) { 1759 return; 1760 } 1761 1762 if (!NULLCNFA(sr->cnfa)) { 1763 freecnfa(&sr->cnfa); 1764 } 1765 sr->flags = 0; 1766 1767 if (v != NULL) { 1768 sr->left = v->treefree; 1769 v->treefree = sr; 1770 } else { 1771 FREE(sr); 1772 } 1773} 1774 1775/* 1776 - optst - optimize a subRE subtree 1777 ^ static void optst(struct vars *, struct subre *); 1778 */ 1779static void 1780optst( 1781 struct vars *v, 1782 struct subre *t) 1783{ 1784 /* 1785 * DGP (2007-11-13): I assume it was the programmer's intent to eventually 1786 * come back and add code to optimize subRE trees, but the routine coded 1787 * just spends effort traversing the tree and doing nothing. We can do 1788 * nothing with less effort. 1789 */ 1790 1791 return; 1792} 1793 1794/* 1795 - numst - number tree nodes (assigning retry indexes) 1796 ^ static int numst(struct subre *, int); 1797 */ 1798static int /* next number */ 1799numst( 1800 struct subre *t, 1801 int start) /* starting point for subtree numbers */ 1802{ 1803 int i; 1804 1805 assert(t != NULL); 1806 1807 i = start; 1808 t->retry = (short) i++; 1809 if (t->left != NULL) { 1810 i = numst(t->left, i); 1811 } 1812 if (t->right != NULL) { 1813 i = numst(t->right, i); 1814 } 1815 return i; 1816} 1817 1818/* 1819 - markst - mark tree nodes as INUSE 1820 ^ static void markst(struct subre *); 1821 */ 1822static void 1823markst( 1824 struct subre *t) 1825{ 1826 assert(t != NULL); 1827 1828 t->flags |= INUSE; 1829 if (t->left != NULL) { 1830 markst(t->left); 1831 } 1832 if (t->right != NULL) { 1833 markst(t->right); 1834 } 1835} 1836 1837/* 1838 - cleanst - free any tree nodes not marked INUSE 1839 ^ static void cleanst(struct vars *); 1840 */ 1841static void 1842cleanst( 1843 struct vars *v) 1844{ 1845 struct subre *t; 1846 struct subre *next; 1847 1848 for (t = v->treechain; t != NULL; t = next) { 1849 next = t->chain; 1850 if (!(t->flags&INUSE)) { 1851 FREE(t); 1852 } 1853 } 1854 v->treechain = NULL; 1855 v->treefree = NULL; /* just on general principles */ 1856} 1857 1858/* 1859 - nfatree - turn a subRE subtree into a tree of compacted NFAs 1860 ^ static long nfatree(struct vars *, struct subre *, FILE *); 1861 */ 1862static long /* optimize results from top node */ 1863nfatree( 1864 struct vars *v, 1865 struct subre *t, 1866 FILE *f) /* for debug output */ 1867{ 1868 assert(t != NULL && t->begin != NULL); 1869 1870 if (t->left != NULL) { 1871 (DISCARD) nfatree(v, t->left, f); 1872 } 1873 if (t->right != NULL) { 1874 (DISCARD) nfatree(v, t->right, f); 1875 } 1876 1877 return nfanode(v, t, f); 1878} 1879 1880/* 1881 - nfanode - do one NFA for nfatree 1882 ^ static long nfanode(struct vars *, struct subre *, FILE *); 1883 */ 1884static long /* optimize results */ 1885nfanode( 1886 struct vars *v, 1887 struct subre *t, 1888 FILE *f) /* for debug output */ 1889{ 1890 struct nfa *nfa; 1891 long ret = 0; 1892 char idbuf[50]; 1893 1894 assert(t->begin != NULL); 1895 1896 if (f != NULL) { 1897 fprintf(f, "\n\n\n========= TREE NODE %s ==========\n", 1898 stid(t, idbuf, sizeof(idbuf))); 1899 } 1900 nfa = newnfa(v, v->cm, v->nfa); 1901 NOERRZ(); 1902 dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); 1903 if (!ISERR()) { 1904 specialcolors(nfa); 1905 ret = optimize(nfa, f); 1906 } 1907 if (!ISERR()) { 1908 compact(nfa, &t->cnfa); 1909 } 1910 1911 freenfa(nfa); 1912 return ret; 1913} 1914 1915/* 1916 - newlacon - allocate a lookahead-constraint subRE 1917 ^ static int newlacon(struct vars *, struct state *, struct state *, int); 1918 */ 1919static int /* lacon number */ 1920newlacon( 1921 struct vars *v, 1922 struct state *begin, 1923 struct state *end, 1924 int pos) 1925{ 1926 struct subre *sub; 1927 int n; 1928 1929 if (v->nlacons == 0) { 1930 v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); 1931 n = 1; /* skip 0th */ 1932 v->nlacons = 2; 1933 } else { 1934 v->lacons = (struct subre *) REALLOC(v->lacons, 1935 (v->nlacons+1)*sizeof(struct subre)); 1936 n = v->nlacons++; 1937 } 1938 1939 if (v->lacons == NULL) { 1940 ERR(REG_ESPACE); 1941 return 0; 1942 } 1943 1944 sub = &v->lacons[n]; 1945 sub->begin = begin; 1946 sub->end = end; 1947 sub->subno = pos; 1948 ZAPCNFA(sub->cnfa); 1949 return n; 1950} 1951 1952/* 1953 - freelacons - free lookahead-constraint subRE vector 1954 ^ static void freelacons(struct subre *, int); 1955 */ 1956static void 1957freelacons( 1958 struct subre *subs, 1959 int n) 1960{ 1961 struct subre *sub; 1962 int i; 1963 1964 assert(n > 0); 1965 for (sub=subs+1, i=n-1; i>0; sub++, i--) { /* no 0th */ 1966 if (!NULLCNFA(sub->cnfa)) { 1967 freecnfa(&sub->cnfa); 1968 } 1969 } 1970 FREE(subs); 1971} 1972 1973/* 1974 - rfree - free a whole RE (insides of regfree) 1975 ^ static void rfree(regex_t *); 1976 */ 1977static void 1978rfree( 1979 regex_t *re) 1980{ 1981 struct guts *g; 1982 1983 if (re == NULL || re->re_magic != REMAGIC) { 1984 return; 1985 } 1986 1987 re->re_magic = 0; /* invalidate RE */ 1988 g = (struct guts *) re->re_guts; 1989 re->re_guts = NULL; 1990 re->re_fns = NULL; 1991 g->magic = 0; 1992 freecm(&g->cmap); 1993 if (g->tree != NULL) { 1994 freesubre(NULL, g->tree); 1995 } 1996 if (g->lacons != NULL) { 1997 freelacons(g->lacons, g->nlacons); 1998 } 1999 if (!NULLCNFA(g->search)) { 2000 freecnfa(&g->search); 2001 } 2002 FREE(g); 2003} 2004 2005/* 2006 - dump - dump an RE in human-readable form 2007 ^ static void dump(regex_t *, FILE *); 2008 */ 2009static void 2010dump( 2011 regex_t *re, 2012 FILE *f) 2013{ 2014#ifdef REG_DEBUG 2015 struct guts *g; 2016 int i; 2017 2018 if (re->re_magic != REMAGIC) { 2019 fprintf(f, "bad magic number (0x%x not 0x%x)\n", 2020 re->re_magic, REMAGIC); 2021 } 2022 if (re->re_guts == NULL) { 2023 fprintf(f, "NULL guts!!!\n"); 2024 return; 2025 } 2026 g = (struct guts *) re->re_guts; 2027 if (g->magic != GUTSMAGIC) { 2028 fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", 2029 g->magic, GUTSMAGIC); 2030 } 2031 2032 fprintf(f, "\n\n\n========= DUMP ==========\n"); 2033 fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", 2034 re->re_nsub, re->re_info, re->re_csize, g->ntree); 2035 2036 dumpcolors(&g->cmap, f); 2037 if (!NULLCNFA(g->search)) { 2038 printf("\nsearch:\n"); 2039 dumpcnfa(&g->search, f); 2040 } 2041 for (i = 1; i < g->nlacons; i++) { 2042 fprintf(f, "\nla%d (%s):\n", i, 2043 (g->lacons[i].subno) ? "positive" : "negative"); 2044 dumpcnfa(&g->lacons[i].cnfa, f); 2045 } 2046 fprintf(f, "\n"); 2047 dumpst(g->tree, f, 0); 2048#endif 2049} 2050 2051/* 2052 - dumpst - dump a subRE tree 2053 ^ static void dumpst(struct subre *, FILE *, int); 2054 */ 2055static void 2056dumpst( 2057 struct subre *t, 2058 FILE *f, 2059 int nfapresent) /* is the original NFA still around? */ 2060{ 2061 if (t == NULL) { 2062 fprintf(f, "null tree\n"); 2063 } else { 2064 stdump(t, f, nfapresent); 2065 } 2066 fflush(f); 2067} 2068 2069/* 2070 - stdump - recursive guts of dumpst 2071 ^ static void stdump(struct subre *, FILE *, int); 2072 */ 2073static void 2074stdump( 2075 struct subre *t, 2076 FILE *f, 2077 int nfapresent) /* is the original NFA still around? */ 2078{ 2079 char idbuf[50]; 2080 2081 fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op); 2082 if (t->flags&LONGER) { 2083 fprintf(f, " longest"); 2084 } 2085 if (t->flags&SHORTER) { 2086 fprintf(f, " shortest"); 2087 } 2088 if (t->flags&MIXED) { 2089 fprintf(f, " hasmixed"); 2090 } 2091 if (t->flags&CAP) { 2092 fprintf(f, " hascapture"); 2093 } 2094 if (t->flags&BACKR) { 2095 fprintf(f, " hasbackref"); 2096 } 2097 if (!(t->flags&INUSE)) { 2098 fprintf(f, " UNUSED"); 2099 } 2100 if (t->subno != 0) { 2101 fprintf(f, " (#%d)", t->subno); 2102 } 2103 if (t->min != 1 || t->max != 1) { 2104 fprintf(f, " {%d,", t->min); 2105 if (t->max != INFINITY) { 2106 fprintf(f, "%d", t->max); 2107 } 2108 fprintf(f, "}"); 2109 } 2110 if (nfapresent) { 2111 fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no); 2112 } 2113 if (t->left != NULL) { 2114 fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); 2115 } 2116 if (t->right != NULL) { 2117 fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); 2118 } 2119 if (!NULLCNFA(t->cnfa)) { 2120 fprintf(f, "\n"); 2121 dumpcnfa(&t->cnfa, f); 2122 } 2123 fprintf(f, "\n"); 2124 if (t->left != NULL) { 2125 stdump(t->left, f, nfapresent); 2126 } 2127 if (t->right != NULL) { 2128 stdump(t->right, f, nfapresent); 2129 } 2130} 2131 2132/* 2133 - stid - identify a subtree node for dumping 2134 ^ static char *stid(struct subre *, char *, size_t); 2135 */ 2136static const char * /* points to buf or constant string */ 2137stid( 2138 struct subre *t, 2139 char *buf, 2140 size_t bufsize) 2141{ 2142 /* 2143 * Big enough for hex int or decimal t->retry? 2144 */ 2145 2146 if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) { 2147 return "unable"; 2148 } 2149 if (t->retry != 0) { 2150 sprintf(buf, "%d", t->retry); 2151 } else { 2152 sprintf(buf, "%p", t); 2153 } 2154 return buf; 2155} 2156 2157#include "regc_lex.c" 2158#include "regc_color.c" 2159#include "regc_nfa.c" 2160#include "regc_cvec.c" 2161#include "regc_locale.c" 2162 2163/* 2164 * Local Variables: 2165 * mode: c 2166 * c-basic-offset: 4 2167 * fill-column: 78 2168 * End: 2169 */ 2170