1/**************************************************************** 2Copyright (C) Lucent Technologies 1997 3All Rights Reserved 4 5Permission to use, copy, modify, and distribute this software and 6its documentation for any purpose and without fee is hereby 7granted, provided that the above copyright notice appear in all 8copies and that both that the copyright notice and this 9permission notice and warranty disclaimer appear in supporting 10documentation, and that the name Lucent Technologies or any of 11its entities not be used in advertising or publicity pertaining 12to distribution of the software without specific, written prior 13permission. 14 15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22THIS SOFTWARE. 23****************************************************************/ 24 25/* lasciate ogne speranza, voi ch'intrate. */ 26 27#if HAVE_NBTOOL_CONFIG_H 28#include "nbtool_config.h" 29#endif 30 31#define DEBUG 32 33#include <ctype.h> 34#include <limits.h> 35#include <stdio.h> 36#include <string.h> 37#include <stdlib.h> 38#include "awk.h" 39#include "awkgram.h" 40 41#define MAXLIN 22 42 43#define type(v) (v)->nobj /* badly overloaded here */ 44#define info(v) (v)->ntype /* badly overloaded here */ 45#define left(v) (v)->narg[0] 46#define right(v) (v)->narg[1] 47#define parent(v) (v)->nnext 48 49#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 50#define ELEAF case EMPTYRE: /* empty string in regexp */ 51#define UNARY case STAR: case PLUS: case QUEST: 52 53/* encoding in tree Nodes: 54 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 55 left is index, right contains value or pointer to value 56 unary (STAR, PLUS, QUEST): left is child, right is null 57 binary (CAT, OR): left and right are children 58 parent contains pointer to parent 59*/ 60 61 62int *setvec; 63int *tmpset; 64int maxsetvec = 0; 65 66int rtok; /* next token in current re */ 67int rlxval; 68static const uschar *rlxstr; 69static const uschar *prestr; /* current position in current re */ 70static const uschar *lastre; /* origin of last re */ 71static const uschar *lastatom; /* origin of last Atom */ 72static const uschar *starttok; 73static const uschar *basestr; /* starts with original, replaced during 74 repetition processing */ 75static const uschar *firstbasestr; 76 77static int setcnt; 78static int poscnt; 79 80const char *patbeg; 81int patlen; 82 83#define NFA 128 /* cache this many dynamic fa's */ 84fa *fatab[NFA]; 85int nfatab = 0; /* entries in fatab */ 86 87static int * 88intalloc(size_t n, const char *f) 89{ 90 void *p = calloc(n, sizeof(int)); 91 if (p == NULL) 92 overflo(f); 93 return p; 94} 95 96static void 97resizesetvec(const char *f) 98{ 99 if (maxsetvec == 0) 100 maxsetvec = MAXLIN; 101 else 102 maxsetvec *= 4; 103 setvec = realloc(setvec, maxsetvec * sizeof(*setvec)); 104 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset)); 105 if (setvec == NULL || tmpset == NULL) 106 overflo(f); 107} 108 109static void 110resize_state(fa *f, int state) 111{ 112 void *p; 113 int i, new_count; 114 115 if (++state < f->state_count) 116 return; 117 118 new_count = state + 10; /* needs to be tuned */ 119 120 p = realloc(f->gototab, new_count * sizeof(f->gototab[0])); 121 if (p == NULL) 122 goto out; 123 f->gototab = p; 124 125 p = realloc(f->out, new_count * sizeof(f->out[0])); 126 if (p == NULL) 127 goto out; 128 f->out = p; 129 130 p = realloc(f->posns, new_count * sizeof(f->posns[0])); 131 if (p == NULL) 132 goto out; 133 f->posns = p; 134 135 for (i = f->state_count; i < new_count; ++i) { 136 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab)); 137 if (f->gototab[i] == NULL) 138 goto out; 139 f->out[i] = 0; 140 f->posns[i] = NULL; 141 } 142 f->state_count = new_count; 143 return; 144out: 145 overflo(__func__); 146} 147 148fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 149{ 150 int i, use, nuse; 151 fa *pfa; 152 static int now = 1; 153 154 if (setvec == NULL) { /* first time through any RE */ 155 resizesetvec(__func__); 156 } 157 158 if (compile_time != RUNNING) /* a constant for sure */ 159 return mkdfa(s, anchor); 160 for (i = 0; i < nfatab; i++) /* is it there already? */ 161 if (fatab[i]->anchor == anchor 162 && strcmp((const char *) fatab[i]->restr, s) == 0) { 163 fatab[i]->use = now++; 164 return fatab[i]; 165 } 166 pfa = mkdfa(s, anchor); 167 if (nfatab < NFA) { /* room for another */ 168 fatab[nfatab] = pfa; 169 fatab[nfatab]->use = now++; 170 nfatab++; 171 return pfa; 172 } 173 use = fatab[0]->use; /* replace least-recently used */ 174 nuse = 0; 175 for (i = 1; i < nfatab; i++) 176 if (fatab[i]->use < use) { 177 use = fatab[i]->use; 178 nuse = i; 179 } 180 freefa(fatab[nuse]); 181 fatab[nuse] = pfa; 182 pfa->use = now++; 183 return pfa; 184} 185 186fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 187 /* anchor = true for anchored matches, else false */ 188{ 189 Node *p, *p1; 190 fa *f; 191 192 firstbasestr = (const uschar *) s; 193 basestr = firstbasestr; 194 p = reparse(s); 195 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 196 /* put ALL STAR in front of reg. exp. */ 197 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 198 /* put FINAL after reg. exp. */ 199 200 poscnt = 0; 201 penter(p1); /* enter parent pointers and leaf indices */ 202 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 203 overflo(__func__); 204 f->accept = poscnt-1; /* penter has computed number of positions in re */ 205 cfoll(f, p1); /* set up follow sets */ 206 freetr(p1); 207 resize_state(f, 1); 208 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 209 f->posns[1] = intalloc(1, __func__); 210 *f->posns[1] = 0; 211 f->initstat = makeinit(f, anchor); 212 f->anchor = anchor; 213 f->restr = (uschar *) tostring(s); 214 if (firstbasestr != basestr) { 215 if (basestr) 216 xfree(basestr); 217 } 218 return f; 219} 220 221int makeinit(fa *f, bool anchor) 222{ 223 int i, k; 224 225 f->curstat = 2; 226 f->out[2] = 0; 227 k = *(f->re[0].lfollow); 228 xfree(f->posns[2]); 229 f->posns[2] = intalloc(k + 1, __func__); 230 for (i = 0; i <= k; i++) { 231 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 232 } 233 if ((f->posns[2])[1] == f->accept) 234 f->out[2] = 1; 235 for (i = 0; i < NCHARS; i++) 236 f->gototab[2][i] = 0; 237 f->curstat = cgoto(f, 2, HAT); 238 if (anchor) { 239 *f->posns[2] = k-1; /* leave out position 0 */ 240 for (i = 0; i < k; i++) { 241 (f->posns[0])[i] = (f->posns[2])[i]; 242 } 243 244 f->out[0] = f->out[2]; 245 if (f->curstat != 2) 246 --(*f->posns[f->curstat]); 247 } 248 return f->curstat; 249} 250 251void penter(Node *p) /* set up parent pointers and leaf indices */ 252{ 253 switch (type(p)) { 254 ELEAF 255 LEAF 256 info(p) = poscnt; 257 poscnt++; 258 break; 259 UNARY 260 penter(left(p)); 261 parent(left(p)) = p; 262 break; 263 case CAT: 264 case OR: 265 penter(left(p)); 266 penter(right(p)); 267 parent(left(p)) = p; 268 parent(right(p)) = p; 269 break; 270 case ZERO: 271 break; 272 default: /* can't happen */ 273 FATAL("can't happen: unknown type %d in penter", type(p)); 274 break; 275 } 276} 277 278void freetr(Node *p) /* free parse tree */ 279{ 280 switch (type(p)) { 281 ELEAF 282 LEAF 283 xfree(p); 284 break; 285 UNARY 286 case ZERO: 287 freetr(left(p)); 288 xfree(p); 289 break; 290 case CAT: 291 case OR: 292 freetr(left(p)); 293 freetr(right(p)); 294 xfree(p); 295 break; 296 default: /* can't happen */ 297 FATAL("can't happen: unknown type %d in freetr", type(p)); 298 break; 299 } 300} 301 302/* in the parsing of regular expressions, metacharacters like . have */ 303/* to be seen literally; \056 is not a metacharacter. */ 304 305int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 306{ /* only pick up one 8-bit byte (2 chars) */ 307 const uschar *p; 308 int n = 0; 309 int i; 310 311 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 312 if (isdigit(*p)) 313 n = 16 * n + *p - '0'; 314 else if (*p >= 'a' && *p <= 'f') 315 n = 16 * n + *p - 'a' + 10; 316 else if (*p >= 'A' && *p <= 'F') 317 n = 16 * n + *p - 'A' + 10; 318 } 319 *pp = p; 320 return n; 321} 322 323#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 324 325int quoted(const uschar **pp) /* pick up next thing after a \\ */ 326 /* and increment *pp */ 327{ 328 const uschar *p = *pp; 329 int c; 330 331 if ((c = *p++) == 't') 332 c = '\t'; 333 else if (c == 'n') 334 c = '\n'; 335 else if (c == 'f') 336 c = '\f'; 337 else if (c == 'r') 338 c = '\r'; 339 else if (c == 'b') 340 c = '\b'; 341 else if (c == 'v') 342 c = '\v'; 343 else if (c == 'a') 344 c = '\a'; 345 else if (c == '\\') 346 c = '\\'; 347 else if (c == 'x') { /* hexadecimal goo follows */ 348 c = hexstr(&p); /* this adds a null if number is invalid */ 349 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 350 int n = c - '0'; 351 if (isoctdigit(*p)) { 352 n = 8 * n + *p++ - '0'; 353 if (isoctdigit(*p)) 354 n = 8 * n + *p++ - '0'; 355 } 356 c = n; 357 } /* else */ 358 /* c = c; */ 359 *pp = p; 360 return c; 361} 362 363char *cclenter(const char *argp) /* add a character class */ 364{ 365 int i, c, c2; 366 const uschar *op, *p = (const uschar *) argp; 367 uschar *bp; 368 static uschar *buf = NULL; 369 static int bufsz = 100; 370 371 op = p; 372 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 373 FATAL("out of space for character class [%.10s...] 1", p); 374 bp = buf; 375 for (i = 0; (c = *p++) != 0; ) { 376 if (c == '\\') { 377 c = quoted(&p); 378 } else if (c == '-' && i > 0 && bp[-1] != 0) { 379 if (*p != 0) { 380 c = bp[-1]; 381 c2 = *p++; 382 if (c2 == '\\') 383 c2 = quoted(&p); 384 if (c > c2) { /* empty; ignore */ 385 bp--; 386 i--; 387 continue; 388 } 389 while (c < c2) { 390 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 391 FATAL("out of space for character class [%.10s...] 2", p); 392 *bp++ = ++c; 393 i++; 394 } 395 continue; 396 } 397 } 398 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 399 FATAL("out of space for character class [%.10s...] 3", p); 400 *bp++ = c; 401 i++; 402 } 403 *bp = 0; 404 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 405 xfree(op); 406 return (char *) tostring((char *) buf); 407} 408 409void overflo(const char *s) 410{ 411 FATAL("regular expression too big: out of space in %.30s...", s); 412} 413 414void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 415{ 416 int i; 417 int *p; 418 419 switch (type(v)) { 420 ELEAF 421 LEAF 422 f->re[info(v)].ltype = type(v); 423 f->re[info(v)].lval.np = right(v); 424 while (f->accept >= maxsetvec) { /* guessing here! */ 425 resizesetvec(__func__); 426 } 427 for (i = 0; i <= f->accept; i++) 428 setvec[i] = 0; 429 setcnt = 0; 430 follow(v); /* computes setvec and setcnt */ 431 p = intalloc(setcnt + 1, __func__); 432 f->re[info(v)].lfollow = p; 433 *p = setcnt; 434 for (i = f->accept; i >= 0; i--) 435 if (setvec[i] == 1) 436 *++p = i; 437 break; 438 UNARY 439 cfoll(f,left(v)); 440 break; 441 case CAT: 442 case OR: 443 cfoll(f,left(v)); 444 cfoll(f,right(v)); 445 break; 446 case ZERO: 447 break; 448 default: /* can't happen */ 449 FATAL("can't happen: unknown type %d in cfoll", type(v)); 450 } 451} 452 453int first(Node *p) /* collects initially active leaves of p into setvec */ 454 /* returns 0 if p matches empty string */ 455{ 456 int b, lp; 457 458 switch (type(p)) { 459 ELEAF 460 LEAF 461 lp = info(p); /* look for high-water mark of subscripts */ 462 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 463 resizesetvec(__func__); 464 } 465 if (type(p) == EMPTYRE) { 466 setvec[lp] = 0; 467 return(0); 468 } 469 if (setvec[lp] != 1) { 470 setvec[lp] = 1; 471 setcnt++; 472 } 473 if (type(p) == CCL && (*(char *) right(p)) == '\0') 474 return(0); /* empty CCL */ 475 return(1); 476 case PLUS: 477 if (first(left(p)) == 0) 478 return(0); 479 return(1); 480 case STAR: 481 case QUEST: 482 first(left(p)); 483 return(0); 484 case CAT: 485 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 486 return(1); 487 case OR: 488 b = first(right(p)); 489 if (first(left(p)) == 0 || b == 0) return(0); 490 return(1); 491 case ZERO: 492 return 0; 493 } 494 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 495 return(-1); 496} 497 498void follow(Node *v) /* collects leaves that can follow v into setvec */ 499{ 500 Node *p; 501 502 if (type(v) == FINAL) 503 return; 504 p = parent(v); 505 switch (type(p)) { 506 case STAR: 507 case PLUS: 508 first(v); 509 follow(p); 510 return; 511 512 case OR: 513 case QUEST: 514 follow(p); 515 return; 516 517 case CAT: 518 if (v == left(p)) { /* v is left child of p */ 519 if (first(right(p)) == 0) { 520 follow(p); 521 return; 522 } 523 } else /* v is right child */ 524 follow(p); 525 return; 526 } 527} 528 529int member(int c, const char *sarg) /* is c in s? */ 530{ 531 const uschar *s = (const uschar *) sarg; 532 533 while (*s) 534 if (c == *s++) 535 return(1); 536 return(0); 537} 538 539int match(fa *f, const char *p0) /* shortest match ? */ 540{ 541 int s, ns; 542 const uschar *p = (const uschar *) p0; 543 544 s = f->initstat; 545 assert (s < f->state_count); 546 547 if (f->out[s]) 548 return(1); 549 do { 550 /* assert(*p < NCHARS); */ 551 if ((ns = f->gototab[s][*p]) != 0) 552 s = ns; 553 else 554 s = cgoto(f, s, *p); 555 if (f->out[s]) 556 return(1); 557 } while (*p++ != 0); 558 return(0); 559} 560 561int pmatch(fa *f, const char *p0) /* longest match, for sub */ 562{ 563 int s, ns; 564 const uschar *p = (const uschar *) p0; 565 const uschar *q; 566 567 s = f->initstat; 568 assert(s < f->state_count); 569 570 patbeg = (const char *)p; 571 patlen = -1; 572 do { 573 q = p; 574 do { 575 if (f->out[s]) /* final state */ 576 patlen = q-p; 577 /* assert(*q < NCHARS); */ 578 if ((ns = f->gototab[s][*q]) != 0) 579 s = ns; 580 else 581 s = cgoto(f, s, *q); 582 583 assert(s < f->state_count); 584 585 if (s == 1) { /* no transition */ 586 if (patlen >= 0) { 587 patbeg = (const char *) p; 588 return(1); 589 } 590 else 591 goto nextin; /* no match */ 592 } 593 } while (*q++ != 0); 594 if (f->out[s]) 595 patlen = q-p-1; /* don't count $ */ 596 if (patlen >= 0) { 597 patbeg = (const char *) p; 598 return(1); 599 } 600 nextin: 601 s = 2; 602 } while (*p++); 603 return (0); 604} 605 606int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 607{ 608 int s, ns; 609 const uschar *p = (const uschar *) p0; 610 const uschar *q; 611 612 s = f->initstat; 613 assert(s < f->state_count); 614 615 patbeg = (const char *)p; 616 patlen = -1; 617 while (*p) { 618 q = p; 619 do { 620 if (f->out[s]) /* final state */ 621 patlen = q-p; 622 /* assert(*q < NCHARS); */ 623 if ((ns = f->gototab[s][*q]) != 0) 624 s = ns; 625 else 626 s = cgoto(f, s, *q); 627 if (s == 1) { /* no transition */ 628 if (patlen > 0) { 629 patbeg = (const char *) p; 630 return(1); 631 } else 632 goto nnextin; /* no nonempty match */ 633 } 634 } while (*q++ != 0); 635 if (f->out[s]) 636 patlen = q-p-1; /* don't count $ */ 637 if (patlen > 0 ) { 638 patbeg = (const char *) p; 639 return(1); 640 } 641 nnextin: 642 s = 2; 643 p++; 644 } 645 return (0); 646} 647 648 649/* 650 * NAME 651 * fnematch 652 * 653 * DESCRIPTION 654 * A stream-fed version of nematch which transfers characters to a 655 * null-terminated buffer. All characters up to and including the last 656 * character of the matching text or EOF are placed in the buffer. If 657 * a match is found, patbeg and patlen are set appropriately. 658 * 659 * RETURN VALUES 660 * false No match found. 661 * true Match found. 662 */ 663 664bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 665{ 666 char *buf = *pbuf; 667 int bufsize = *pbufsize; 668 int c, i, j, k, ns, s; 669 670 s = pfa->initstat; 671 patlen = 0; 672 673 /* 674 * All indices relative to buf. 675 * i <= j <= k <= bufsize 676 * 677 * i: origin of active substring 678 * j: current character 679 * k: destination of next getc() 680 */ 681 i = -1, k = 0; 682 do { 683 j = i++; 684 do { 685 if (++j == k) { 686 if (k == bufsize) 687 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 688 FATAL("stream '%.30s...' too long", buf); 689 buf[k++] = (c = getc(f)) != EOF ? c : 0; 690 } 691 c = buf[j]; 692 /* assert(c < NCHARS); */ 693 694 if ((ns = pfa->gototab[s][c]) != 0) 695 s = ns; 696 else 697 s = cgoto(pfa, s, c); 698 699 if (pfa->out[s]) { /* final state */ 700 patlen = j - i + 1; 701 if (c == 0) /* don't count $ */ 702 patlen--; 703 } 704 } while (buf[j] && s != 1); 705 s = 2; 706 } while (buf[i] && !patlen); 707 708 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 709 *pbuf = buf; 710 *pbufsize = bufsize; 711 712 if (patlen) { 713 patbeg = (char *) buf + i; 714 /* 715 * Under no circumstances is the last character fed to 716 * the automaton part of the match. It is EOF's nullbyte, 717 * or it sent the automaton into a state with no further 718 * transitions available (s==1), or both. Room for a 719 * terminating nullbyte is guaranteed. 720 * 721 * ungetc any chars after the end of matching text 722 * (except for EOF's nullbyte, if present) and null 723 * terminate the buffer. 724 */ 725 do 726 if (buf[--k] && ungetc(buf[k], f) == EOF) 727 FATAL("unable to ungetc '%c'", buf[k]); 728 while (k > i + patlen); 729 buf[k] = '\0'; 730 return true; 731 } 732 else 733 return false; 734} 735 736Node *reparse(const char *p) /* parses regular expression pointed to by p */ 737{ /* uses relex() to scan regular expression */ 738 Node *np; 739 740 dprintf( ("reparse <%s>\n", p) ); 741 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 742 rtok = relex(); 743 /* GNU compatibility: an empty regexp matches anything */ 744 if (rtok == '\0') { 745 /* FATAL("empty regular expression"); previous */ 746 return(op2(EMPTYRE, NIL, NIL)); 747 } 748 np = regexp(); 749 if (rtok != '\0') 750 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 751 return(np); 752} 753 754Node *regexp(void) /* top-level parse of reg expr */ 755{ 756 return (alt(concat(primary()))); 757} 758 759Node *primary(void) 760{ 761 Node *np; 762 int savelastatom; 763 764 switch (rtok) { 765 case CHAR: 766 lastatom = starttok; 767 np = op2(CHAR, NIL, itonp(rlxval)); 768 rtok = relex(); 769 return (unary(np)); 770 case ALL: 771 rtok = relex(); 772 return (unary(op2(ALL, NIL, NIL))); 773 case EMPTYRE: 774 rtok = relex(); 775 return (unary(op2(EMPTYRE, NIL, NIL))); 776 case DOT: 777 lastatom = starttok; 778 rtok = relex(); 779 return (unary(op2(DOT, NIL, NIL))); 780 case CCL: 781 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 782 lastatom = starttok; 783 rtok = relex(); 784 return (unary(np)); 785 case NCCL: 786 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 787 lastatom = starttok; 788 rtok = relex(); 789 return (unary(np)); 790 case '^': 791 rtok = relex(); 792 return (unary(op2(CHAR, NIL, itonp(HAT)))); 793 case '$': 794 rtok = relex(); 795 return (unary(op2(CHAR, NIL, NIL))); 796 case '(': 797 lastatom = starttok; 798 savelastatom = starttok - basestr; /* Retain over recursion */ 799 rtok = relex(); 800 if (rtok == ')') { /* special pleading for () */ 801 rtok = relex(); 802 return unary(op2(CCL, NIL, (Node *) tostring(""))); 803 } 804 np = regexp(); 805 if (rtok == ')') { 806 lastatom = basestr + savelastatom; /* Restore */ 807 rtok = relex(); 808 return (unary(np)); 809 } 810 else 811 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 812 default: 813 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 814 } 815 return 0; /*NOTREACHED*/ 816} 817 818Node *concat(Node *np) 819{ 820 switch (rtok) { 821 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 822 return (concat(op2(CAT, np, primary()))); 823 case EMPTYRE: 824 rtok = relex(); 825 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 826 primary()))); 827 } 828 return (np); 829} 830 831Node *alt(Node *np) 832{ 833 if (rtok == OR) { 834 rtok = relex(); 835 return (alt(op2(OR, np, concat(primary())))); 836 } 837 return (np); 838} 839 840Node *unary(Node *np) 841{ 842 switch (rtok) { 843 case STAR: 844 rtok = relex(); 845 return (unary(op2(STAR, np, NIL))); 846 case PLUS: 847 rtok = relex(); 848 return (unary(op2(PLUS, np, NIL))); 849 case QUEST: 850 rtok = relex(); 851 return (unary(op2(QUEST, np, NIL))); 852 case ZERO: 853 rtok = relex(); 854 return (unary(op2(ZERO, np, NIL))); 855 default: 856 return (np); 857 } 858} 859 860/* 861 * Character class definitions conformant to the POSIX locale as 862 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 863 * and operating character sets are both ASCII (ISO646) or supersets 864 * thereof. 865 * 866 * Note that to avoid overflowing the temporary buffer used in 867 * relex(), the expanded character class (prior to range expansion) 868 * must be less than twice the size of their full name. 869 */ 870 871/* Because isblank doesn't show up in any of the header files on any 872 * system i use, it's defined here. if some other locale has a richer 873 * definition of "blank", define HAS_ISBLANK and provide your own 874 * version. 875 * the parentheses here are an attempt to find a path through the maze 876 * of macro definition and/or function and/or version provided. thanks 877 * to nelson beebe for the suggestion; let's see if it works everywhere. 878 */ 879 880/* #define HAS_ISBLANK */ 881#ifndef HAS_ISBLANK 882 883int (xisblank)(int c) 884{ 885 return c==' ' || c=='\t'; 886} 887 888#endif 889 890static const struct charclass { 891 const char *cc_name; 892 int cc_namelen; 893 int (*cc_func)(int); 894} charclasses[] = { 895 { "alnum", 5, isalnum }, 896 { "alpha", 5, isalpha }, 897#ifndef HAS_ISBLANK 898 { "blank", 5, xisblank }, 899#else 900 { "blank", 5, isblank }, 901#endif 902 { "cntrl", 5, iscntrl }, 903 { "digit", 5, isdigit }, 904 { "graph", 5, isgraph }, 905 { "lower", 5, islower }, 906 { "print", 5, isprint }, 907 { "punct", 5, ispunct }, 908 { "space", 5, isspace }, 909 { "upper", 5, isupper }, 910 { "xdigit", 6, isxdigit }, 911 { NULL, 0, NULL }, 912}; 913 914#define REPEAT_SIMPLE 0 915#define REPEAT_PLUS_APPENDED 1 916#define REPEAT_WITH_Q 2 917#define REPEAT_ZERO 3 918 919static int 920replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 921 int atomlen, int firstnum, int secondnum, int special_case) 922{ 923 int i, j; 924 uschar *buf = 0; 925 int ret = 1; 926 int init_q = (firstnum == 0); /* first added char will be ? */ 927 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 928 int prefix_length = reptok - basestr; /* prefix includes first rep */ 929 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 930 int size = prefix_length + suffix_length; 931 932 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 933 size += atomlen*(firstnum-1); 934 } 935 936 /* Adjust size of buffer for special cases */ 937 if (special_case == REPEAT_PLUS_APPENDED) { 938 size++; /* for the final + */ 939 } else if (special_case == REPEAT_WITH_Q) { 940 size += init_q + (atomlen+1)* n_q_reps; 941 } else if (special_case == REPEAT_ZERO) { 942 size += 2; /* just a null ERE: () */ 943 } 944 if ((buf = malloc(size + 1)) == NULL) 945 FATAL("out of space in reg expr %.10s..", lastre); 946 memcpy(buf, basestr, prefix_length); /* copy prefix */ 947 j = prefix_length; 948 if (special_case == REPEAT_ZERO) { 949 j -= atomlen; 950 buf[j++] = '('; 951 buf[j++] = ')'; 952 } 953 for (i = 1; i < firstnum; i++) { /* copy x reps */ 954 memcpy(&buf[j], atom, atomlen); 955 j += atomlen; 956 } 957 if (special_case == REPEAT_PLUS_APPENDED) { 958 buf[j++] = '+'; 959 } else if (special_case == REPEAT_WITH_Q) { 960 if (init_q) 961 buf[j++] = '?'; 962 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 963 memcpy(&buf[j], atom, atomlen); 964 j += atomlen; 965 buf[j++] = '?'; 966 } 967 } 968 memcpy(&buf[j], reptok+reptoklen, suffix_length); 969 if (special_case == REPEAT_ZERO) { 970 buf[j+suffix_length] = '\0'; 971 } else { 972 buf[size] = '\0'; 973 } 974 /* free old basestr */ 975 if (firstbasestr != basestr) { 976 if (basestr) 977 xfree(basestr); 978 } 979 basestr = buf; 980 prestr = buf + prefix_length; 981 if (special_case == REPEAT_ZERO) { 982 prestr -= atomlen; 983 ret++; 984 } 985 return ret; 986} 987 988static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 989 int atomlen, int firstnum, int secondnum) 990{ 991 /* 992 In general, the repetition specifier or "bound" is replaced here 993 by an equivalent ERE string, repeating the immediately previous atom 994 and appending ? and + as needed. Note that the first copy of the 995 atom is left in place, except in the special_case of a zero-repeat 996 (i.e., {0}). 997 */ 998 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 999 if (firstnum < 2) { 1000 /* 0 or 1: should be handled before you get here */ 1001 FATAL("internal error"); 1002 } else { 1003 return replace_repeat(reptok, reptoklen, atom, atomlen, 1004 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1005 } 1006 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1007 if (firstnum == 0) { /* {0} or {0,0} */ 1008 /* This case is unusual because the resulting 1009 replacement string might actually be SMALLER than 1010 the original ERE */ 1011 return replace_repeat(reptok, reptoklen, atom, atomlen, 1012 firstnum, secondnum, REPEAT_ZERO); 1013 } else { /* (firstnum >= 1) */ 1014 return replace_repeat(reptok, reptoklen, atom, atomlen, 1015 firstnum, secondnum, REPEAT_SIMPLE); 1016 } 1017 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1018 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1019 return replace_repeat(reptok, reptoklen, atom, atomlen, 1020 firstnum, secondnum, REPEAT_WITH_Q); 1021 } else { /* Error - shouldn't be here (n>m) */ 1022 FATAL("internal error"); 1023 } 1024 return 0; 1025} 1026 1027int relex(void) /* lexical analyzer for reparse */ 1028{ 1029 int c, n; 1030 int cflag; 1031 static uschar *buf = NULL; 1032 static int bufsz = 100; 1033 uschar *bp; 1034 const struct charclass *cc; 1035 int i; 1036 int num, m; 1037 bool commafound, digitfound; 1038 const uschar *startreptok; 1039 static int parens = 0; 1040 1041rescan: 1042 starttok = prestr; 1043 1044 switch (c = *prestr++) { 1045 case '|': return OR; 1046 case '*': return STAR; 1047 case '+': return PLUS; 1048 case '?': return QUEST; 1049 case '.': return DOT; 1050 case '\0': prestr--; return '\0'; 1051 case '^': 1052 case '$': 1053 return c; 1054 case '(': 1055 parens++; 1056 return c; 1057 case ')': 1058 if (parens) { 1059 parens--; 1060 return c; 1061 } 1062 /* unmatched close parenthesis; per POSIX, treat as literal */ 1063 rlxval = c; 1064 return CHAR; 1065 case '\\': 1066 rlxval = quoted(&prestr); 1067 return CHAR; 1068 default: 1069 rlxval = c; 1070 return CHAR; 1071 case '[': 1072 if (buf == NULL && (buf = malloc(bufsz)) == NULL) 1073 FATAL("out of space in reg expr %.10s..", lastre); 1074 bp = buf; 1075 if (*prestr == '^') { 1076 cflag = 1; 1077 prestr++; 1078 } 1079 else 1080 cflag = 0; 1081 n = 2 * strlen((const char *) prestr)+1; 1082 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1083 FATAL("out of space for reg expr %.10s...", lastre); 1084 for (; ; ) { 1085 if ((c = *prestr++) == '\\') { 1086 *bp++ = '\\'; 1087 if ((c = *prestr++) == '\0') 1088 FATAL("nonterminated character class %.20s...", lastre); 1089 *bp++ = c; 1090 /* } else if (c == '\n') { */ 1091 /* FATAL("newline in character class %.20s...", lastre); */ 1092 } else if (c == '[' && *prestr == ':') { 1093 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1094 for (cc = charclasses; cc->cc_name; cc++) 1095 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1096 break; 1097 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1098 prestr[2 + cc->cc_namelen] == ']') { 1099 prestr += cc->cc_namelen + 3; 1100 /* 1101 * BUG: We begin at 1, instead of 0, since we 1102 * would otherwise prematurely terminate the 1103 * string for classes like [[:cntrl:]]. This 1104 * means that we can't match the NUL character, 1105 * not without first adapting the entire 1106 * program to track each string's length. 1107 */ 1108 for (i = 1; i <= UCHAR_MAX; i++) { 1109 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1110 FATAL("out of space for reg expr %.10s...", lastre); 1111 if (cc->cc_func(i)) { 1112 *bp++ = i; 1113 n++; 1114 } 1115 } 1116 } else 1117 *bp++ = c; 1118 } else if (c == '[' && *prestr == '.') { 1119 char collate_char; 1120 prestr++; 1121 collate_char = *prestr++; 1122 if (*prestr == '.' && prestr[1] == ']') { 1123 prestr += 2; 1124 /* Found it: map via locale TBD: for 1125 now, simply return this char. This 1126 is sufficient to pass conformance 1127 test awk.ex 156 1128 */ 1129 if (*prestr == ']') { 1130 prestr++; 1131 rlxval = collate_char; 1132 return CHAR; 1133 } 1134 } 1135 } else if (c == '[' && *prestr == '=') { 1136 char equiv_char; 1137 prestr++; 1138 equiv_char = *prestr++; 1139 if (*prestr == '=' && prestr[1] == ']') { 1140 prestr += 2; 1141 /* Found it: map via locale TBD: for now 1142 simply return this char. This is 1143 sufficient to pass conformance test 1144 awk.ex 156 1145 */ 1146 if (*prestr == ']') { 1147 prestr++; 1148 rlxval = equiv_char; 1149 return CHAR; 1150 } 1151 } 1152 } else if (c == '\0') { 1153 FATAL("nonterminated character class %.20s", lastre); 1154 } else if (bp == buf) { /* 1st char is special */ 1155 *bp++ = c; 1156 } else if (c == ']') { 1157 *bp++ = 0; 1158 rlxstr = (uschar *) tostring((char *) buf); 1159 if (cflag == 0) 1160 return CCL; 1161 else 1162 return NCCL; 1163 } else 1164 *bp++ = c; 1165 } 1166 break; 1167 case '{': 1168 if (isdigit(*(prestr))) { 1169 num = 0; /* Process as a repetition */ 1170 n = -1; m = -1; 1171 commafound = false; 1172 digitfound = false; 1173 startreptok = prestr-1; 1174 /* Remember start of previous atom here ? */ 1175 } else { /* just a { char, not a repetition */ 1176 rlxval = c; 1177 return CHAR; 1178 } 1179 for (; ; ) { 1180 if ((c = *prestr++) == '}') { 1181 if (commafound) { 1182 if (digitfound) { /* {n,m} */ 1183 m = num; 1184 if (m < n) 1185 FATAL("illegal repetition expression: class %.20s", 1186 lastre); 1187 if (n == 0 && m == 1) { 1188 return QUEST; 1189 } 1190 } else { /* {n,} */ 1191 if (n == 0) 1192 return STAR; 1193 else if (n == 1) 1194 return PLUS; 1195 } 1196 } else { 1197 if (digitfound) { /* {n} same as {n,n} */ 1198 n = num; 1199 m = num; 1200 } else { /* {} */ 1201 FATAL("illegal repetition expression: class %.20s", 1202 lastre); 1203 } 1204 } 1205 if (repeat(starttok, prestr-starttok, lastatom, 1206 startreptok - lastatom, n, m) > 0) { 1207 if (n == 0 && m == 0) { 1208 return ZERO; 1209 } 1210 /* must rescan input for next token */ 1211 goto rescan; 1212 } 1213 /* Failed to replace: eat up {...} characters 1214 and treat like just PLUS */ 1215 return PLUS; 1216 } else if (c == '\0') { 1217 FATAL("nonterminated character class %.20s", 1218 lastre); 1219 } else if (isdigit(c)) { 1220 num = 10 * num + c - '0'; 1221 digitfound = true; 1222 } else if (c == ',') { 1223 if (commafound) 1224 FATAL("illegal repetition expression: class %.20s", 1225 lastre); 1226 /* looking for {n,} or {n,m} */ 1227 commafound = true; 1228 n = num; 1229 digitfound = false; /* reset */ 1230 num = 0; 1231 } else { 1232 FATAL("illegal repetition expression: class %.20s", 1233 lastre); 1234 } 1235 } 1236 break; 1237 } 1238} 1239 1240int cgoto(fa *f, int s, int c) 1241{ 1242 int *p, *q; 1243 int i, j, k; 1244 1245 assert(c == HAT || c < NCHARS); 1246 while (f->accept >= maxsetvec) { /* guessing here! */ 1247 resizesetvec(__func__); 1248 } 1249 for (i = 0; i <= f->accept; i++) 1250 setvec[i] = 0; 1251 setcnt = 0; 1252 resize_state(f, s); 1253 /* compute positions of gototab[s,c] into setvec */ 1254 p = f->posns[s]; 1255 for (i = 1; i <= *p; i++) { 1256 if ((k = f->re[p[i]].ltype) != FINAL) { 1257 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1258 || (k == DOT && c != 0 && c != HAT) 1259 || (k == ALL && c != 0) 1260 || (k == EMPTYRE && c != 0) 1261 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1262 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1263 q = f->re[p[i]].lfollow; 1264 for (j = 1; j <= *q; j++) { 1265 if (q[j] >= maxsetvec) { 1266 resizesetvec(__func__); 1267 } 1268 if (setvec[q[j]] == 0) { 1269 setcnt++; 1270 setvec[q[j]] = 1; 1271 } 1272 } 1273 } 1274 } 1275 } 1276 /* determine if setvec is a previous state */ 1277 tmpset[0] = setcnt; 1278 j = 1; 1279 for (i = f->accept; i >= 0; i--) 1280 if (setvec[i]) { 1281 tmpset[j++] = i; 1282 } 1283 resize_state(f, f->curstat > s ? f->curstat : s); 1284 /* tmpset == previous state? */ 1285 for (i = 1; i <= f->curstat; i++) { 1286 p = f->posns[i]; 1287 if ((k = tmpset[0]) != p[0]) 1288 goto different; 1289 for (j = 1; j <= k; j++) 1290 if (tmpset[j] != p[j]) 1291 goto different; 1292 /* setvec is state i */ 1293 if (c != HAT) 1294 f->gototab[s][c] = i; 1295 return i; 1296 different:; 1297 } 1298 1299 /* add tmpset to current set of states */ 1300 ++(f->curstat); 1301 resize_state(f, f->curstat); 1302 for (i = 0; i < NCHARS; i++) 1303 f->gototab[f->curstat][i] = 0; 1304 xfree(f->posns[f->curstat]); 1305 p = intalloc(setcnt + 1, __func__); 1306 1307 f->posns[f->curstat] = p; 1308 if (c != HAT) 1309 f->gototab[s][c] = f->curstat; 1310 for (i = 0; i <= setcnt; i++) 1311 p[i] = tmpset[i]; 1312 if (setvec[f->accept]) 1313 f->out[f->curstat] = 1; 1314 else 1315 f->out[f->curstat] = 0; 1316 return f->curstat; 1317} 1318 1319 1320void freefa(fa *f) /* free a finite automaton */ 1321{ 1322 int i; 1323 1324 if (f == NULL) 1325 return; 1326 for (i = 0; i < f->state_count; i++) 1327 xfree(f->gototab[i]) 1328 for (i = 0; i <= f->curstat; i++) 1329 xfree(f->posns[i]); 1330 for (i = 0; i <= f->accept; i++) { 1331 xfree(f->re[i].lfollow); 1332 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1333 xfree(f->re[i].lval.np); 1334 } 1335 xfree(f->restr); 1336 xfree(f->out); 1337 xfree(f->posns); 1338 xfree(f->gototab); 1339 xfree(f); 1340} 1341