b.c revision 170331
185587Sobrien/**************************************************************** 285587SobrienCopyright (C) Lucent Technologies 1997 385587SobrienAll Rights Reserved 485587Sobrien 585587SobrienPermission to use, copy, modify, and distribute this software and 685587Sobrienits documentation for any purpose and without fee is hereby 785587Sobriengranted, provided that the above copyright notice appear in all 885587Sobriencopies and that both that the copyright notice and this 985587Sobrienpermission notice and warranty disclaimer appear in supporting 1085587Sobriendocumentation, and that the name Lucent Technologies or any of 1185587Sobrienits entities not be used in advertising or publicity pertaining 1285587Sobriento distribution of the software without specific, written prior 1385587Sobrienpermission. 1485587Sobrien 1585587SobrienLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 1685587SobrienINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 1785587SobrienIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 1885587SobrienSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1985587SobrienWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 2085587SobrienIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 2185587SobrienARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 2285587SobrienTHIS SOFTWARE. 2385587Sobrien****************************************************************/ 2485587Sobrien 25170331Srafan/* lasciate ogne speranza, voi ch'intrate. */ 2685587Sobrien 2785587Sobrien#define DEBUG 2885587Sobrien 2985587Sobrien#include <ctype.h> 3085587Sobrien#include <stdio.h> 3185587Sobrien#include <string.h> 3285587Sobrien#include <stdlib.h> 3385587Sobrien#include "awk.h" 3485587Sobrien#include "ytab.h" 3585587Sobrien 36118194Sru#define HAT (NCHARS+2) /* matches ^ in regular expr */ 3785587Sobrien /* NCHARS is 2**n */ 3885587Sobrien#define MAXLIN 22 3985587Sobrien 4085587Sobrien#define type(v) (v)->nobj /* badly overloaded here */ 4185587Sobrien#define info(v) (v)->ntype /* badly overloaded here */ 4285587Sobrien#define left(v) (v)->narg[0] 4385587Sobrien#define right(v) (v)->narg[1] 4485587Sobrien#define parent(v) (v)->nnext 4585587Sobrien 4685587Sobrien#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 47170331Srafan#define ELEAF case EMPTYRE: /* empty string in regexp */ 4885587Sobrien#define UNARY case STAR: case PLUS: case QUEST: 4985587Sobrien 5085587Sobrien/* encoding in tree Nodes: 51170331Srafan leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 5285587Sobrien left is index, right contains value or pointer to value 5385587Sobrien unary (STAR, PLUS, QUEST): left is child, right is null 5485587Sobrien binary (CAT, OR): left and right are children 5585587Sobrien parent contains pointer to parent 5685587Sobrien*/ 5785587Sobrien 5885587Sobrien 5985587Sobrienint *setvec; 6085587Sobrienint *tmpset; 6185587Sobrienint maxsetvec = 0; 6285587Sobrien 6385587Sobrienint rtok; /* next token in current re */ 6485587Sobrienint rlxval; 6585587Sobrienstatic uschar *rlxstr; 6685587Sobrienstatic uschar *prestr; /* current position in current re */ 6785587Sobrienstatic uschar *lastre; /* origin of last re */ 6885587Sobrien 6985587Sobrienstatic int setcnt; 7085587Sobrienstatic int poscnt; 7185587Sobrien 7285587Sobrienchar *patbeg; 7385587Sobrienint patlen; 7485587Sobrien 7585587Sobrien#define NFA 20 /* cache this many dynamic fa's */ 7685587Sobrienfa *fatab[NFA]; 7785587Sobrienint nfatab = 0; /* entries in fatab */ 7885587Sobrien 79107806Sobrienfa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 8085587Sobrien{ 8185587Sobrien int i, use, nuse; 8285587Sobrien fa *pfa; 8385587Sobrien static int now = 1; 8485587Sobrien 8585587Sobrien if (setvec == 0) { /* first time through any RE */ 8685587Sobrien maxsetvec = MAXLIN; 8785587Sobrien setvec = (int *) malloc(maxsetvec * sizeof(int)); 8885587Sobrien tmpset = (int *) malloc(maxsetvec * sizeof(int)); 8985587Sobrien if (setvec == 0 || tmpset == 0) 9085587Sobrien overflo("out of space initializing makedfa"); 9185587Sobrien } 9285587Sobrien 9385587Sobrien if (compile_time) /* a constant for sure */ 9485587Sobrien return mkdfa(s, anchor); 9585587Sobrien for (i = 0; i < nfatab; i++) /* is it there already? */ 9685587Sobrien if (fatab[i]->anchor == anchor 9790902Sdes && strcmp((const char *) fatab[i]->restr, s) == 0) { 9885587Sobrien fatab[i]->use = now++; 9985587Sobrien return fatab[i]; 10085587Sobrien } 10185587Sobrien pfa = mkdfa(s, anchor); 10285587Sobrien if (nfatab < NFA) { /* room for another */ 10385587Sobrien fatab[nfatab] = pfa; 10485587Sobrien fatab[nfatab]->use = now++; 10585587Sobrien nfatab++; 10685587Sobrien return pfa; 10785587Sobrien } 10885587Sobrien use = fatab[0]->use; /* replace least-recently used */ 10985587Sobrien nuse = 0; 11085587Sobrien for (i = 1; i < nfatab; i++) 11185587Sobrien if (fatab[i]->use < use) { 11285587Sobrien use = fatab[i]->use; 11385587Sobrien nuse = i; 11485587Sobrien } 11585587Sobrien freefa(fatab[nuse]); 11685587Sobrien fatab[nuse] = pfa; 11785587Sobrien pfa->use = now++; 11885587Sobrien return pfa; 11985587Sobrien} 12085587Sobrien 121107806Sobrienfa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 12285587Sobrien /* anchor = 1 for anchored matches, else 0 */ 12385587Sobrien{ 12485587Sobrien Node *p, *p1; 12585587Sobrien fa *f; 12685587Sobrien 12785587Sobrien p = reparse(s); 12885587Sobrien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 12985587Sobrien /* put ALL STAR in front of reg. exp. */ 13085587Sobrien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 13185587Sobrien /* put FINAL after reg. exp. */ 13285587Sobrien 13385587Sobrien poscnt = 0; 13485587Sobrien penter(p1); /* enter parent pointers and leaf indices */ 13585587Sobrien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 13685587Sobrien overflo("out of space for fa"); 13785587Sobrien f->accept = poscnt-1; /* penter has computed number of positions in re */ 13885587Sobrien cfoll(f, p1); /* set up follow sets */ 13985587Sobrien freetr(p1); 14085587Sobrien if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 14185587Sobrien overflo("out of space in makedfa"); 14285587Sobrien if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 14385587Sobrien overflo("out of space in makedfa"); 14485587Sobrien *f->posns[1] = 0; 14585587Sobrien f->initstat = makeinit(f, anchor); 14685587Sobrien f->anchor = anchor; 14785587Sobrien f->restr = (uschar *) tostring(s); 14885587Sobrien return f; 14985587Sobrien} 15085587Sobrien 15185587Sobrienint makeinit(fa *f, int anchor) 15285587Sobrien{ 15385587Sobrien int i, k; 15485587Sobrien 15585587Sobrien f->curstat = 2; 15685587Sobrien f->out[2] = 0; 15785587Sobrien f->reset = 0; 15885587Sobrien k = *(f->re[0].lfollow); 15985587Sobrien xfree(f->posns[2]); 16085587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 16185587Sobrien overflo("out of space in makeinit"); 16285587Sobrien for (i=0; i <= k; i++) { 16385587Sobrien (f->posns[2])[i] = (f->re[0].lfollow)[i]; 16485587Sobrien } 16585587Sobrien if ((f->posns[2])[1] == f->accept) 16685587Sobrien f->out[2] = 1; 16785587Sobrien for (i=0; i < NCHARS; i++) 16885587Sobrien f->gototab[2][i] = 0; 16985587Sobrien f->curstat = cgoto(f, 2, HAT); 17085587Sobrien if (anchor) { 17185587Sobrien *f->posns[2] = k-1; /* leave out position 0 */ 17285587Sobrien for (i=0; i < k; i++) { 17385587Sobrien (f->posns[0])[i] = (f->posns[2])[i]; 17485587Sobrien } 17585587Sobrien 17685587Sobrien f->out[0] = f->out[2]; 17785587Sobrien if (f->curstat != 2) 17885587Sobrien --(*f->posns[f->curstat]); 17985587Sobrien } 18085587Sobrien return f->curstat; 18185587Sobrien} 18285587Sobrien 18385587Sobrienvoid penter(Node *p) /* set up parent pointers and leaf indices */ 18485587Sobrien{ 18585587Sobrien switch (type(p)) { 186170331Srafan ELEAF 18785587Sobrien LEAF 18885587Sobrien info(p) = poscnt; 18985587Sobrien poscnt++; 19085587Sobrien break; 19185587Sobrien UNARY 19285587Sobrien penter(left(p)); 19385587Sobrien parent(left(p)) = p; 19485587Sobrien break; 19585587Sobrien case CAT: 19685587Sobrien case OR: 19785587Sobrien penter(left(p)); 19885587Sobrien penter(right(p)); 19985587Sobrien parent(left(p)) = p; 20085587Sobrien parent(right(p)) = p; 20185587Sobrien break; 20285587Sobrien default: /* can't happen */ 20385587Sobrien FATAL("can't happen: unknown type %d in penter", type(p)); 20485587Sobrien break; 20585587Sobrien } 20685587Sobrien} 20785587Sobrien 20885587Sobrienvoid freetr(Node *p) /* free parse tree */ 20985587Sobrien{ 21085587Sobrien switch (type(p)) { 211170331Srafan ELEAF 21285587Sobrien LEAF 21385587Sobrien xfree(p); 21485587Sobrien break; 21585587Sobrien UNARY 21685587Sobrien freetr(left(p)); 21785587Sobrien xfree(p); 21885587Sobrien break; 21985587Sobrien case CAT: 22085587Sobrien case OR: 22185587Sobrien freetr(left(p)); 22285587Sobrien freetr(right(p)); 22385587Sobrien xfree(p); 22485587Sobrien break; 22585587Sobrien default: /* can't happen */ 22685587Sobrien FATAL("can't happen: unknown type %d in freetr", type(p)); 22785587Sobrien break; 22885587Sobrien } 22985587Sobrien} 23085587Sobrien 23185587Sobrien/* in the parsing of regular expressions, metacharacters like . have */ 23285587Sobrien/* to be seen literally; \056 is not a metacharacter. */ 23385587Sobrien 23485587Sobrienint hexstr(char **pp) /* find and eval hex string at pp, return new p */ 23585587Sobrien{ /* only pick up one 8-bit byte (2 chars) */ 23685587Sobrien uschar *p; 23785587Sobrien int n = 0; 23885587Sobrien int i; 23985587Sobrien 24085587Sobrien for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 24185587Sobrien if (isdigit(*p)) 24285587Sobrien n = 16 * n + *p - '0'; 24385587Sobrien else if (*p >= 'a' && *p <= 'f') 24485587Sobrien n = 16 * n + *p - 'a' + 10; 24585587Sobrien else if (*p >= 'A' && *p <= 'F') 24685587Sobrien n = 16 * n + *p - 'A' + 10; 24785587Sobrien } 24885587Sobrien *pp = (char *) p; 24985587Sobrien return n; 25085587Sobrien} 25185587Sobrien 25285587Sobrien#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 25385587Sobrien 25485587Sobrienint quoted(char **pp) /* pick up next thing after a \\ */ 25585587Sobrien /* and increment *pp */ 25685587Sobrien{ 25785587Sobrien char *p = *pp; 25885587Sobrien int c; 25985587Sobrien 26085587Sobrien if ((c = *p++) == 't') 26185587Sobrien c = '\t'; 26285587Sobrien else if (c == 'n') 26385587Sobrien c = '\n'; 26485587Sobrien else if (c == 'f') 26585587Sobrien c = '\f'; 26685587Sobrien else if (c == 'r') 26785587Sobrien c = '\r'; 26885587Sobrien else if (c == 'b') 26985587Sobrien c = '\b'; 27085587Sobrien else if (c == '\\') 27185587Sobrien c = '\\'; 27285587Sobrien else if (c == 'x') { /* hexadecimal goo follows */ 27385587Sobrien c = hexstr(&p); /* this adds a null if number is invalid */ 27485587Sobrien } else if (isoctdigit(c)) { /* \d \dd \ddd */ 27585587Sobrien int n = c - '0'; 27685587Sobrien if (isoctdigit(*p)) { 27785587Sobrien n = 8 * n + *p++ - '0'; 27885587Sobrien if (isoctdigit(*p)) 27985587Sobrien n = 8 * n + *p++ - '0'; 28085587Sobrien } 28185587Sobrien c = n; 28285587Sobrien } /* else */ 28385587Sobrien /* c = c; */ 28485587Sobrien *pp = p; 28585587Sobrien return c; 28685587Sobrien} 28785587Sobrien 288107806Sobrienchar *cclenter(const char *argp) /* add a character class */ 289107806Sobrien{ 29085587Sobrien int i, c, c2; 29185587Sobrien uschar *p = (uschar *) argp; 29285587Sobrien uschar *op, *bp; 29385587Sobrien static uschar *buf = 0; 29485587Sobrien static int bufsz = 100; 29585587Sobrien 29685587Sobrien op = p; 29785587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 29885587Sobrien FATAL("out of space for character class [%.10s...] 1", p); 29985587Sobrien bp = buf; 30085587Sobrien for (i = 0; (c = *p++) != 0; ) { 30185587Sobrien if (c == '\\') { 30285587Sobrien c = quoted((char **) &p); 30385587Sobrien } else if (c == '-' && i > 0 && bp[-1] != 0) { 30485587Sobrien if (*p != 0) { 30585587Sobrien c = bp[-1]; 30685587Sobrien c2 = *p++; 30785587Sobrien if (c2 == '\\') 30885587Sobrien c2 = quoted((char **) &p); 309118194Sru if (c > c2) { /* empty; ignore */ 31085587Sobrien bp--; 31185587Sobrien i--; 31285587Sobrien continue; 31385587Sobrien } 314118194Sru while (c < c2) { 315170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 31685587Sobrien FATAL("out of space for character class [%.10s...] 2", p); 317118194Sru *bp++ = ++c; 31885587Sobrien i++; 31985587Sobrien } 32085587Sobrien continue; 32185587Sobrien } 32285587Sobrien } 323170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 32485587Sobrien FATAL("out of space for character class [%.10s...] 3", p); 32585587Sobrien *bp++ = c; 32685587Sobrien i++; 32785587Sobrien } 32885587Sobrien *bp = 0; 32985587Sobrien dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 33085587Sobrien xfree(op); 33185587Sobrien return (char *) tostring((char *) buf); 33285587Sobrien} 33385587Sobrien 334107806Sobrienvoid overflo(const char *s) 33585587Sobrien{ 33685587Sobrien FATAL("regular expression too big: %.30s...", s); 33785587Sobrien} 33885587Sobrien 33985587Sobrienvoid cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 34085587Sobrien{ 34185587Sobrien int i; 34285587Sobrien int *p; 34385587Sobrien 34485587Sobrien switch (type(v)) { 345170331Srafan ELEAF 34685587Sobrien LEAF 34785587Sobrien f->re[info(v)].ltype = type(v); 34885587Sobrien f->re[info(v)].lval.np = right(v); 34985587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 35085587Sobrien maxsetvec *= 4; 35185587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 35285587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 35385587Sobrien if (setvec == 0 || tmpset == 0) 35485587Sobrien overflo("out of space in cfoll()"); 35585587Sobrien } 35685587Sobrien for (i = 0; i <= f->accept; i++) 35785587Sobrien setvec[i] = 0; 35885587Sobrien setcnt = 0; 35985587Sobrien follow(v); /* computes setvec and setcnt */ 36085587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 36185587Sobrien overflo("out of space building follow set"); 36285587Sobrien f->re[info(v)].lfollow = p; 36385587Sobrien *p = setcnt; 36485587Sobrien for (i = f->accept; i >= 0; i--) 36585587Sobrien if (setvec[i] == 1) 36685587Sobrien *++p = i; 36785587Sobrien break; 36885587Sobrien UNARY 36985587Sobrien cfoll(f,left(v)); 37085587Sobrien break; 37185587Sobrien case CAT: 37285587Sobrien case OR: 37385587Sobrien cfoll(f,left(v)); 37485587Sobrien cfoll(f,right(v)); 37585587Sobrien break; 37685587Sobrien default: /* can't happen */ 37785587Sobrien FATAL("can't happen: unknown type %d in cfoll", type(v)); 37885587Sobrien } 37985587Sobrien} 38085587Sobrien 38185587Sobrienint first(Node *p) /* collects initially active leaves of p into setvec */ 382170331Srafan /* returns 0 if p matches empty string */ 38385587Sobrien{ 38485587Sobrien int b, lp; 38585587Sobrien 38685587Sobrien switch (type(p)) { 387170331Srafan ELEAF 38885587Sobrien LEAF 38985587Sobrien lp = info(p); /* look for high-water mark of subscripts */ 39085587Sobrien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 39185587Sobrien maxsetvec *= 4; 39285587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 39385587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 39485587Sobrien if (setvec == 0 || tmpset == 0) 39585587Sobrien overflo("out of space in first()"); 39685587Sobrien } 397170331Srafan if (type(p) == EMPTYRE) { 398170331Srafan setvec[lp] = 0; 399170331Srafan return(0); 400170331Srafan } 40185587Sobrien if (setvec[lp] != 1) { 40285587Sobrien setvec[lp] = 1; 40385587Sobrien setcnt++; 40485587Sobrien } 40585587Sobrien if (type(p) == CCL && (*(char *) right(p)) == '\0') 40685587Sobrien return(0); /* empty CCL */ 40785587Sobrien else return(1); 40885587Sobrien case PLUS: 40985587Sobrien if (first(left(p)) == 0) return(0); 41085587Sobrien return(1); 41185587Sobrien case STAR: 41285587Sobrien case QUEST: 41385587Sobrien first(left(p)); 41485587Sobrien return(0); 41585587Sobrien case CAT: 41685587Sobrien if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 41785587Sobrien return(1); 41885587Sobrien case OR: 41985587Sobrien b = first(right(p)); 42085587Sobrien if (first(left(p)) == 0 || b == 0) return(0); 42185587Sobrien return(1); 42285587Sobrien } 42385587Sobrien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 42485587Sobrien return(-1); 42585587Sobrien} 42685587Sobrien 42785587Sobrienvoid follow(Node *v) /* collects leaves that can follow v into setvec */ 42885587Sobrien{ 42985587Sobrien Node *p; 43085587Sobrien 43185587Sobrien if (type(v) == FINAL) 43285587Sobrien return; 43385587Sobrien p = parent(v); 43485587Sobrien switch (type(p)) { 43585587Sobrien case STAR: 43685587Sobrien case PLUS: 43785587Sobrien first(v); 43885587Sobrien follow(p); 43985587Sobrien return; 44085587Sobrien 44185587Sobrien case OR: 44285587Sobrien case QUEST: 44385587Sobrien follow(p); 44485587Sobrien return; 44585587Sobrien 44685587Sobrien case CAT: 44785587Sobrien if (v == left(p)) { /* v is left child of p */ 44885587Sobrien if (first(right(p)) == 0) { 44985587Sobrien follow(p); 45085587Sobrien return; 45185587Sobrien } 45285587Sobrien } else /* v is right child */ 45385587Sobrien follow(p); 45485587Sobrien return; 45585587Sobrien } 45685587Sobrien} 45785587Sobrien 458107806Sobrienint member(int c, const char *sarg) /* is c in s? */ 45985587Sobrien{ 46085587Sobrien uschar *s = (uschar *) sarg; 46185587Sobrien 46285587Sobrien while (*s) 46385587Sobrien if (c == *s++) 46485587Sobrien return(1); 46585587Sobrien return(0); 46685587Sobrien} 46785587Sobrien 468107806Sobrienint match(fa *f, const char *p0) /* shortest match ? */ 46985587Sobrien{ 47085587Sobrien int s, ns; 47185587Sobrien uschar *p = (uschar *) p0; 47285587Sobrien 47385587Sobrien s = f->reset ? makeinit(f,0) : f->initstat; 47485587Sobrien if (f->out[s]) 47585587Sobrien return(1); 47685587Sobrien do { 477170331Srafan /* assert(*p < NCHARS); */ 47885587Sobrien if ((ns = f->gototab[s][*p]) != 0) 47985587Sobrien s = ns; 48085587Sobrien else 48185587Sobrien s = cgoto(f, s, *p); 48285587Sobrien if (f->out[s]) 48385587Sobrien return(1); 48485587Sobrien } while (*p++ != 0); 48585587Sobrien return(0); 48685587Sobrien} 48785587Sobrien 488107806Sobrienint pmatch(fa *f, const char *p0) /* longest match, for sub */ 48985587Sobrien{ 49085587Sobrien int s, ns; 49185587Sobrien uschar *p = (uschar *) p0; 49285587Sobrien uschar *q; 49385587Sobrien int i, k; 49485587Sobrien 495125601Sru /* s = f->reset ? makeinit(f,1) : f->initstat; */ 496125601Sru if (f->reset) { 497125601Sru f->initstat = s = makeinit(f,1); 498125601Sru } else { 499125601Sru s = f->initstat; 500125601Sru } 50185587Sobrien patbeg = (char *) p; 50285587Sobrien patlen = -1; 50385587Sobrien do { 50485587Sobrien q = p; 50585587Sobrien do { 50685587Sobrien if (f->out[s]) /* final state */ 50785587Sobrien patlen = q-p; 508170331Srafan /* assert(*q < NCHARS); */ 50985587Sobrien if ((ns = f->gototab[s][*q]) != 0) 51085587Sobrien s = ns; 51185587Sobrien else 51285587Sobrien s = cgoto(f, s, *q); 51385587Sobrien if (s == 1) { /* no transition */ 51485587Sobrien if (patlen >= 0) { 51585587Sobrien patbeg = (char *) p; 51685587Sobrien return(1); 51785587Sobrien } 51885587Sobrien else 51985587Sobrien goto nextin; /* no match */ 52085587Sobrien } 52185587Sobrien } while (*q++ != 0); 52285587Sobrien if (f->out[s]) 52385587Sobrien patlen = q-p-1; /* don't count $ */ 52485587Sobrien if (patlen >= 0) { 52585587Sobrien patbeg = (char *) p; 52685587Sobrien return(1); 52785587Sobrien } 52885587Sobrien nextin: 52985587Sobrien s = 2; 53085587Sobrien if (f->reset) { 53185587Sobrien for (i = 2; i <= f->curstat; i++) 53285587Sobrien xfree(f->posns[i]); 53385587Sobrien k = *f->posns[0]; 53485587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 53585587Sobrien overflo("out of space in pmatch"); 53685587Sobrien for (i = 0; i <= k; i++) 53785587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 53885587Sobrien f->initstat = f->curstat = 2; 53985587Sobrien f->out[2] = f->out[0]; 54085587Sobrien for (i = 0; i < NCHARS; i++) 54185587Sobrien f->gototab[2][i] = 0; 54285587Sobrien } 54385587Sobrien } while (*p++ != 0); 54485587Sobrien return (0); 54585587Sobrien} 54685587Sobrien 547107806Sobrienint nematch(fa *f, const char *p0) /* non-empty match, for sub */ 54885587Sobrien{ 54985587Sobrien int s, ns; 55085587Sobrien uschar *p = (uschar *) p0; 55185587Sobrien uschar *q; 55285587Sobrien int i, k; 55385587Sobrien 554125601Sru /* s = f->reset ? makeinit(f,1) : f->initstat; */ 555125601Sru if (f->reset) { 556125601Sru f->initstat = s = makeinit(f,1); 557125601Sru } else { 558125601Sru s = f->initstat; 559125601Sru } 56085587Sobrien patlen = -1; 56185587Sobrien while (*p) { 56285587Sobrien q = p; 56385587Sobrien do { 56485587Sobrien if (f->out[s]) /* final state */ 56585587Sobrien patlen = q-p; 566170331Srafan /* assert(*q < NCHARS); */ 56785587Sobrien if ((ns = f->gototab[s][*q]) != 0) 56885587Sobrien s = ns; 56985587Sobrien else 57085587Sobrien s = cgoto(f, s, *q); 57185587Sobrien if (s == 1) { /* no transition */ 57285587Sobrien if (patlen > 0) { 57385587Sobrien patbeg = (char *) p; 57485587Sobrien return(1); 57585587Sobrien } else 57685587Sobrien goto nnextin; /* no nonempty match */ 57785587Sobrien } 57885587Sobrien } while (*q++ != 0); 57985587Sobrien if (f->out[s]) 58085587Sobrien patlen = q-p-1; /* don't count $ */ 58185587Sobrien if (patlen > 0 ) { 58285587Sobrien patbeg = (char *) p; 58385587Sobrien return(1); 58485587Sobrien } 58585587Sobrien nnextin: 58685587Sobrien s = 2; 58785587Sobrien if (f->reset) { 58885587Sobrien for (i = 2; i <= f->curstat; i++) 58985587Sobrien xfree(f->posns[i]); 59085587Sobrien k = *f->posns[0]; 59185587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 59285587Sobrien overflo("out of state space"); 59385587Sobrien for (i = 0; i <= k; i++) 59485587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 59585587Sobrien f->initstat = f->curstat = 2; 59685587Sobrien f->out[2] = f->out[0]; 59785587Sobrien for (i = 0; i < NCHARS; i++) 59885587Sobrien f->gototab[2][i] = 0; 59985587Sobrien } 60085587Sobrien p++; 60185587Sobrien } 60285587Sobrien return (0); 60385587Sobrien} 60485587Sobrien 605107806SobrienNode *reparse(const char *p) /* parses regular expression pointed to by p */ 60685587Sobrien{ /* uses relex() to scan regular expression */ 60785587Sobrien Node *np; 60885587Sobrien 60985587Sobrien dprintf( ("reparse <%s>\n", p) ); 61085587Sobrien lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 61185587Sobrien rtok = relex(); 612107806Sobrien /* GNU compatibility: an empty regexp matches anything */ 613170331Srafan if (rtok == '\0') { 614107806Sobrien /* FATAL("empty regular expression"); previous */ 615170331Srafan return(op2(EMPTYRE, NIL, NIL)); 616170331Srafan } 61785587Sobrien np = regexp(); 61885587Sobrien if (rtok != '\0') 61985587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 62085587Sobrien return(np); 62185587Sobrien} 62285587Sobrien 62385587SobrienNode *regexp(void) /* top-level parse of reg expr */ 62485587Sobrien{ 62585587Sobrien return (alt(concat(primary()))); 62685587Sobrien} 62785587Sobrien 62885587SobrienNode *primary(void) 62985587Sobrien{ 63085587Sobrien Node *np; 63185587Sobrien 63285587Sobrien switch (rtok) { 63385587Sobrien case CHAR: 63485587Sobrien np = op2(CHAR, NIL, itonp(rlxval)); 63585587Sobrien rtok = relex(); 63685587Sobrien return (unary(np)); 63785587Sobrien case ALL: 63885587Sobrien rtok = relex(); 63985587Sobrien return (unary(op2(ALL, NIL, NIL))); 640170331Srafan case EMPTYRE: 641170331Srafan rtok = relex(); 642170331Srafan return (unary(op2(ALL, NIL, NIL))); 64385587Sobrien case DOT: 64485587Sobrien rtok = relex(); 64585587Sobrien return (unary(op2(DOT, NIL, NIL))); 64685587Sobrien case CCL: 64785587Sobrien np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 64885587Sobrien rtok = relex(); 64985587Sobrien return (unary(np)); 65085587Sobrien case NCCL: 65185587Sobrien np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 65285587Sobrien rtok = relex(); 65385587Sobrien return (unary(np)); 65485587Sobrien case '^': 65585587Sobrien rtok = relex(); 65685587Sobrien return (unary(op2(CHAR, NIL, itonp(HAT)))); 65785587Sobrien case '$': 65885587Sobrien rtok = relex(); 65985587Sobrien return (unary(op2(CHAR, NIL, NIL))); 66085587Sobrien case '(': 66185587Sobrien rtok = relex(); 66285587Sobrien if (rtok == ')') { /* special pleading for () */ 66385587Sobrien rtok = relex(); 66485587Sobrien return unary(op2(CCL, NIL, (Node *) tostring(""))); 66585587Sobrien } 66685587Sobrien np = regexp(); 66785587Sobrien if (rtok == ')') { 66885587Sobrien rtok = relex(); 66985587Sobrien return (unary(np)); 67085587Sobrien } 67185587Sobrien else 67285587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 67385587Sobrien default: 67485587Sobrien FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 67585587Sobrien } 67685587Sobrien return 0; /*NOTREACHED*/ 67785587Sobrien} 67885587Sobrien 67985587SobrienNode *concat(Node *np) 68085587Sobrien{ 68185587Sobrien switch (rtok) { 682170331Srafan case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': 68385587Sobrien return (concat(op2(CAT, np, primary()))); 68485587Sobrien } 68585587Sobrien return (np); 68685587Sobrien} 68785587Sobrien 68885587SobrienNode *alt(Node *np) 68985587Sobrien{ 69085587Sobrien if (rtok == OR) { 69185587Sobrien rtok = relex(); 69285587Sobrien return (alt(op2(OR, np, concat(primary())))); 69385587Sobrien } 69485587Sobrien return (np); 69585587Sobrien} 69685587Sobrien 69785587SobrienNode *unary(Node *np) 69885587Sobrien{ 69985587Sobrien switch (rtok) { 70085587Sobrien case STAR: 70185587Sobrien rtok = relex(); 70285587Sobrien return (unary(op2(STAR, np, NIL))); 70385587Sobrien case PLUS: 70485587Sobrien rtok = relex(); 70585587Sobrien return (unary(op2(PLUS, np, NIL))); 70685587Sobrien case QUEST: 70785587Sobrien rtok = relex(); 70885587Sobrien return (unary(op2(QUEST, np, NIL))); 70985587Sobrien default: 71085587Sobrien return (np); 71185587Sobrien } 71285587Sobrien} 71385587Sobrien 71490902Sdes/* 71590902Sdes * Character class definitions conformant to the POSIX locale as 71690902Sdes * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 71790902Sdes * and operating character sets are both ASCII (ISO646) or supersets 71890902Sdes * thereof. 71990902Sdes * 72090902Sdes * Note that to avoid overflowing the temporary buffer used in 72190902Sdes * relex(), the expanded character class (prior to range expansion) 72290902Sdes * must be less than twice the size of their full name. 72390902Sdes */ 724112336Sobrien 725112336Sobrien/* Because isblank doesn't show up in any of the header files on any 726112336Sobrien * system i use, it's defined here. if some other locale has a richer 727112336Sobrien * definition of "blank", define HAS_ISBLANK and provide your own 728112336Sobrien * version. 729118194Sru * the parentheses here are an attempt to find a path through the maze 730118194Sru * of macro definition and/or function and/or version provided. thanks 731118194Sru * to nelson beebe for the suggestion; let's see if it works everywhere. 732112336Sobrien */ 733112336Sobrien 734112336Sobrien#ifndef HAS_ISBLANK 735112336Sobrien 736118194Sruint (isblank)(int c) 737112336Sobrien{ 738112336Sobrien return c==' ' || c=='\t'; 739112336Sobrien} 740112336Sobrien 741112336Sobrien#endif 742112336Sobrien 74390902Sdesstruct charclass { 74490902Sdes const char *cc_name; 74590902Sdes int cc_namelen; 746112336Sobrien int (*cc_func)(int); 74790902Sdes} charclasses[] = { 748112336Sobrien { "alnum", 5, isalnum }, 749112336Sobrien { "alpha", 5, isalpha }, 750112336Sobrien { "blank", 5, isblank }, 751112336Sobrien { "cntrl", 5, iscntrl }, 752112336Sobrien { "digit", 5, isdigit }, 753112336Sobrien { "graph", 5, isgraph }, 754112336Sobrien { "lower", 5, islower }, 755112336Sobrien { "print", 5, isprint }, 756112336Sobrien { "punct", 5, ispunct }, 757112336Sobrien { "space", 5, isspace }, 758112336Sobrien { "upper", 5, isupper }, 759112336Sobrien { "xdigit", 6, isxdigit }, 76090902Sdes { NULL, 0, NULL }, 76190902Sdes}; 76290902Sdes 76390902Sdes 76485587Sobrienint relex(void) /* lexical analyzer for reparse */ 76585587Sobrien{ 76685587Sobrien int c, n; 76785587Sobrien int cflag; 76885587Sobrien static uschar *buf = 0; 76985587Sobrien static int bufsz = 100; 77085587Sobrien uschar *bp; 77190902Sdes struct charclass *cc; 772112336Sobrien int i; 77385587Sobrien 77485587Sobrien switch (c = *prestr++) { 77585587Sobrien case '|': return OR; 77685587Sobrien case '*': return STAR; 77785587Sobrien case '+': return PLUS; 77885587Sobrien case '?': return QUEST; 77985587Sobrien case '.': return DOT; 78085587Sobrien case '\0': prestr--; return '\0'; 78185587Sobrien case '^': 78285587Sobrien case '$': 78385587Sobrien case '(': 78485587Sobrien case ')': 78585587Sobrien return c; 78685587Sobrien case '\\': 78785587Sobrien rlxval = quoted((char **) &prestr); 78885587Sobrien return CHAR; 78985587Sobrien default: 79085587Sobrien rlxval = c; 79185587Sobrien return CHAR; 79285587Sobrien case '[': 79385587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 79485587Sobrien FATAL("out of space in reg expr %.10s..", lastre); 79585587Sobrien bp = buf; 79685587Sobrien if (*prestr == '^') { 79785587Sobrien cflag = 1; 79885587Sobrien prestr++; 79985587Sobrien } 80085587Sobrien else 80185587Sobrien cflag = 0; 80290902Sdes n = 2 * strlen((const char *) prestr)+1; 803170331Srafan if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 80485587Sobrien FATAL("out of space for reg expr %.10s...", lastre); 80585587Sobrien for (; ; ) { 80685587Sobrien if ((c = *prestr++) == '\\') { 80785587Sobrien *bp++ = '\\'; 80885587Sobrien if ((c = *prestr++) == '\0') 80985587Sobrien FATAL("nonterminated character class %.20s...", lastre); 81085587Sobrien *bp++ = c; 81185587Sobrien /* } else if (c == '\n') { */ 81285587Sobrien /* FATAL("newline in character class %.20s...", lastre); */ 81390902Sdes } else if (c == '[' && *prestr == ':') { 81490902Sdes /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 81590902Sdes for (cc = charclasses; cc->cc_name; cc++) 81690902Sdes if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 81790902Sdes break; 81890902Sdes if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 81990902Sdes prestr[2 + cc->cc_namelen] == ']') { 82090902Sdes prestr += cc->cc_namelen + 3; 821112336Sobrien for (i = 0; i < NCHARS; i++) { 822170331Srafan if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 823112336Sobrien FATAL("out of space for reg expr %.10s...", lastre); 824112336Sobrien if (cc->cc_func(i)) { 825112336Sobrien *bp++ = i; 826112336Sobrien n++; 827112336Sobrien } 828112336Sobrien } 82990902Sdes } else 83090902Sdes *bp++ = c; 83185587Sobrien } else if (c == '\0') { 83285587Sobrien FATAL("nonterminated character class %.20s", lastre); 83385587Sobrien } else if (bp == buf) { /* 1st char is special */ 83485587Sobrien *bp++ = c; 83585587Sobrien } else if (c == ']') { 83685587Sobrien *bp++ = 0; 83785587Sobrien rlxstr = (uschar *) tostring((char *) buf); 83885587Sobrien if (cflag == 0) 83985587Sobrien return CCL; 84085587Sobrien else 84185587Sobrien return NCCL; 84285587Sobrien } else 84385587Sobrien *bp++ = c; 84485587Sobrien } 84585587Sobrien } 84685587Sobrien} 84785587Sobrien 84885587Sobrienint cgoto(fa *f, int s, int c) 84985587Sobrien{ 85085587Sobrien int i, j, k; 85185587Sobrien int *p, *q; 85285587Sobrien 853146299Sru assert(c == HAT || c < NCHARS); 85485587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 85585587Sobrien maxsetvec *= 4; 85685587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 85785587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 85885587Sobrien if (setvec == 0 || tmpset == 0) 85985587Sobrien overflo("out of space in cgoto()"); 86085587Sobrien } 86185587Sobrien for (i = 0; i <= f->accept; i++) 86285587Sobrien setvec[i] = 0; 86385587Sobrien setcnt = 0; 86485587Sobrien /* compute positions of gototab[s,c] into setvec */ 86585587Sobrien p = f->posns[s]; 86685587Sobrien for (i = 1; i <= *p; i++) { 86785587Sobrien if ((k = f->re[p[i]].ltype) != FINAL) { 86885587Sobrien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 86985587Sobrien || (k == DOT && c != 0 && c != HAT) 87085587Sobrien || (k == ALL && c != 0) 871170331Srafan || (k == EMPTYRE && c != 0) 87285587Sobrien || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 87385587Sobrien || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 87485587Sobrien q = f->re[p[i]].lfollow; 87585587Sobrien for (j = 1; j <= *q; j++) { 87685587Sobrien if (q[j] >= maxsetvec) { 87785587Sobrien maxsetvec *= 4; 87885587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 87985587Sobrien tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int)); 88085587Sobrien if (setvec == 0 || tmpset == 0) 88185587Sobrien overflo("cgoto overflow"); 88285587Sobrien } 88385587Sobrien if (setvec[q[j]] == 0) { 88485587Sobrien setcnt++; 88585587Sobrien setvec[q[j]] = 1; 88685587Sobrien } 88785587Sobrien } 88885587Sobrien } 88985587Sobrien } 89085587Sobrien } 89185587Sobrien /* determine if setvec is a previous state */ 89285587Sobrien tmpset[0] = setcnt; 89385587Sobrien j = 1; 89485587Sobrien for (i = f->accept; i >= 0; i--) 89585587Sobrien if (setvec[i]) { 89685587Sobrien tmpset[j++] = i; 89785587Sobrien } 89885587Sobrien /* tmpset == previous state? */ 89985587Sobrien for (i = 1; i <= f->curstat; i++) { 90085587Sobrien p = f->posns[i]; 90185587Sobrien if ((k = tmpset[0]) != p[0]) 90285587Sobrien goto different; 90385587Sobrien for (j = 1; j <= k; j++) 90485587Sobrien if (tmpset[j] != p[j]) 90585587Sobrien goto different; 90685587Sobrien /* setvec is state i */ 90785587Sobrien f->gototab[s][c] = i; 90885587Sobrien return i; 90985587Sobrien different:; 91085587Sobrien } 91185587Sobrien 91285587Sobrien /* add tmpset to current set of states */ 91385587Sobrien if (f->curstat >= NSTATES-1) { 91485587Sobrien f->curstat = 2; 91585587Sobrien f->reset = 1; 91685587Sobrien for (i = 2; i < NSTATES; i++) 91785587Sobrien xfree(f->posns[i]); 91885587Sobrien } else 91985587Sobrien ++(f->curstat); 92085587Sobrien for (i = 0; i < NCHARS; i++) 92185587Sobrien f->gototab[f->curstat][i] = 0; 92285587Sobrien xfree(f->posns[f->curstat]); 92385587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 92485587Sobrien overflo("out of space in cgoto"); 92585587Sobrien 92685587Sobrien f->posns[f->curstat] = p; 92785587Sobrien f->gototab[s][c] = f->curstat; 92885587Sobrien for (i = 0; i <= setcnt; i++) 92985587Sobrien p[i] = tmpset[i]; 93085587Sobrien if (setvec[f->accept]) 93185587Sobrien f->out[f->curstat] = 1; 93285587Sobrien else 93385587Sobrien f->out[f->curstat] = 0; 93485587Sobrien return f->curstat; 93585587Sobrien} 93685587Sobrien 93785587Sobrien 93885587Sobrienvoid freefa(fa *f) /* free a finite automaton */ 93985587Sobrien{ 94085587Sobrien int i; 94185587Sobrien 94285587Sobrien if (f == NULL) 94385587Sobrien return; 94485587Sobrien for (i = 0; i <= f->curstat; i++) 94585587Sobrien xfree(f->posns[i]); 94685587Sobrien for (i = 0; i <= f->accept; i++) { 94785587Sobrien xfree(f->re[i].lfollow); 94885587Sobrien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 94985587Sobrien xfree((f->re[i].lval.np)); 95085587Sobrien } 95185587Sobrien xfree(f->restr); 95285587Sobrien xfree(f); 95385587Sobrien} 954