b.c revision 112336
185587Sobrien/**************************************************************** 285587SobrienCopyright (C) Lucent Technologies 1997 385587SobrienAll Rights Reserved 485587Sobrien 585587SobrienPermission to use, copy, modify, and distribute this software and 685587Sobrienits documentation for any purpose and without fee is hereby 785587Sobriengranted, provided that the above copyright notice appear in all 885587Sobriencopies and that both that the copyright notice and this 985587Sobrienpermission notice and warranty disclaimer appear in supporting 1085587Sobriendocumentation, and that the name Lucent Technologies or any of 1185587Sobrienits entities not be used in advertising or publicity pertaining 1285587Sobriento distribution of the software without specific, written prior 1385587Sobrienpermission. 1485587Sobrien 1585587SobrienLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 1685587SobrienINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 1785587SobrienIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 1885587SobrienSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1985587SobrienWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 2085587SobrienIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 2185587SobrienARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 2285587SobrienTHIS SOFTWARE. 2385587Sobrien****************************************************************/ 2485587Sobrien 2585587Sobrien/* lasciate ogne speranza, voi ch'entrate. */ 2685587Sobrien 2785587Sobrien#define DEBUG 2885587Sobrien 2985587Sobrien#include <ctype.h> 3085587Sobrien#include <stdio.h> 3185587Sobrien#include <string.h> 3285587Sobrien#include <stdlib.h> 3385587Sobrien#include "awk.h" 3485587Sobrien#include "ytab.h" 3585587Sobrien 3685587Sobrien#define HAT (NCHARS-2) /* matches ^ in regular expr */ 3785587Sobrien /* NCHARS is 2**n */ 3885587Sobrien#define MAXLIN 22 3985587Sobrien 4085587Sobrien#define type(v) (v)->nobj /* badly overloaded here */ 4185587Sobrien#define info(v) (v)->ntype /* badly overloaded here */ 4285587Sobrien#define left(v) (v)->narg[0] 4385587Sobrien#define right(v) (v)->narg[1] 4485587Sobrien#define parent(v) (v)->nnext 4585587Sobrien 4685587Sobrien#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 4785587Sobrien#define UNARY case STAR: case PLUS: case QUEST: 4885587Sobrien 4985587Sobrien/* encoding in tree Nodes: 5085587Sobrien leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL): 5185587Sobrien left is index, right contains value or pointer to value 5285587Sobrien unary (STAR, PLUS, QUEST): left is child, right is null 5385587Sobrien binary (CAT, OR): left and right are children 5485587Sobrien parent contains pointer to parent 5585587Sobrien*/ 5685587Sobrien 5785587Sobrien 5885587Sobrienint *setvec; 5985587Sobrienint *tmpset; 6085587Sobrienint maxsetvec = 0; 6185587Sobrien 6285587Sobrienint rtok; /* next token in current re */ 6385587Sobrienint rlxval; 6485587Sobrienstatic uschar *rlxstr; 6585587Sobrienstatic uschar *prestr; /* current position in current re */ 6685587Sobrienstatic uschar *lastre; /* origin of last re */ 6785587Sobrien 6885587Sobrienstatic int setcnt; 6985587Sobrienstatic int poscnt; 7085587Sobrien 7185587Sobrienchar *patbeg; 7285587Sobrienint patlen; 7385587Sobrien 7485587Sobrien#define NFA 20 /* cache this many dynamic fa's */ 7585587Sobrienfa *fatab[NFA]; 7685587Sobrienint nfatab = 0; /* entries in fatab */ 7785587Sobrien 78107806Sobrienfa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 7985587Sobrien{ 8085587Sobrien int i, use, nuse; 8185587Sobrien fa *pfa; 8285587Sobrien static int now = 1; 8385587Sobrien 8485587Sobrien if (setvec == 0) { /* first time through any RE */ 8585587Sobrien maxsetvec = MAXLIN; 8685587Sobrien setvec = (int *) malloc(maxsetvec * sizeof(int)); 8785587Sobrien tmpset = (int *) malloc(maxsetvec * sizeof(int)); 8885587Sobrien if (setvec == 0 || tmpset == 0) 8985587Sobrien overflo("out of space initializing makedfa"); 9085587Sobrien } 9185587Sobrien 9285587Sobrien if (compile_time) /* a constant for sure */ 9385587Sobrien return mkdfa(s, anchor); 9485587Sobrien for (i = 0; i < nfatab; i++) /* is it there already? */ 9585587Sobrien if (fatab[i]->anchor == anchor 9690902Sdes && strcmp((const char *) fatab[i]->restr, s) == 0) { 9785587Sobrien fatab[i]->use = now++; 9885587Sobrien return fatab[i]; 9985587Sobrien } 10085587Sobrien pfa = mkdfa(s, anchor); 10185587Sobrien if (nfatab < NFA) { /* room for another */ 10285587Sobrien fatab[nfatab] = pfa; 10385587Sobrien fatab[nfatab]->use = now++; 10485587Sobrien nfatab++; 10585587Sobrien return pfa; 10685587Sobrien } 10785587Sobrien use = fatab[0]->use; /* replace least-recently used */ 10885587Sobrien nuse = 0; 10985587Sobrien for (i = 1; i < nfatab; i++) 11085587Sobrien if (fatab[i]->use < use) { 11185587Sobrien use = fatab[i]->use; 11285587Sobrien nuse = i; 11385587Sobrien } 11485587Sobrien freefa(fatab[nuse]); 11585587Sobrien fatab[nuse] = pfa; 11685587Sobrien pfa->use = now++; 11785587Sobrien return pfa; 11885587Sobrien} 11985587Sobrien 120107806Sobrienfa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 12185587Sobrien /* anchor = 1 for anchored matches, else 0 */ 12285587Sobrien{ 12385587Sobrien Node *p, *p1; 12485587Sobrien fa *f; 12585587Sobrien 12685587Sobrien p = reparse(s); 12785587Sobrien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 12885587Sobrien /* put ALL STAR in front of reg. exp. */ 12985587Sobrien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 13085587Sobrien /* put FINAL after reg. exp. */ 13185587Sobrien 13285587Sobrien poscnt = 0; 13385587Sobrien penter(p1); /* enter parent pointers and leaf indices */ 13485587Sobrien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 13585587Sobrien overflo("out of space for fa"); 13685587Sobrien f->accept = poscnt-1; /* penter has computed number of positions in re */ 13785587Sobrien cfoll(f, p1); /* set up follow sets */ 13885587Sobrien freetr(p1); 13985587Sobrien if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 14085587Sobrien overflo("out of space in makedfa"); 14185587Sobrien if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 14285587Sobrien overflo("out of space in makedfa"); 14385587Sobrien *f->posns[1] = 0; 14485587Sobrien f->initstat = makeinit(f, anchor); 14585587Sobrien f->anchor = anchor; 14685587Sobrien f->restr = (uschar *) tostring(s); 14785587Sobrien return f; 14885587Sobrien} 14985587Sobrien 15085587Sobrienint makeinit(fa *f, int anchor) 15185587Sobrien{ 15285587Sobrien int i, k; 15385587Sobrien 15485587Sobrien f->curstat = 2; 15585587Sobrien f->out[2] = 0; 15685587Sobrien f->reset = 0; 15785587Sobrien k = *(f->re[0].lfollow); 15885587Sobrien xfree(f->posns[2]); 15985587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 16085587Sobrien overflo("out of space in makeinit"); 16185587Sobrien for (i=0; i <= k; i++) { 16285587Sobrien (f->posns[2])[i] = (f->re[0].lfollow)[i]; 16385587Sobrien } 16485587Sobrien if ((f->posns[2])[1] == f->accept) 16585587Sobrien f->out[2] = 1; 16685587Sobrien for (i=0; i < NCHARS; i++) 16785587Sobrien f->gototab[2][i] = 0; 16885587Sobrien f->curstat = cgoto(f, 2, HAT); 16985587Sobrien if (anchor) { 17085587Sobrien *f->posns[2] = k-1; /* leave out position 0 */ 17185587Sobrien for (i=0; i < k; i++) { 17285587Sobrien (f->posns[0])[i] = (f->posns[2])[i]; 17385587Sobrien } 17485587Sobrien 17585587Sobrien f->out[0] = f->out[2]; 17685587Sobrien if (f->curstat != 2) 17785587Sobrien --(*f->posns[f->curstat]); 17885587Sobrien } 17985587Sobrien return f->curstat; 18085587Sobrien} 18185587Sobrien 18285587Sobrienvoid penter(Node *p) /* set up parent pointers and leaf indices */ 18385587Sobrien{ 18485587Sobrien switch (type(p)) { 18585587Sobrien LEAF 18685587Sobrien info(p) = poscnt; 18785587Sobrien poscnt++; 18885587Sobrien break; 18985587Sobrien UNARY 19085587Sobrien penter(left(p)); 19185587Sobrien parent(left(p)) = p; 19285587Sobrien break; 19385587Sobrien case CAT: 19485587Sobrien case OR: 19585587Sobrien penter(left(p)); 19685587Sobrien penter(right(p)); 19785587Sobrien parent(left(p)) = p; 19885587Sobrien parent(right(p)) = p; 19985587Sobrien break; 20085587Sobrien default: /* can't happen */ 20185587Sobrien FATAL("can't happen: unknown type %d in penter", type(p)); 20285587Sobrien break; 20385587Sobrien } 20485587Sobrien} 20585587Sobrien 20685587Sobrienvoid freetr(Node *p) /* free parse tree */ 20785587Sobrien{ 20885587Sobrien switch (type(p)) { 20985587Sobrien LEAF 21085587Sobrien xfree(p); 21185587Sobrien break; 21285587Sobrien UNARY 21385587Sobrien freetr(left(p)); 21485587Sobrien xfree(p); 21585587Sobrien break; 21685587Sobrien case CAT: 21785587Sobrien case OR: 21885587Sobrien freetr(left(p)); 21985587Sobrien freetr(right(p)); 22085587Sobrien xfree(p); 22185587Sobrien break; 22285587Sobrien default: /* can't happen */ 22385587Sobrien FATAL("can't happen: unknown type %d in freetr", type(p)); 22485587Sobrien break; 22585587Sobrien } 22685587Sobrien} 22785587Sobrien 22885587Sobrien/* in the parsing of regular expressions, metacharacters like . have */ 22985587Sobrien/* to be seen literally; \056 is not a metacharacter. */ 23085587Sobrien 23185587Sobrienint hexstr(char **pp) /* find and eval hex string at pp, return new p */ 23285587Sobrien{ /* only pick up one 8-bit byte (2 chars) */ 23385587Sobrien uschar *p; 23485587Sobrien int n = 0; 23585587Sobrien int i; 23685587Sobrien 23785587Sobrien for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 23885587Sobrien if (isdigit(*p)) 23985587Sobrien n = 16 * n + *p - '0'; 24085587Sobrien else if (*p >= 'a' && *p <= 'f') 24185587Sobrien n = 16 * n + *p - 'a' + 10; 24285587Sobrien else if (*p >= 'A' && *p <= 'F') 24385587Sobrien n = 16 * n + *p - 'A' + 10; 24485587Sobrien } 24585587Sobrien *pp = (char *) p; 24685587Sobrien return n; 24785587Sobrien} 24885587Sobrien 24985587Sobrien#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 25085587Sobrien 25185587Sobrienint quoted(char **pp) /* pick up next thing after a \\ */ 25285587Sobrien /* and increment *pp */ 25385587Sobrien{ 25485587Sobrien char *p = *pp; 25585587Sobrien int c; 25685587Sobrien 25785587Sobrien if ((c = *p++) == 't') 25885587Sobrien c = '\t'; 25985587Sobrien else if (c == 'n') 26085587Sobrien c = '\n'; 26185587Sobrien else if (c == 'f') 26285587Sobrien c = '\f'; 26385587Sobrien else if (c == 'r') 26485587Sobrien c = '\r'; 26585587Sobrien else if (c == 'b') 26685587Sobrien c = '\b'; 26785587Sobrien else if (c == '\\') 26885587Sobrien c = '\\'; 26985587Sobrien else if (c == 'x') { /* hexadecimal goo follows */ 27085587Sobrien c = hexstr(&p); /* this adds a null if number is invalid */ 27185587Sobrien } else if (isoctdigit(c)) { /* \d \dd \ddd */ 27285587Sobrien int n = c - '0'; 27385587Sobrien if (isoctdigit(*p)) { 27485587Sobrien n = 8 * n + *p++ - '0'; 27585587Sobrien if (isoctdigit(*p)) 27685587Sobrien n = 8 * n + *p++ - '0'; 27785587Sobrien } 27885587Sobrien c = n; 27985587Sobrien } /* else */ 28085587Sobrien /* c = c; */ 28185587Sobrien *pp = p; 28285587Sobrien return c; 28385587Sobrien} 28485587Sobrien 285112336Sobrienstatic int collate_range_cmp(int a, int b) 286112336Sobrien{ 287112336Sobrien int r; 288112336Sobrien static char s[2][2]; 289112336Sobrien 290112336Sobrien if ((uschar)a == (uschar)b) 291112336Sobrien return 0; 292112336Sobrien s[0][0] = a; 293112336Sobrien s[1][0] = b; 294112336Sobrien if ((r = strcoll(s[0], s[1])) == 0) 295112336Sobrien r = (uschar)a - (uschar)b; 296112336Sobrien return r; 297112336Sobrien} 298112336Sobrien 299107806Sobrienchar *cclenter(const char *argp) /* add a character class */ 300107806Sobrien{ 30185587Sobrien int i, c, c2; 302112336Sobrien int j; 30385587Sobrien uschar *p = (uschar *) argp; 30485587Sobrien uschar *op, *bp; 30585587Sobrien static uschar *buf = 0; 30685587Sobrien static int bufsz = 100; 30785587Sobrien 30885587Sobrien op = p; 30985587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 31085587Sobrien FATAL("out of space for character class [%.10s...] 1", p); 31185587Sobrien bp = buf; 31285587Sobrien for (i = 0; (c = *p++) != 0; ) { 31385587Sobrien if (c == '\\') { 31485587Sobrien c = quoted((char **) &p); 31585587Sobrien } else if (c == '-' && i > 0 && bp[-1] != 0) { 31685587Sobrien if (*p != 0) { 31785587Sobrien c = bp[-1]; 31885587Sobrien c2 = *p++; 31985587Sobrien if (c2 == '\\') 32085587Sobrien c2 = quoted((char **) &p); 321112336Sobrien if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */ 32285587Sobrien bp--; 32385587Sobrien i--; 32485587Sobrien continue; 32585587Sobrien } 326112336Sobrien for (j = 0; j < NCHARS; j++) { 327112336Sobrien if ((collate_range_cmp(c, j) > 0) || 328112336Sobrien collate_range_cmp(j, c2) > 0) 329112336Sobrien continue; 33085587Sobrien if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) 33185587Sobrien FATAL("out of space for character class [%.10s...] 2", p); 332112336Sobrien *bp++ = j; 33385587Sobrien i++; 33485587Sobrien } 33585587Sobrien continue; 33685587Sobrien } 33785587Sobrien } 33885587Sobrien if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) 33985587Sobrien FATAL("out of space for character class [%.10s...] 3", p); 34085587Sobrien *bp++ = c; 34185587Sobrien i++; 34285587Sobrien } 34385587Sobrien *bp = 0; 34485587Sobrien dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 34585587Sobrien xfree(op); 34685587Sobrien return (char *) tostring((char *) buf); 34785587Sobrien} 34885587Sobrien 349107806Sobrienvoid overflo(const char *s) 35085587Sobrien{ 35185587Sobrien FATAL("regular expression too big: %.30s...", s); 35285587Sobrien} 35385587Sobrien 35485587Sobrienvoid cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 35585587Sobrien{ 35685587Sobrien int i; 35785587Sobrien int *p; 35885587Sobrien 35985587Sobrien switch (type(v)) { 36085587Sobrien LEAF 36185587Sobrien f->re[info(v)].ltype = type(v); 36285587Sobrien f->re[info(v)].lval.np = right(v); 36385587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 36485587Sobrien maxsetvec *= 4; 36585587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 36685587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 36785587Sobrien if (setvec == 0 || tmpset == 0) 36885587Sobrien overflo("out of space in cfoll()"); 36985587Sobrien } 37085587Sobrien for (i = 0; i <= f->accept; i++) 37185587Sobrien setvec[i] = 0; 37285587Sobrien setcnt = 0; 37385587Sobrien follow(v); /* computes setvec and setcnt */ 37485587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 37585587Sobrien overflo("out of space building follow set"); 37685587Sobrien f->re[info(v)].lfollow = p; 37785587Sobrien *p = setcnt; 37885587Sobrien for (i = f->accept; i >= 0; i--) 37985587Sobrien if (setvec[i] == 1) 38085587Sobrien *++p = i; 38185587Sobrien break; 38285587Sobrien UNARY 38385587Sobrien cfoll(f,left(v)); 38485587Sobrien break; 38585587Sobrien case CAT: 38685587Sobrien case OR: 38785587Sobrien cfoll(f,left(v)); 38885587Sobrien cfoll(f,right(v)); 38985587Sobrien break; 39085587Sobrien default: /* can't happen */ 39185587Sobrien FATAL("can't happen: unknown type %d in cfoll", type(v)); 39285587Sobrien } 39385587Sobrien} 39485587Sobrien 39585587Sobrienint first(Node *p) /* collects initially active leaves of p into setvec */ 39685587Sobrien /* returns 1 if p matches empty string */ 39785587Sobrien{ 39885587Sobrien int b, lp; 39985587Sobrien 40085587Sobrien switch (type(p)) { 40185587Sobrien LEAF 40285587Sobrien lp = info(p); /* look for high-water mark of subscripts */ 40385587Sobrien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 40485587Sobrien maxsetvec *= 4; 40585587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 40685587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 40785587Sobrien if (setvec == 0 || tmpset == 0) 40885587Sobrien overflo("out of space in first()"); 40985587Sobrien } 41085587Sobrien if (setvec[lp] != 1) { 41185587Sobrien setvec[lp] = 1; 41285587Sobrien setcnt++; 41385587Sobrien } 41485587Sobrien if (type(p) == CCL && (*(char *) right(p)) == '\0') 41585587Sobrien return(0); /* empty CCL */ 41685587Sobrien else return(1); 41785587Sobrien case PLUS: 41885587Sobrien if (first(left(p)) == 0) return(0); 41985587Sobrien return(1); 42085587Sobrien case STAR: 42185587Sobrien case QUEST: 42285587Sobrien first(left(p)); 42385587Sobrien return(0); 42485587Sobrien case CAT: 42585587Sobrien if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 42685587Sobrien return(1); 42785587Sobrien case OR: 42885587Sobrien b = first(right(p)); 42985587Sobrien if (first(left(p)) == 0 || b == 0) return(0); 43085587Sobrien return(1); 43185587Sobrien } 43285587Sobrien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 43385587Sobrien return(-1); 43485587Sobrien} 43585587Sobrien 43685587Sobrienvoid follow(Node *v) /* collects leaves that can follow v into setvec */ 43785587Sobrien{ 43885587Sobrien Node *p; 43985587Sobrien 44085587Sobrien if (type(v) == FINAL) 44185587Sobrien return; 44285587Sobrien p = parent(v); 44385587Sobrien switch (type(p)) { 44485587Sobrien case STAR: 44585587Sobrien case PLUS: 44685587Sobrien first(v); 44785587Sobrien follow(p); 44885587Sobrien return; 44985587Sobrien 45085587Sobrien case OR: 45185587Sobrien case QUEST: 45285587Sobrien follow(p); 45385587Sobrien return; 45485587Sobrien 45585587Sobrien case CAT: 45685587Sobrien if (v == left(p)) { /* v is left child of p */ 45785587Sobrien if (first(right(p)) == 0) { 45885587Sobrien follow(p); 45985587Sobrien return; 46085587Sobrien } 46185587Sobrien } else /* v is right child */ 46285587Sobrien follow(p); 46385587Sobrien return; 46485587Sobrien } 46585587Sobrien} 46685587Sobrien 467107806Sobrienint member(int c, const char *sarg) /* is c in s? */ 46885587Sobrien{ 46985587Sobrien uschar *s = (uschar *) sarg; 47085587Sobrien 47185587Sobrien while (*s) 47285587Sobrien if (c == *s++) 47385587Sobrien return(1); 47485587Sobrien return(0); 47585587Sobrien} 47685587Sobrien 477107806Sobrienint match(fa *f, const char *p0) /* shortest match ? */ 47885587Sobrien{ 47985587Sobrien int s, ns; 48085587Sobrien uschar *p = (uschar *) p0; 48185587Sobrien 48285587Sobrien s = f->reset ? makeinit(f,0) : f->initstat; 48385587Sobrien if (f->out[s]) 48485587Sobrien return(1); 48585587Sobrien do { 48685587Sobrien if ((ns = f->gototab[s][*p]) != 0) 48785587Sobrien s = ns; 48885587Sobrien else 48985587Sobrien s = cgoto(f, s, *p); 49085587Sobrien if (f->out[s]) 49185587Sobrien return(1); 49285587Sobrien } while (*p++ != 0); 49385587Sobrien return(0); 49485587Sobrien} 49585587Sobrien 496107806Sobrienint pmatch(fa *f, const char *p0) /* longest match, for sub */ 49785587Sobrien{ 49885587Sobrien int s, ns; 49985587Sobrien uschar *p = (uschar *) p0; 50085587Sobrien uschar *q; 50185587Sobrien int i, k; 50285587Sobrien 50385587Sobrien s = f->reset ? makeinit(f,1) : f->initstat; 50485587Sobrien patbeg = (char *) p; 50585587Sobrien patlen = -1; 50685587Sobrien do { 50785587Sobrien q = p; 50885587Sobrien do { 50985587Sobrien if (f->out[s]) /* final state */ 51085587Sobrien patlen = q-p; 51185587Sobrien if ((ns = f->gototab[s][*q]) != 0) 51285587Sobrien s = ns; 51385587Sobrien else 51485587Sobrien s = cgoto(f, s, *q); 51585587Sobrien if (s == 1) { /* no transition */ 51685587Sobrien if (patlen >= 0) { 51785587Sobrien patbeg = (char *) p; 51885587Sobrien return(1); 51985587Sobrien } 52085587Sobrien else 52185587Sobrien goto nextin; /* no match */ 52285587Sobrien } 52385587Sobrien } while (*q++ != 0); 52485587Sobrien if (f->out[s]) 52585587Sobrien patlen = q-p-1; /* don't count $ */ 52685587Sobrien if (patlen >= 0) { 52785587Sobrien patbeg = (char *) p; 52885587Sobrien return(1); 52985587Sobrien } 53085587Sobrien nextin: 53185587Sobrien s = 2; 53285587Sobrien if (f->reset) { 53385587Sobrien for (i = 2; i <= f->curstat; i++) 53485587Sobrien xfree(f->posns[i]); 53585587Sobrien k = *f->posns[0]; 53685587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 53785587Sobrien overflo("out of space in pmatch"); 53885587Sobrien for (i = 0; i <= k; i++) 53985587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 54085587Sobrien f->initstat = f->curstat = 2; 54185587Sobrien f->out[2] = f->out[0]; 54285587Sobrien for (i = 0; i < NCHARS; i++) 54385587Sobrien f->gototab[2][i] = 0; 54485587Sobrien } 54585587Sobrien } while (*p++ != 0); 54685587Sobrien return (0); 54785587Sobrien} 54885587Sobrien 549107806Sobrienint nematch(fa *f, const char *p0) /* non-empty match, for sub */ 55085587Sobrien{ 55185587Sobrien int s, ns; 55285587Sobrien uschar *p = (uschar *) p0; 55385587Sobrien uschar *q; 55485587Sobrien int i, k; 55585587Sobrien 55685587Sobrien s = f->reset ? makeinit(f,1) : f->initstat; 55785587Sobrien patlen = -1; 55885587Sobrien while (*p) { 55985587Sobrien q = p; 56085587Sobrien do { 56185587Sobrien if (f->out[s]) /* final state */ 56285587Sobrien patlen = q-p; 56385587Sobrien if ((ns = f->gototab[s][*q]) != 0) 56485587Sobrien s = ns; 56585587Sobrien else 56685587Sobrien s = cgoto(f, s, *q); 56785587Sobrien if (s == 1) { /* no transition */ 56885587Sobrien if (patlen > 0) { 56985587Sobrien patbeg = (char *) p; 57085587Sobrien return(1); 57185587Sobrien } else 57285587Sobrien goto nnextin; /* no nonempty match */ 57385587Sobrien } 57485587Sobrien } while (*q++ != 0); 57585587Sobrien if (f->out[s]) 57685587Sobrien patlen = q-p-1; /* don't count $ */ 57785587Sobrien if (patlen > 0 ) { 57885587Sobrien patbeg = (char *) p; 57985587Sobrien return(1); 58085587Sobrien } 58185587Sobrien nnextin: 58285587Sobrien s = 2; 58385587Sobrien if (f->reset) { 58485587Sobrien for (i = 2; i <= f->curstat; i++) 58585587Sobrien xfree(f->posns[i]); 58685587Sobrien k = *f->posns[0]; 58785587Sobrien if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 58885587Sobrien overflo("out of state space"); 58985587Sobrien for (i = 0; i <= k; i++) 59085587Sobrien (f->posns[2])[i] = (f->posns[0])[i]; 59185587Sobrien f->initstat = f->curstat = 2; 59285587Sobrien f->out[2] = f->out[0]; 59385587Sobrien for (i = 0; i < NCHARS; i++) 59485587Sobrien f->gototab[2][i] = 0; 59585587Sobrien } 59685587Sobrien p++; 59785587Sobrien } 59885587Sobrien return (0); 59985587Sobrien} 60085587Sobrien 601107806SobrienNode *reparse(const char *p) /* parses regular expression pointed to by p */ 60285587Sobrien{ /* uses relex() to scan regular expression */ 60385587Sobrien Node *np; 60485587Sobrien 60585587Sobrien dprintf( ("reparse <%s>\n", p) ); 60685587Sobrien lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 60785587Sobrien rtok = relex(); 608107806Sobrien /* GNU compatibility: an empty regexp matches anything */ 60985587Sobrien if (rtok == '\0') 610107806Sobrien /* FATAL("empty regular expression"); previous */ 611107806Sobrien return(op2(ALL, NIL, NIL)); 61285587Sobrien np = regexp(); 61385587Sobrien if (rtok != '\0') 61485587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 61585587Sobrien return(np); 61685587Sobrien} 61785587Sobrien 61885587SobrienNode *regexp(void) /* top-level parse of reg expr */ 61985587Sobrien{ 62085587Sobrien return (alt(concat(primary()))); 62185587Sobrien} 62285587Sobrien 62385587SobrienNode *primary(void) 62485587Sobrien{ 62585587Sobrien Node *np; 62685587Sobrien 62785587Sobrien switch (rtok) { 62885587Sobrien case CHAR: 62985587Sobrien np = op2(CHAR, NIL, itonp(rlxval)); 63085587Sobrien rtok = relex(); 63185587Sobrien return (unary(np)); 63285587Sobrien case ALL: 63385587Sobrien rtok = relex(); 63485587Sobrien return (unary(op2(ALL, NIL, NIL))); 63585587Sobrien case DOT: 63685587Sobrien rtok = relex(); 63785587Sobrien return (unary(op2(DOT, NIL, NIL))); 63885587Sobrien case CCL: 63985587Sobrien np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 64085587Sobrien rtok = relex(); 64185587Sobrien return (unary(np)); 64285587Sobrien case NCCL: 64385587Sobrien np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 64485587Sobrien rtok = relex(); 64585587Sobrien return (unary(np)); 64685587Sobrien case '^': 64785587Sobrien rtok = relex(); 64885587Sobrien return (unary(op2(CHAR, NIL, itonp(HAT)))); 64985587Sobrien case '$': 65085587Sobrien rtok = relex(); 65185587Sobrien return (unary(op2(CHAR, NIL, NIL))); 65285587Sobrien case '(': 65385587Sobrien rtok = relex(); 65485587Sobrien if (rtok == ')') { /* special pleading for () */ 65585587Sobrien rtok = relex(); 65685587Sobrien return unary(op2(CCL, NIL, (Node *) tostring(""))); 65785587Sobrien } 65885587Sobrien np = regexp(); 65985587Sobrien if (rtok == ')') { 66085587Sobrien rtok = relex(); 66185587Sobrien return (unary(np)); 66285587Sobrien } 66385587Sobrien else 66485587Sobrien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 66585587Sobrien default: 66685587Sobrien FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 66785587Sobrien } 66885587Sobrien return 0; /*NOTREACHED*/ 66985587Sobrien} 67085587Sobrien 67185587SobrienNode *concat(Node *np) 67285587Sobrien{ 67385587Sobrien switch (rtok) { 67485587Sobrien case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 67585587Sobrien return (concat(op2(CAT, np, primary()))); 67685587Sobrien } 67785587Sobrien return (np); 67885587Sobrien} 67985587Sobrien 68085587SobrienNode *alt(Node *np) 68185587Sobrien{ 68285587Sobrien if (rtok == OR) { 68385587Sobrien rtok = relex(); 68485587Sobrien return (alt(op2(OR, np, concat(primary())))); 68585587Sobrien } 68685587Sobrien return (np); 68785587Sobrien} 68885587Sobrien 68985587SobrienNode *unary(Node *np) 69085587Sobrien{ 69185587Sobrien switch (rtok) { 69285587Sobrien case STAR: 69385587Sobrien rtok = relex(); 69485587Sobrien return (unary(op2(STAR, np, NIL))); 69585587Sobrien case PLUS: 69685587Sobrien rtok = relex(); 69785587Sobrien return (unary(op2(PLUS, np, NIL))); 69885587Sobrien case QUEST: 69985587Sobrien rtok = relex(); 70085587Sobrien return (unary(op2(QUEST, np, NIL))); 70185587Sobrien default: 70285587Sobrien return (np); 70385587Sobrien } 70485587Sobrien} 70585587Sobrien 70690902Sdes/* 70790902Sdes * Character class definitions conformant to the POSIX locale as 70890902Sdes * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 70990902Sdes * and operating character sets are both ASCII (ISO646) or supersets 71090902Sdes * thereof. 71190902Sdes * 71290902Sdes * Note that to avoid overflowing the temporary buffer used in 71390902Sdes * relex(), the expanded character class (prior to range expansion) 71490902Sdes * must be less than twice the size of their full name. 71590902Sdes */ 716112336Sobrien 717112336Sobrien/* Because isblank doesn't show up in any of the header files on any 718112336Sobrien * system i use, it's defined here. if some other locale has a richer 719112336Sobrien * definition of "blank", define HAS_ISBLANK and provide your own 720112336Sobrien * version. 721112336Sobrien */ 722112336Sobrien 723112336Sobrien#ifndef HAS_ISBLANK 724112336Sobrien 725112336Sobrienint isblank(int c) 726112336Sobrien{ 727112336Sobrien return c==' ' || c=='\t'; 728112336Sobrien} 729112336Sobrien 730112336Sobrien#endif 731112336Sobrien 73290902Sdesstruct charclass { 73390902Sdes const char *cc_name; 73490902Sdes int cc_namelen; 735112336Sobrien int (*cc_func)(int); 73690902Sdes} charclasses[] = { 737112336Sobrien { "alnum", 5, isalnum }, 738112336Sobrien { "alpha", 5, isalpha }, 739112336Sobrien { "blank", 5, isblank }, 740112336Sobrien { "cntrl", 5, iscntrl }, 741112336Sobrien { "digit", 5, isdigit }, 742112336Sobrien { "graph", 5, isgraph }, 743112336Sobrien { "lower", 5, islower }, 744112336Sobrien { "print", 5, isprint }, 745112336Sobrien { "punct", 5, ispunct }, 746112336Sobrien { "space", 5, isspace }, 747112336Sobrien { "upper", 5, isupper }, 748112336Sobrien { "xdigit", 6, isxdigit }, 74990902Sdes { NULL, 0, NULL }, 75090902Sdes}; 75190902Sdes 75290902Sdes 75385587Sobrienint relex(void) /* lexical analyzer for reparse */ 75485587Sobrien{ 75585587Sobrien int c, n; 75685587Sobrien int cflag; 75785587Sobrien static uschar *buf = 0; 75885587Sobrien static int bufsz = 100; 75985587Sobrien uschar *bp; 76090902Sdes struct charclass *cc; 761112336Sobrien int i; 76285587Sobrien 76385587Sobrien switch (c = *prestr++) { 76485587Sobrien case '|': return OR; 76585587Sobrien case '*': return STAR; 76685587Sobrien case '+': return PLUS; 76785587Sobrien case '?': return QUEST; 76885587Sobrien case '.': return DOT; 76985587Sobrien case '\0': prestr--; return '\0'; 77085587Sobrien case '^': 77185587Sobrien case '$': 77285587Sobrien case '(': 77385587Sobrien case ')': 77485587Sobrien return c; 77585587Sobrien case '\\': 77685587Sobrien rlxval = quoted((char **) &prestr); 77785587Sobrien return CHAR; 77885587Sobrien default: 77985587Sobrien rlxval = c; 78085587Sobrien return CHAR; 78185587Sobrien case '[': 78285587Sobrien if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 78385587Sobrien FATAL("out of space in reg expr %.10s..", lastre); 78485587Sobrien bp = buf; 78585587Sobrien if (*prestr == '^') { 78685587Sobrien cflag = 1; 78785587Sobrien prestr++; 78885587Sobrien } 78985587Sobrien else 79085587Sobrien cflag = 0; 79190902Sdes n = 2 * strlen((const char *) prestr)+1; 79285587Sobrien if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0)) 79385587Sobrien FATAL("out of space for reg expr %.10s...", lastre); 79485587Sobrien for (; ; ) { 79585587Sobrien if ((c = *prestr++) == '\\') { 79685587Sobrien *bp++ = '\\'; 79785587Sobrien if ((c = *prestr++) == '\0') 79885587Sobrien FATAL("nonterminated character class %.20s...", lastre); 79985587Sobrien *bp++ = c; 80085587Sobrien /* } else if (c == '\n') { */ 80185587Sobrien /* FATAL("newline in character class %.20s...", lastre); */ 80290902Sdes } else if (c == '[' && *prestr == ':') { 80390902Sdes /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 80490902Sdes for (cc = charclasses; cc->cc_name; cc++) 80590902Sdes if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 80690902Sdes break; 80790902Sdes if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 80890902Sdes prestr[2 + cc->cc_namelen] == ']') { 80990902Sdes prestr += cc->cc_namelen + 3; 810112336Sobrien for (i = 0; i < NCHARS; i++) { 811112336Sobrien if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0)) 812112336Sobrien FATAL("out of space for reg expr %.10s...", lastre); 813112336Sobrien if (cc->cc_func(i)) { 814112336Sobrien *bp++ = i; 815112336Sobrien n++; 816112336Sobrien } 817112336Sobrien } 81890902Sdes } else 81990902Sdes *bp++ = c; 82085587Sobrien } else if (c == '\0') { 82185587Sobrien FATAL("nonterminated character class %.20s", lastre); 82285587Sobrien } else if (bp == buf) { /* 1st char is special */ 82385587Sobrien *bp++ = c; 82485587Sobrien } else if (c == ']') { 82585587Sobrien *bp++ = 0; 82685587Sobrien rlxstr = (uschar *) tostring((char *) buf); 82785587Sobrien if (cflag == 0) 82885587Sobrien return CCL; 82985587Sobrien else 83085587Sobrien return NCCL; 83185587Sobrien } else 83285587Sobrien *bp++ = c; 83385587Sobrien } 83485587Sobrien } 83585587Sobrien} 83685587Sobrien 83785587Sobrienint cgoto(fa *f, int s, int c) 83885587Sobrien{ 83985587Sobrien int i, j, k; 84085587Sobrien int *p, *q; 84185587Sobrien 84285587Sobrien if (c < 0 || c > 255) 84385587Sobrien FATAL("can't happen: neg char %d in cgoto", c); 84485587Sobrien while (f->accept >= maxsetvec) { /* guessing here! */ 84585587Sobrien maxsetvec *= 4; 84685587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 84785587Sobrien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 84885587Sobrien if (setvec == 0 || tmpset == 0) 84985587Sobrien overflo("out of space in cgoto()"); 85085587Sobrien } 85185587Sobrien for (i = 0; i <= f->accept; i++) 85285587Sobrien setvec[i] = 0; 85385587Sobrien setcnt = 0; 85485587Sobrien /* compute positions of gototab[s,c] into setvec */ 85585587Sobrien p = f->posns[s]; 85685587Sobrien for (i = 1; i <= *p; i++) { 85785587Sobrien if ((k = f->re[p[i]].ltype) != FINAL) { 85885587Sobrien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 85985587Sobrien || (k == DOT && c != 0 && c != HAT) 86085587Sobrien || (k == ALL && c != 0) 86185587Sobrien || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 86285587Sobrien || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 86385587Sobrien q = f->re[p[i]].lfollow; 86485587Sobrien for (j = 1; j <= *q; j++) { 86585587Sobrien if (q[j] >= maxsetvec) { 86685587Sobrien maxsetvec *= 4; 86785587Sobrien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 86885587Sobrien tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int)); 86985587Sobrien if (setvec == 0 || tmpset == 0) 87085587Sobrien overflo("cgoto overflow"); 87185587Sobrien } 87285587Sobrien if (setvec[q[j]] == 0) { 87385587Sobrien setcnt++; 87485587Sobrien setvec[q[j]] = 1; 87585587Sobrien } 87685587Sobrien } 87785587Sobrien } 87885587Sobrien } 87985587Sobrien } 88085587Sobrien /* determine if setvec is a previous state */ 88185587Sobrien tmpset[0] = setcnt; 88285587Sobrien j = 1; 88385587Sobrien for (i = f->accept; i >= 0; i--) 88485587Sobrien if (setvec[i]) { 88585587Sobrien tmpset[j++] = i; 88685587Sobrien } 88785587Sobrien /* tmpset == previous state? */ 88885587Sobrien for (i = 1; i <= f->curstat; i++) { 88985587Sobrien p = f->posns[i]; 89085587Sobrien if ((k = tmpset[0]) != p[0]) 89185587Sobrien goto different; 89285587Sobrien for (j = 1; j <= k; j++) 89385587Sobrien if (tmpset[j] != p[j]) 89485587Sobrien goto different; 89585587Sobrien /* setvec is state i */ 89685587Sobrien f->gototab[s][c] = i; 89785587Sobrien return i; 89885587Sobrien different:; 89985587Sobrien } 90085587Sobrien 90185587Sobrien /* add tmpset to current set of states */ 90285587Sobrien if (f->curstat >= NSTATES-1) { 90385587Sobrien f->curstat = 2; 90485587Sobrien f->reset = 1; 90585587Sobrien for (i = 2; i < NSTATES; i++) 90685587Sobrien xfree(f->posns[i]); 90785587Sobrien } else 90885587Sobrien ++(f->curstat); 90985587Sobrien for (i = 0; i < NCHARS; i++) 91085587Sobrien f->gototab[f->curstat][i] = 0; 91185587Sobrien xfree(f->posns[f->curstat]); 91285587Sobrien if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 91385587Sobrien overflo("out of space in cgoto"); 91485587Sobrien 91585587Sobrien f->posns[f->curstat] = p; 91685587Sobrien f->gototab[s][c] = f->curstat; 91785587Sobrien for (i = 0; i <= setcnt; i++) 91885587Sobrien p[i] = tmpset[i]; 91985587Sobrien if (setvec[f->accept]) 92085587Sobrien f->out[f->curstat] = 1; 92185587Sobrien else 92285587Sobrien f->out[f->curstat] = 0; 92385587Sobrien return f->curstat; 92485587Sobrien} 92585587Sobrien 92685587Sobrien 92785587Sobrienvoid freefa(fa *f) /* free a finite automaton */ 92885587Sobrien{ 92985587Sobrien int i; 93085587Sobrien 93185587Sobrien if (f == NULL) 93285587Sobrien return; 93385587Sobrien for (i = 0; i <= f->curstat; i++) 93485587Sobrien xfree(f->posns[i]); 93585587Sobrien for (i = 0; i <= f->accept; i++) { 93685587Sobrien xfree(f->re[i].lfollow); 93785587Sobrien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 93885587Sobrien xfree((f->re[i].lval.np)); 93985587Sobrien } 94085587Sobrien xfree(f->restr); 94185587Sobrien xfree(f); 94285587Sobrien} 943