b.c revision 107806
145405Smsmith/**************************************************************** 245405SmsmithCopyright (C) Lucent Technologies 1997 345405SmsmithAll Rights Reserved 445405Smsmith 545405SmsmithPermission to use, copy, modify, and distribute this software and 645405Smsmithits documentation for any purpose and without fee is hereby 745405Smsmithgranted, provided that the above copyright notice appear in all 845405Smsmithcopies and that both that the copyright notice and this 945405Smsmithpermission notice and warranty disclaimer appear in supporting 1045405Smsmithdocumentation, and that the name Lucent Technologies or any of 1145405Smsmithits entities not be used in advertising or publicity pertaining 1245405Smsmithto distribution of the software without specific, written prior 1345405Smsmithpermission. 1445405Smsmith 1545405SmsmithLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 1645405SmsmithINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 1745405SmsmithIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 1845405SmsmithSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1945405SmsmithWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 2045405SmsmithIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 2145405SmsmithARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 2245405SmsmithTHIS SOFTWARE. 2345405Smsmith****************************************************************/ 2445405Smsmith 2545405Smsmith/* lasciate ogne speranza, voi ch'entrate. */ 2645405Smsmith 27115683Sobrien#define DEBUG 28115683Sobrien 29115683Sobrien#include <ctype.h> 3045405Smsmith#include <stdio.h> 3145405Smsmith#include <string.h> 3245405Smsmith#include <stdlib.h> 3345405Smsmith#include "awk.h" 3445405Smsmith#include "ytab.h" 3576078Sjhb 36106842Smdodd#define HAT (NCHARS-2) /* matches ^ in regular expr */ 3745405Smsmith /* NCHARS is 2**n */ 3845405Smsmith#define MAXLIN 22 3945405Smsmith 4045405Smsmith#define type(v) (v)->nobj /* badly overloaded here */ 4145405Smsmith#define info(v) (v)->ntype /* badly overloaded here */ 4245405Smsmith#define left(v) (v)->narg[0] 4345405Smsmith#define right(v) (v)->narg[1] 4445405Smsmith#define parent(v) (v)->nnext 4545405Smsmith 4645405Smsmith#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 4745405Smsmith#define UNARY case STAR: case PLUS: case QUEST: 4845405Smsmith 4945405Smsmith/* encoding in tree Nodes: 50177070Sjhb leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL): 5145405Smsmith left is index, right contains value or pointer to value 52177070Sjhb unary (STAR, PLUS, QUEST): left is child, right is null 53177070Sjhb binary (CAT, OR): left and right are children 54177070Sjhb parent contains pointer to parent 55177070Sjhb*/ 5645405Smsmith 57177070Sjhb 58177070Sjhbint *setvec; 59177070Sjhbint *tmpset; 60177070Sjhbint maxsetvec = 0; 61177070Sjhb 6245405Smsmithint rtok; /* next token in current re */ 63177070Sjhbint rlxval; 64177070Sjhbstatic uschar *rlxstr; 6545405Smsmithstatic uschar *prestr; /* current position in current re */ 66177070Sjhbstatic uschar *lastre; /* origin of last re */ 67106842Smdodd 68121307Ssilbystatic int setcnt; 69177070Sjhbstatic int poscnt; 70106842Smdodd 71177070Sjhbchar *patbeg; 72177070Sjhbint patlen; 73177070Sjhb 74177070Sjhb#define NFA 20 /* cache this many dynamic fa's */ 7545405Smsmithfa *fatab[NFA]; 7645405Smsmithint nfatab = 0; /* entries in fatab */ 77177070Sjhb 78177070Sjhbfa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 79177070Sjhb{ 8045405Smsmith int i, use, nuse; 8145405Smsmith fa *pfa; 8246215Smsmith static int now = 1; 83177070Sjhb 8446215Smsmith if (setvec == 0) { /* first time through any RE */ 85177070Sjhb maxsetvec = MAXLIN; 86177070Sjhb setvec = (int *) malloc(maxsetvec * sizeof(int)); 87177070Sjhb tmpset = (int *) malloc(maxsetvec * sizeof(int)); 88177070Sjhb if (setvec == 0 || tmpset == 0) 89177070Sjhb overflo("out of space initializing makedfa"); 90177070Sjhb } 91177070Sjhb 92177070Sjhb if (compile_time) /* a constant for sure */ 93177070Sjhb return mkdfa(s, anchor); 94177070Sjhb for (i = 0; i < nfatab; i++) /* is it there already? */ 95177070Sjhb if (fatab[i]->anchor == anchor 96177070Sjhb && strcmp((const char *) fatab[i]->restr, s) == 0) { 97177070Sjhb fatab[i]->use = now++; 98177070Sjhb return fatab[i]; 9945405Smsmith } 10045405Smsmith pfa = mkdfa(s, anchor); 10145405Smsmith if (nfatab < NFA) { /* room for another */ 102177070Sjhb fatab[nfatab] = pfa; 103177070Sjhb fatab[nfatab]->use = now++; 104177070Sjhb nfatab++; 105177070Sjhb return pfa; 106177070Sjhb } 107177070Sjhb use = fatab[0]->use; /* replace least-recently used */ 108177070Sjhb nuse = 0; 10945405Smsmith for (i = 1; i < nfatab; i++) 11045405Smsmith if (fatab[i]->use < use) { 111177070Sjhb use = fatab[i]->use; 11294683Sdwmalone nuse = i; 11394683Sdwmalone } 114177070Sjhb freefa(fatab[nuse]); 115177070Sjhb fatab[nuse] = pfa; 116177070Sjhb pfa->use = now++; 11794683Sdwmalone return pfa; 118177070Sjhb} 119177070Sjhb 12094683Sdwmalonefa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 12194683Sdwmalone /* anchor = 1 for anchored matches, else 0 */ 122177070Sjhb{ 12394683Sdwmalone Node *p, *p1; 12448925Smsmith fa *f; 12594683Sdwmalone 126177070Sjhb p = reparse(s); 127177070Sjhb p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 128177070Sjhb /* put ALL STAR in front of reg. exp. */ 12994683Sdwmalone p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 13094683Sdwmalone /* put FINAL after reg. exp. */ 13194683Sdwmalone 13294683Sdwmalone poscnt = 0; 13394683Sdwmalone penter(p1); /* enter parent pointers and leaf indices */ 134177070Sjhb if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 135177070Sjhb overflo("out of space for fa"); 13694683Sdwmalone f->accept = poscnt-1; /* penter has computed number of positions in re */ 13745405Smsmith cfoll(f, p1); /* set up follow sets */ 13845405Smsmith freetr(p1); 13945405Smsmith if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 14045405Smsmith overflo("out of space in makedfa"); 14145405Smsmith if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 142177070Sjhb overflo("out of space in makedfa"); 14345405Smsmith *f->posns[1] = 0; 144177070Sjhb f->initstat = makeinit(f, anchor); 145177070Sjhb f->anchor = anchor; 146177070Sjhb f->restr = (uschar *) tostring(s); 147177070Sjhb return f; 148177070Sjhb} 149177070Sjhb 150177070Sjhbint makeinit(fa *f, int anchor) 151177070Sjhb{ 15245405Smsmith int i, k; 15345405Smsmith 15445405Smsmith f->curstat = 2; 155177070Sjhb f->out[2] = 0; 156177070Sjhb f->reset = 0; 157177070Sjhb k = *(f->re[0].lfollow); 158177070Sjhb xfree(f->posns[2]); 15945405Smsmith if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 16045405Smsmith overflo("out of space in makeinit"); 16145405Smsmith for (i=0; i <= k; i++) { 16245405Smsmith (f->posns[2])[i] = (f->re[0].lfollow)[i]; 163177070Sjhb } 164177070Sjhb if ((f->posns[2])[1] == f->accept) 165177070Sjhb f->out[2] = 1; 16645405Smsmith for (i=0; i < NCHARS; i++) 167177070Sjhb f->gototab[2][i] = 0; 16845405Smsmith f->curstat = cgoto(f, 2, HAT); 169177070Sjhb if (anchor) { 170177070Sjhb *f->posns[2] = k-1; /* leave out position 0 */ 171177070Sjhb for (i=0; i < k; i++) { 172177070Sjhb (f->posns[0])[i] = (f->posns[2])[i]; 173177070Sjhb } 174177070Sjhb 175177070Sjhb f->out[0] = f->out[2]; 176177070Sjhb if (f->curstat != 2) 177177070Sjhb --(*f->posns[f->curstat]); 178177070Sjhb } 179177070Sjhb return f->curstat; 180177070Sjhb} 181177070Sjhb 182177070Sjhbvoid penter(Node *p) /* set up parent pointers and leaf indices */ 183177070Sjhb{ 184177070Sjhb switch (type(p)) { 185177070Sjhb LEAF 186177070Sjhb info(p) = poscnt; 187177070Sjhb poscnt++; 188177070Sjhb break; 189177070Sjhb UNARY 190177070Sjhb penter(left(p)); 191177070Sjhb parent(left(p)) = p; 192177070Sjhb break; 193177070Sjhb case CAT: 194177070Sjhb case OR: 195177070Sjhb penter(left(p)); 196177070Sjhb penter(right(p)); 197177070Sjhb parent(left(p)) = p; 198177070Sjhb parent(right(p)) = p; 199177070Sjhb break; 200177070Sjhb default: /* can't happen */ 201177070Sjhb FATAL("can't happen: unknown type %d in penter", type(p)); 202177070Sjhb break; 203177070Sjhb } 204177070Sjhb} 205177070Sjhb 206177070Sjhbvoid freetr(Node *p) /* free parse tree */ 20745405Smsmith{ 208177070Sjhb switch (type(p)) { 209177070Sjhb LEAF 210177070Sjhb xfree(p); 211177070Sjhb break; 212177070Sjhb UNARY 21345405Smsmith freetr(left(p)); 214177070Sjhb xfree(p); 215177070Sjhb break; 216177070Sjhb case CAT: 217177070Sjhb case OR: 218177070Sjhb freetr(left(p)); 219177070Sjhb freetr(right(p)); 220177070Sjhb xfree(p); 221177070Sjhb break; 222177070Sjhb default: /* can't happen */ 223177070Sjhb FATAL("can't happen: unknown type %d in freetr", type(p)); 224177070Sjhb break; 225177070Sjhb } 226177070Sjhb} 227177070Sjhb 228177070Sjhb/* in the parsing of regular expressions, metacharacters like . have */ 229177070Sjhb/* to be seen literally; \056 is not a metacharacter. */ 23045405Smsmith 23145405Smsmithint hexstr(char **pp) /* find and eval hex string at pp, return new p */ 23245405Smsmith{ /* only pick up one 8-bit byte (2 chars) */ 23345405Smsmith uschar *p; 23445405Smsmith int n = 0; 23545405Smsmith int i; 23645405Smsmith 23745405Smsmith for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 23845405Smsmith if (isdigit(*p)) 239177070Sjhb n = 16 * n + *p - '0'; 24045405Smsmith else if (*p >= 'a' && *p <= 'f') 241177070Sjhb n = 16 * n + *p - 'a' + 10; 24245405Smsmith else if (*p >= 'A' && *p <= 'F') 243177070Sjhb n = 16 * n + *p - 'A' + 10; 244177070Sjhb } 245177070Sjhb *pp = (char *) p; 246177070Sjhb return n; 247177070Sjhb} 248177070Sjhb 249177070Sjhb#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 25045405Smsmith 25145405Smsmithint quoted(char **pp) /* pick up next thing after a \\ */ 25294683Sdwmalone /* and increment *pp */ 25394683Sdwmalone{ 25494683Sdwmalone char *p = *pp; 25594683Sdwmalone int c; 25694683Sdwmalone 25794683Sdwmalone if ((c = *p++) == 't') 258177070Sjhb c = '\t'; 259177070Sjhb else if (c == 'n') 26094683Sdwmalone c = '\n'; 26194683Sdwmalone else if (c == 'f') 26245405Smsmith c = '\f'; 26346215Smsmith else if (c == 'r') 26446215Smsmith c = '\r'; 26546215Smsmith else if (c == 'b') 26646215Smsmith c = '\b'; 26745405Smsmith else if (c == '\\') 26848925Smsmith c = '\\'; 26945405Smsmith else if (c == 'x') { /* hexadecimal goo follows */ 27045405Smsmith c = hexstr(&p); /* this adds a null if number is invalid */ 27145405Smsmith } else if (isoctdigit(c)) { /* \d \dd \ddd */ 272177070Sjhb int n = c - '0'; 273177070Sjhb if (isoctdigit(*p)) { 274177070Sjhb n = 8 * n + *p++ - '0'; 275177070Sjhb if (isoctdigit(*p)) 276177070Sjhb n = 8 * n + *p++ - '0'; 277177070Sjhb } 278177070Sjhb c = n; 27948925Smsmith } /* else */ 280177070Sjhb /* c = c; */ 281177070Sjhb *pp = p; 282177070Sjhb return c; 28348925Smsmith} 28446215Smsmith 28546215Smsmithstatic int collate_range_cmp(int a, int b) 28646215Smsmith{ 28746215Smsmith int r; 288177070Sjhb static char s[2][2]; 289177070Sjhb 29046215Smsmith if ((uschar)a == (uschar)b) 29148925Smsmith return 0; 29248925Smsmith s[0][0] = a; 29346215Smsmith s[1][0] = b; 294177070Sjhb if ((r = strcoll(s[0], s[1])) == 0) 295177070Sjhb r = (uschar)a - (uschar)b; 296177070Sjhb return r; 297177070Sjhb} 298177070Sjhb 29946215Smsmithchar *cclenter(const char *argp) /* add a character class */ 300177070Sjhb{ 30146215Smsmith int i, c, c2; 302177070Sjhb int j; 303177070Sjhb uschar *p = (uschar *) argp; 304177070Sjhb uschar *op, *bp; 305177070Sjhb static uschar *buf = 0; 30645405Smsmith static int bufsz = 100; 307177070Sjhb 308177070Sjhb op = p; 309177070Sjhb if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 310177070Sjhb FATAL("out of space for character class [%.10s...] 1", p); 311177070Sjhb bp = buf; 312177070Sjhb for (i = 0; (c = *p++) != 0; ) { 313177070Sjhb if (c == '\\') { 314177070Sjhb c = quoted((char **) &p); 315177070Sjhb } else if (c == '-' && i > 0 && bp[-1] != 0) { 316177070Sjhb if (*p != 0) { 317177070Sjhb c = bp[-1]; 318177070Sjhb c2 = *p++; 319177070Sjhb if (c2 == '\\') 320177070Sjhb c2 = quoted((char **) &p); 321177070Sjhb if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */ 322177070Sjhb bp--; 323177070Sjhb i--; 324177070Sjhb continue; 325177070Sjhb } 326177070Sjhb for (j = 0; j < NCHARS; j++) { 327177070Sjhb if ((collate_range_cmp(c, j) > 0) || 328177070Sjhb collate_range_cmp(j, c2) > 0) 329177070Sjhb continue; 330177070Sjhb if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) 331177070Sjhb FATAL("out of space for character class [%.10s...] 2", p); 332177070Sjhb *bp++ = j; 333177070Sjhb i++; 334177070Sjhb } 335177070Sjhb continue; 336177070Sjhb } 337177070Sjhb } 338177070Sjhb if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) 339177070Sjhb FATAL("out of space for character class [%.10s...] 3", p); 340177070Sjhb *bp++ = c; 341177070Sjhb i++; 342177070Sjhb } 343177070Sjhb *bp = 0; 344177070Sjhb dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 345177070Sjhb xfree(op); 346177070Sjhb return (char *) tostring((char *) buf); 347177070Sjhb} 348177070Sjhb 349177070Sjhbvoid overflo(const char *s) 350177070Sjhb{ 351177070Sjhb FATAL("regular expression too big: %.30s...", s); 352177070Sjhb} 353177070Sjhb 35445405Smsmithvoid cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 355177070Sjhb{ 356177070Sjhb int i; 357177070Sjhb int *p; 358177070Sjhb 359177070Sjhb switch (type(v)) { 360177070Sjhb LEAF 361177070Sjhb f->re[info(v)].ltype = type(v); 362177070Sjhb f->re[info(v)].lval.np = right(v); 363177070Sjhb while (f->accept >= maxsetvec) { /* guessing here! */ 364177070Sjhb maxsetvec *= 4; 365177070Sjhb setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 366177070Sjhb tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 367177070Sjhb if (setvec == 0 || tmpset == 0) 368177070Sjhb overflo("out of space in cfoll()"); 369177070Sjhb } 370177070Sjhb for (i = 0; i <= f->accept; i++) 371177070Sjhb setvec[i] = 0; 372177070Sjhb setcnt = 0; 373177070Sjhb follow(v); /* computes setvec and setcnt */ 374177070Sjhb if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 375177070Sjhb overflo("out of space building follow set"); 376177070Sjhb f->re[info(v)].lfollow = p; 37745405Smsmith *p = setcnt; 37845405Smsmith for (i = f->accept; i >= 0; i--) 379177070Sjhb if (setvec[i] == 1) 380177070Sjhb *++p = i; 381177070Sjhb break; 382177070Sjhb UNARY 383177070Sjhb cfoll(f,left(v)); 384177070Sjhb break; 385177070Sjhb case CAT: 386177070Sjhb case OR: 387177070Sjhb cfoll(f,left(v)); 388177070Sjhb cfoll(f,right(v)); 389177070Sjhb break; 39045405Smsmith default: /* can't happen */ 39145405Smsmith FATAL("can't happen: unknown type %d in cfoll", type(v)); 39245405Smsmith } 39345405Smsmith} 39445405Smsmith 39545405Smsmithint first(Node *p) /* collects initially active leaves of p into setvec */ 39645405Smsmith /* returns 1 if p matches empty string */ 39745405Smsmith{ 398177070Sjhb int b, lp; 399177070Sjhb 400177070Sjhb switch (type(p)) { 401177070Sjhb LEAF 402177070Sjhb lp = info(p); /* look for high-water mark of subscripts */ 403177070Sjhb while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 404177070Sjhb maxsetvec *= 4; 405177070Sjhb setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 406177070Sjhb tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 40745405Smsmith if (setvec == 0 || tmpset == 0) 40845405Smsmith overflo("out of space in first()"); 40945405Smsmith } 410177070Sjhb if (setvec[lp] != 1) { 411177070Sjhb setvec[lp] = 1; 41245405Smsmith setcnt++; 413177070Sjhb } 414177070Sjhb if (type(p) == CCL && (*(char *) right(p)) == '\0') 415177070Sjhb return(0); /* empty CCL */ 41645405Smsmith else return(1); 417177070Sjhb case PLUS: 418177070Sjhb if (first(left(p)) == 0) return(0); 41945405Smsmith return(1); 42045405Smsmith case STAR: 42145405Smsmith case QUEST: 42245405Smsmith first(left(p)); 423177070Sjhb return(0); 42445405Smsmith case CAT: 425177070Sjhb if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 426177070Sjhb return(1); 427177070Sjhb case OR: 428177070Sjhb b = first(right(p)); 42945405Smsmith if (first(left(p)) == 0 || b == 0) return(0); 430177070Sjhb return(1); 431177070Sjhb } 432177070Sjhb FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 433177070Sjhb return(-1); 434177070Sjhb} 435177070Sjhb 436177070Sjhbvoid follow(Node *v) /* collects leaves that can follow v into setvec */ 437177070Sjhb{ 438103346Sdwmalone Node *p; 439177070Sjhb 440177070Sjhb if (type(v) == FINAL) 441177070Sjhb return; 442103346Sdwmalone p = parent(v); 443103346Sdwmalone switch (type(p)) { 444177070Sjhb case STAR: 44545405Smsmith case PLUS: 44645405Smsmith first(v); 44745405Smsmith follow(p); 44845405Smsmith return; 44945405Smsmith 45045405Smsmith case OR: 45145405Smsmith case QUEST: 45245405Smsmith follow(p); 453177070Sjhb return; 454177070Sjhb 45545405Smsmith case CAT: 456177070Sjhb if (v == left(p)) { /* v is left child of p */ 457177070Sjhb if (first(right(p)) == 0) { 458177070Sjhb follow(p); 459177070Sjhb return; 460177070Sjhb } 461177070Sjhb } else /* v is right child */ 462177070Sjhb follow(p); 463177070Sjhb return; 464177070Sjhb } 465177070Sjhb} 466177070Sjhb 467177070Sjhbint member(int c, const char *sarg) /* is c in s? */ 468177070Sjhb{ 469177070Sjhb uschar *s = (uschar *) sarg; 470177070Sjhb 471177070Sjhb while (*s) 472177070Sjhb if (c == *s++) 473177070Sjhb return(1); 474177070Sjhb return(0); 475177070Sjhb} 476177070Sjhb 477177070Sjhbint match(fa *f, const char *p0) /* shortest match ? */ 478177070Sjhb{ 479177070Sjhb int s, ns; 480177070Sjhb uschar *p = (uschar *) p0; 481177070Sjhb 482177070Sjhb s = f->reset ? makeinit(f,0) : f->initstat; 483177070Sjhb if (f->out[s]) 484177070Sjhb return(1); 485177070Sjhb do { 486177070Sjhb if ((ns = f->gototab[s][*p]) != 0) 487177070Sjhb s = ns; 488177070Sjhb else 489177070Sjhb s = cgoto(f, s, *p); 490177070Sjhb if (f->out[s]) 491177070Sjhb return(1); 492177070Sjhb } while (*p++ != 0); 493177070Sjhb return(0); 494177070Sjhb} 495177070Sjhb 496177070Sjhbint pmatch(fa *f, const char *p0) /* longest match, for sub */ 497177070Sjhb{ 498177070Sjhb int s, ns; 499177070Sjhb uschar *p = (uschar *) p0; 500177070Sjhb uschar *q; 50145405Smsmith int i, k; 50245405Smsmith 503177070Sjhb s = f->reset ? makeinit(f,1) : f->initstat; 504177070Sjhb patbeg = (char *) p; 505177070Sjhb patlen = -1; 506177070Sjhb do { 507177070Sjhb q = p; 508177070Sjhb do { 509177070Sjhb if (f->out[s]) /* final state */ 510177070Sjhb patlen = q-p; 511177070Sjhb if ((ns = f->gototab[s][*q]) != 0) 512177070Sjhb s = ns; 51345405Smsmith else 51445405Smsmith s = cgoto(f, s, *q); 51545405Smsmith if (s == 1) { /* no transition */ 51645405Smsmith if (patlen >= 0) { 51745405Smsmith patbeg = (char *) p; 51845405Smsmith return(1); 51945405Smsmith } 52045405Smsmith else 521177070Sjhb goto nextin; /* no match */ 522177070Sjhb } 52345405Smsmith } while (*q++ != 0); 524177070Sjhb if (f->out[s]) 525177070Sjhb patlen = q-p-1; /* don't count $ */ 526177070Sjhb if (patlen >= 0) { 527177070Sjhb patbeg = (char *) p; 528177070Sjhb return(1); 529177070Sjhb } 530177070Sjhb nextin: 531177070Sjhb s = 2; 532177070Sjhb if (f->reset) { 53345405Smsmith for (i = 2; i <= f->curstat; i++) 534177070Sjhb xfree(f->posns[i]); 53545405Smsmith k = *f->posns[0]; 536177070Sjhb if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 537177070Sjhb overflo("out of space in pmatch"); 538177070Sjhb for (i = 0; i <= k; i++) 539177070Sjhb (f->posns[2])[i] = (f->posns[0])[i]; 540177070Sjhb f->initstat = f->curstat = 2; 541177070Sjhb f->out[2] = f->out[0]; 542177070Sjhb for (i = 0; i < NCHARS; i++) 543177070Sjhb f->gototab[2][i] = 0; 544177070Sjhb } 545177070Sjhb } while (*p++ != 0); 546177070Sjhb return (0); 547177070Sjhb} 548177070Sjhb 549177070Sjhbint nematch(fa *f, const char *p0) /* non-empty match, for sub */ 550177070Sjhb{ 551177070Sjhb int s, ns; 552177070Sjhb uschar *p = (uschar *) p0; 553177070Sjhb uschar *q; 554177070Sjhb int i, k; 555177070Sjhb 556177070Sjhb s = f->reset ? makeinit(f,1) : f->initstat; 557177070Sjhb patlen = -1; 558177070Sjhb while (*p) { 559177070Sjhb q = p; 560177070Sjhb do { 56145405Smsmith if (f->out[s]) /* final state */ 56245405Smsmith patlen = q-p; 563177070Sjhb if ((ns = f->gototab[s][*q]) != 0) 564177070Sjhb s = ns; 56545405Smsmith else 566177070Sjhb s = cgoto(f, s, *q); 567177070Sjhb if (s == 1) { /* no transition */ 568177070Sjhb if (patlen > 0) { 56945405Smsmith patbeg = (char *) p; 57045405Smsmith return(1); 57145405Smsmith } else 572177070Sjhb goto nnextin; /* no nonempty match */ 573177070Sjhb } 57445405Smsmith } while (*q++ != 0); 57545405Smsmith if (f->out[s]) 57645405Smsmith patlen = q-p-1; /* don't count $ */ 57745405Smsmith if (patlen > 0 ) { 578177070Sjhb patbeg = (char *) p; 579177070Sjhb return(1); 58045405Smsmith } 581177070Sjhb nnextin: 582177070Sjhb s = 2; 58345405Smsmith if (f->reset) { 584177070Sjhb for (i = 2; i <= f->curstat; i++) 585177070Sjhb xfree(f->posns[i]); 586177070Sjhb k = *f->posns[0]; 587177070Sjhb if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 588177070Sjhb overflo("out of state space"); 589177070Sjhb for (i = 0; i <= k; i++) 590177070Sjhb (f->posns[2])[i] = (f->posns[0])[i]; 59145405Smsmith f->initstat = f->curstat = 2; 592177070Sjhb f->out[2] = f->out[0]; 59345405Smsmith for (i = 0; i < NCHARS; i++) 594177070Sjhb f->gototab[2][i] = 0; 595177070Sjhb } 596177070Sjhb p++; 597177070Sjhb } 598177070Sjhb return (0); 59945405Smsmith} 600177070Sjhb 601177070SjhbNode *reparse(const char *p) /* parses regular expression pointed to by p */ 602177070Sjhb{ /* uses relex() to scan regular expression */ 60345405Smsmith Node *np; 604177070Sjhb 60545405Smsmith dprintf( ("reparse <%s>\n", p) ); 606177070Sjhb lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 607177070Sjhb rtok = relex(); 608177070Sjhb /* GNU compatibility: an empty regexp matches anything */ 609177070Sjhb if (rtok == '\0') 610177070Sjhb /* FATAL("empty regular expression"); previous */ 611177070Sjhb return(op2(ALL, NIL, NIL)); 612177070Sjhb np = regexp(); 613177070Sjhb if (rtok != '\0') 614177070Sjhb FATAL("syntax error in regular expression %s at %s", lastre, prestr); 615177070Sjhb return(np); 616177070Sjhb} 617177070Sjhb 618177070SjhbNode *regexp(void) /* top-level parse of reg expr */ 619177070Sjhb{ 620177070Sjhb return (alt(concat(primary()))); 621177070Sjhb} 622177070Sjhb 623177070SjhbNode *primary(void) 624177070Sjhb{ 625177070Sjhb Node *np; 62645405Smsmith 627177070Sjhb switch (rtok) { 628177070Sjhb case CHAR: 629177070Sjhb np = op2(CHAR, NIL, itonp(rlxval)); 630177070Sjhb rtok = relex(); 631177070Sjhb return (unary(np)); 632177070Sjhb case ALL: 633177070Sjhb rtok = relex(); 634177070Sjhb return (unary(op2(ALL, NIL, NIL))); 635177070Sjhb case DOT: 636177070Sjhb rtok = relex(); 637177070Sjhb return (unary(op2(DOT, NIL, NIL))); 63845405Smsmith case CCL: 63945405Smsmith np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 64045405Smsmith rtok = relex(); 64146215Smsmith return (unary(np)); 64246215Smsmith case NCCL: 64346215Smsmith np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 64445405Smsmith rtok = relex(); 64546215Smsmith return (unary(np)); 64646215Smsmith case '^': 647177070Sjhb rtok = relex(); 648177070Sjhb return (unary(op2(CHAR, NIL, itonp(HAT)))); 649177070Sjhb case '$': 65046215Smsmith rtok = relex(); 65146215Smsmith return (unary(op2(CHAR, NIL, NIL))); 65246215Smsmith case '(': 65345405Smsmith rtok = relex(); 65445405Smsmith if (rtok == ')') { /* special pleading for () */ 655177070Sjhb rtok = relex(); 656177124Sjhb return unary(op2(CCL, NIL, (Node *) tostring(""))); 657177124Sjhb } 658177124Sjhb np = regexp(); 659177124Sjhb if (rtok == ')') { 660177124Sjhb rtok = relex(); 661177124Sjhb return (unary(np)); 662177124Sjhb } 663177124Sjhb else 664177124Sjhb FATAL("syntax error in regular expression %s at %s", lastre, prestr); 665177124Sjhb default: 66645405Smsmith FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 667177070Sjhb } 668 return 0; /*NOTREACHED*/ 669} 670 671Node *concat(Node *np) 672{ 673 switch (rtok) { 674 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 675 return (concat(op2(CAT, np, primary()))); 676 } 677 return (np); 678} 679 680Node *alt(Node *np) 681{ 682 if (rtok == OR) { 683 rtok = relex(); 684 return (alt(op2(OR, np, concat(primary())))); 685 } 686 return (np); 687} 688 689Node *unary(Node *np) 690{ 691 switch (rtok) { 692 case STAR: 693 rtok = relex(); 694 return (unary(op2(STAR, np, NIL))); 695 case PLUS: 696 rtok = relex(); 697 return (unary(op2(PLUS, np, NIL))); 698 case QUEST: 699 rtok = relex(); 700 return (unary(op2(QUEST, np, NIL))); 701 default: 702 return (np); 703 } 704} 705 706/* 707 * Character class definitions conformant to the POSIX locale as 708 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 709 * and operating character sets are both ASCII (ISO646) or supersets 710 * thereof. 711 * 712 * Note that to avoid overflowing the temporary buffer used in 713 * relex(), the expanded character class (prior to range expansion) 714 * must be less than twice the size of their full name. 715 */ 716 717struct charclass { 718 const char *cc_name; 719 int cc_namelen; 720 int (*cc_func)(int); 721} charclasses[] = { 722 { "alnum", 5, isalnum }, 723 { "alpha", 5, isalpha }, 724 { "blank", 5, isblank }, 725 { "cntrl", 5, iscntrl }, 726 { "digit", 5, isdigit }, 727 { "graph", 5, isgraph }, 728 { "lower", 5, islower }, 729 { "print", 5, isprint }, 730 { "punct", 5, ispunct }, 731 { "space", 5, isspace }, 732 { "upper", 5, isupper }, 733 { "xdigit", 6, isxdigit }, 734 { NULL, 0, NULL }, 735}; 736 737 738int relex(void) /* lexical analyzer for reparse */ 739{ 740 int c, n; 741 int cflag; 742 static uschar *buf = 0; 743 static int bufsz = 100; 744 uschar *bp; 745 struct charclass *cc; 746 int i; 747 748 switch (c = *prestr++) { 749 case '|': return OR; 750 case '*': return STAR; 751 case '+': return PLUS; 752 case '?': return QUEST; 753 case '.': return DOT; 754 case '\0': prestr--; return '\0'; 755 case '^': 756 case '$': 757 case '(': 758 case ')': 759 return c; 760 case '\\': 761 rlxval = quoted((char **) &prestr); 762 return CHAR; 763 default: 764 rlxval = c; 765 return CHAR; 766 case '[': 767 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 768 FATAL("out of space in reg expr %.10s..", lastre); 769 bp = buf; 770 if (*prestr == '^') { 771 cflag = 1; 772 prestr++; 773 } 774 else 775 cflag = 0; 776 n = 2 * strlen((const char *) prestr)+1; 777 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0)) 778 FATAL("out of space for reg expr %.10s...", lastre); 779 for (; ; ) { 780 if ((c = *prestr++) == '\\') { 781 *bp++ = '\\'; 782 if ((c = *prestr++) == '\0') 783 FATAL("nonterminated character class %.20s...", lastre); 784 *bp++ = c; 785 /* } else if (c == '\n') { */ 786 /* FATAL("newline in character class %.20s...", lastre); */ 787 } else if (c == '[' && *prestr == ':') { 788 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 789 for (cc = charclasses; cc->cc_name; cc++) 790 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 791 break; 792 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 793 prestr[2 + cc->cc_namelen] == ']') { 794 prestr += cc->cc_namelen + 3; 795 for (i = 0; i < NCHARS; i++) { 796 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0)) 797 FATAL("out of space for reg expr %.10s...", lastre); 798 if (cc->cc_func(i)) { 799 *bp++ = i; 800 n++; 801 } 802 } 803 } else 804 *bp++ = c; 805 } else if (c == '\0') { 806 FATAL("nonterminated character class %.20s", lastre); 807 } else if (bp == buf) { /* 1st char is special */ 808 *bp++ = c; 809 } else if (c == ']') { 810 *bp++ = 0; 811 rlxstr = (uschar *) tostring((char *) buf); 812 if (cflag == 0) 813 return CCL; 814 else 815 return NCCL; 816 } else 817 *bp++ = c; 818 } 819 } 820} 821 822int cgoto(fa *f, int s, int c) 823{ 824 int i, j, k; 825 int *p, *q; 826 827 if (c < 0 || c > 255) 828 FATAL("can't happen: neg char %d in cgoto", c); 829 while (f->accept >= maxsetvec) { /* guessing here! */ 830 maxsetvec *= 4; 831 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 832 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 833 if (setvec == 0 || tmpset == 0) 834 overflo("out of space in cgoto()"); 835 } 836 for (i = 0; i <= f->accept; i++) 837 setvec[i] = 0; 838 setcnt = 0; 839 /* compute positions of gototab[s,c] into setvec */ 840 p = f->posns[s]; 841 for (i = 1; i <= *p; i++) { 842 if ((k = f->re[p[i]].ltype) != FINAL) { 843 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 844 || (k == DOT && c != 0 && c != HAT) 845 || (k == ALL && c != 0) 846 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 847 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 848 q = f->re[p[i]].lfollow; 849 for (j = 1; j <= *q; j++) { 850 if (q[j] >= maxsetvec) { 851 maxsetvec *= 4; 852 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 853 tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int)); 854 if (setvec == 0 || tmpset == 0) 855 overflo("cgoto overflow"); 856 } 857 if (setvec[q[j]] == 0) { 858 setcnt++; 859 setvec[q[j]] = 1; 860 } 861 } 862 } 863 } 864 } 865 /* determine if setvec is a previous state */ 866 tmpset[0] = setcnt; 867 j = 1; 868 for (i = f->accept; i >= 0; i--) 869 if (setvec[i]) { 870 tmpset[j++] = i; 871 } 872 /* tmpset == previous state? */ 873 for (i = 1; i <= f->curstat; i++) { 874 p = f->posns[i]; 875 if ((k = tmpset[0]) != p[0]) 876 goto different; 877 for (j = 1; j <= k; j++) 878 if (tmpset[j] != p[j]) 879 goto different; 880 /* setvec is state i */ 881 f->gototab[s][c] = i; 882 return i; 883 different:; 884 } 885 886 /* add tmpset to current set of states */ 887 if (f->curstat >= NSTATES-1) { 888 f->curstat = 2; 889 f->reset = 1; 890 for (i = 2; i < NSTATES; i++) 891 xfree(f->posns[i]); 892 } else 893 ++(f->curstat); 894 for (i = 0; i < NCHARS; i++) 895 f->gototab[f->curstat][i] = 0; 896 xfree(f->posns[f->curstat]); 897 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 898 overflo("out of space in cgoto"); 899 900 f->posns[f->curstat] = p; 901 f->gototab[s][c] = f->curstat; 902 for (i = 0; i <= setcnt; i++) 903 p[i] = tmpset[i]; 904 if (setvec[f->accept]) 905 f->out[f->curstat] = 1; 906 else 907 f->out[f->curstat] = 0; 908 return f->curstat; 909} 910 911 912void freefa(fa *f) /* free a finite automaton */ 913{ 914 int i; 915 916 if (f == NULL) 917 return; 918 for (i = 0; i <= f->curstat; i++) 919 xfree(f->posns[i]); 920 for (i = 0; i <= f->accept; i++) { 921 xfree(f->re[i].lfollow); 922 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 923 xfree((f->re[i].lval.np)); 924 } 925 xfree(f->restr); 926 xfree(f); 927} 928