b.c revision 146299
185587Sobrien/****************************************************************
285587SobrienCopyright (C) Lucent Technologies 1997
385587SobrienAll Rights Reserved
485587Sobrien
585587SobrienPermission to use, copy, modify, and distribute this software and
685587Sobrienits documentation for any purpose and without fee is hereby
785587Sobriengranted, provided that the above copyright notice appear in all
885587Sobriencopies and that both that the copyright notice and this
985587Sobrienpermission notice and warranty disclaimer appear in supporting
1085587Sobriendocumentation, and that the name Lucent Technologies or any of
1185587Sobrienits entities not be used in advertising or publicity pertaining
1285587Sobriento distribution of the software without specific, written prior
1385587Sobrienpermission.
1485587Sobrien
1585587SobrienLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
1685587SobrienINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
1785587SobrienIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
1885587SobrienSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1985587SobrienWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
2085587SobrienIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
2185587SobrienARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
2285587SobrienTHIS SOFTWARE.
2385587Sobrien****************************************************************/
2485587Sobrien
2585587Sobrien/* lasciate ogne speranza, voi ch'entrate. */
2685587Sobrien
2785587Sobrien#define	DEBUG
2885587Sobrien
2985587Sobrien#include <ctype.h>
3085587Sobrien#include <stdio.h>
3185587Sobrien#include <string.h>
3285587Sobrien#include <stdlib.h>
3385587Sobrien#include "awk.h"
3485587Sobrien#include "ytab.h"
3585587Sobrien
36118194Sru#define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
3785587Sobrien				/* NCHARS is 2**n */
3885587Sobrien#define MAXLIN 22
3985587Sobrien
4085587Sobrien#define type(v)		(v)->nobj	/* badly overloaded here */
4185587Sobrien#define info(v)		(v)->ntype	/* badly overloaded here */
4285587Sobrien#define left(v)		(v)->narg[0]
4385587Sobrien#define right(v)	(v)->narg[1]
4485587Sobrien#define parent(v)	(v)->nnext
4585587Sobrien
4685587Sobrien#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
4785587Sobrien#define UNARY	case STAR: case PLUS: case QUEST:
4885587Sobrien
4985587Sobrien/* encoding in tree Nodes:
5085587Sobrien	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
5185587Sobrien		left is index, right contains value or pointer to value
5285587Sobrien	unary (STAR, PLUS, QUEST): left is child, right is null
5385587Sobrien	binary (CAT, OR): left and right are children
5485587Sobrien	parent contains pointer to parent
5585587Sobrien*/
5685587Sobrien
5785587Sobrien
5885587Sobrienint	*setvec;
5985587Sobrienint	*tmpset;
6085587Sobrienint	maxsetvec = 0;
6185587Sobrien
6285587Sobrienint	rtok;		/* next token in current re */
6385587Sobrienint	rlxval;
6485587Sobrienstatic uschar	*rlxstr;
6585587Sobrienstatic uschar	*prestr;	/* current position in current re */
6685587Sobrienstatic uschar	*lastre;	/* origin of last re */
6785587Sobrien
6885587Sobrienstatic	int setcnt;
6985587Sobrienstatic	int poscnt;
7085587Sobrien
7185587Sobrienchar	*patbeg;
7285587Sobrienint	patlen;
7385587Sobrien
7485587Sobrien#define	NFA	20	/* cache this many dynamic fa's */
7585587Sobrienfa	*fatab[NFA];
7685587Sobrienint	nfatab	= 0;	/* entries in fatab */
7785587Sobrien
78107806Sobrienfa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
7985587Sobrien{
8085587Sobrien	int i, use, nuse;
8185587Sobrien	fa *pfa;
8285587Sobrien	static int now = 1;
8385587Sobrien
8485587Sobrien	if (setvec == 0) {	/* first time through any RE */
8585587Sobrien		maxsetvec = MAXLIN;
8685587Sobrien		setvec = (int *) malloc(maxsetvec * sizeof(int));
8785587Sobrien		tmpset = (int *) malloc(maxsetvec * sizeof(int));
8885587Sobrien		if (setvec == 0 || tmpset == 0)
8985587Sobrien			overflo("out of space initializing makedfa");
9085587Sobrien	}
9185587Sobrien
9285587Sobrien	if (compile_time)	/* a constant for sure */
9385587Sobrien		return mkdfa(s, anchor);
9485587Sobrien	for (i = 0; i < nfatab; i++)	/* is it there already? */
9585587Sobrien		if (fatab[i]->anchor == anchor
9690902Sdes		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
9785587Sobrien			fatab[i]->use = now++;
9885587Sobrien			return fatab[i];
9985587Sobrien		}
10085587Sobrien	pfa = mkdfa(s, anchor);
10185587Sobrien	if (nfatab < NFA) {	/* room for another */
10285587Sobrien		fatab[nfatab] = pfa;
10385587Sobrien		fatab[nfatab]->use = now++;
10485587Sobrien		nfatab++;
10585587Sobrien		return pfa;
10685587Sobrien	}
10785587Sobrien	use = fatab[0]->use;	/* replace least-recently used */
10885587Sobrien	nuse = 0;
10985587Sobrien	for (i = 1; i < nfatab; i++)
11085587Sobrien		if (fatab[i]->use < use) {
11185587Sobrien			use = fatab[i]->use;
11285587Sobrien			nuse = i;
11385587Sobrien		}
11485587Sobrien	freefa(fatab[nuse]);
11585587Sobrien	fatab[nuse] = pfa;
11685587Sobrien	pfa->use = now++;
11785587Sobrien	return pfa;
11885587Sobrien}
11985587Sobrien
120107806Sobrienfa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
12185587Sobrien				/* anchor = 1 for anchored matches, else 0 */
12285587Sobrien{
12385587Sobrien	Node *p, *p1;
12485587Sobrien	fa *f;
12585587Sobrien
12685587Sobrien	p = reparse(s);
12785587Sobrien	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
12885587Sobrien		/* put ALL STAR in front of reg.  exp. */
12985587Sobrien	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
13085587Sobrien		/* put FINAL after reg.  exp. */
13185587Sobrien
13285587Sobrien	poscnt = 0;
13385587Sobrien	penter(p1);	/* enter parent pointers and leaf indices */
13485587Sobrien	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
13585587Sobrien		overflo("out of space for fa");
13685587Sobrien	f->accept = poscnt-1;	/* penter has computed number of positions in re */
13785587Sobrien	cfoll(f, p1);	/* set up follow sets */
13885587Sobrien	freetr(p1);
13985587Sobrien	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
14085587Sobrien			overflo("out of space in makedfa");
14185587Sobrien	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
14285587Sobrien		overflo("out of space in makedfa");
14385587Sobrien	*f->posns[1] = 0;
14485587Sobrien	f->initstat = makeinit(f, anchor);
14585587Sobrien	f->anchor = anchor;
14685587Sobrien	f->restr = (uschar *) tostring(s);
14785587Sobrien	return f;
14885587Sobrien}
14985587Sobrien
15085587Sobrienint makeinit(fa *f, int anchor)
15185587Sobrien{
15285587Sobrien	int i, k;
15385587Sobrien
15485587Sobrien	f->curstat = 2;
15585587Sobrien	f->out[2] = 0;
15685587Sobrien	f->reset = 0;
15785587Sobrien	k = *(f->re[0].lfollow);
15885587Sobrien	xfree(f->posns[2]);
15985587Sobrien	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
16085587Sobrien		overflo("out of space in makeinit");
16185587Sobrien	for (i=0; i <= k; i++) {
16285587Sobrien		(f->posns[2])[i] = (f->re[0].lfollow)[i];
16385587Sobrien	}
16485587Sobrien	if ((f->posns[2])[1] == f->accept)
16585587Sobrien		f->out[2] = 1;
16685587Sobrien	for (i=0; i < NCHARS; i++)
16785587Sobrien		f->gototab[2][i] = 0;
16885587Sobrien	f->curstat = cgoto(f, 2, HAT);
16985587Sobrien	if (anchor) {
17085587Sobrien		*f->posns[2] = k-1;	/* leave out position 0 */
17185587Sobrien		for (i=0; i < k; i++) {
17285587Sobrien			(f->posns[0])[i] = (f->posns[2])[i];
17385587Sobrien		}
17485587Sobrien
17585587Sobrien		f->out[0] = f->out[2];
17685587Sobrien		if (f->curstat != 2)
17785587Sobrien			--(*f->posns[f->curstat]);
17885587Sobrien	}
17985587Sobrien	return f->curstat;
18085587Sobrien}
18185587Sobrien
18285587Sobrienvoid penter(Node *p)	/* set up parent pointers and leaf indices */
18385587Sobrien{
18485587Sobrien	switch (type(p)) {
18585587Sobrien	LEAF
18685587Sobrien		info(p) = poscnt;
18785587Sobrien		poscnt++;
18885587Sobrien		break;
18985587Sobrien	UNARY
19085587Sobrien		penter(left(p));
19185587Sobrien		parent(left(p)) = p;
19285587Sobrien		break;
19385587Sobrien	case CAT:
19485587Sobrien	case OR:
19585587Sobrien		penter(left(p));
19685587Sobrien		penter(right(p));
19785587Sobrien		parent(left(p)) = p;
19885587Sobrien		parent(right(p)) = p;
19985587Sobrien		break;
20085587Sobrien	default:	/* can't happen */
20185587Sobrien		FATAL("can't happen: unknown type %d in penter", type(p));
20285587Sobrien		break;
20385587Sobrien	}
20485587Sobrien}
20585587Sobrien
20685587Sobrienvoid freetr(Node *p)	/* free parse tree */
20785587Sobrien{
20885587Sobrien	switch (type(p)) {
20985587Sobrien	LEAF
21085587Sobrien		xfree(p);
21185587Sobrien		break;
21285587Sobrien	UNARY
21385587Sobrien		freetr(left(p));
21485587Sobrien		xfree(p);
21585587Sobrien		break;
21685587Sobrien	case CAT:
21785587Sobrien	case OR:
21885587Sobrien		freetr(left(p));
21985587Sobrien		freetr(right(p));
22085587Sobrien		xfree(p);
22185587Sobrien		break;
22285587Sobrien	default:	/* can't happen */
22385587Sobrien		FATAL("can't happen: unknown type %d in freetr", type(p));
22485587Sobrien		break;
22585587Sobrien	}
22685587Sobrien}
22785587Sobrien
22885587Sobrien/* in the parsing of regular expressions, metacharacters like . have */
22985587Sobrien/* to be seen literally;  \056 is not a metacharacter. */
23085587Sobrien
23185587Sobrienint hexstr(char **pp)	/* find and eval hex string at pp, return new p */
23285587Sobrien{			/* only pick up one 8-bit byte (2 chars) */
23385587Sobrien	uschar *p;
23485587Sobrien	int n = 0;
23585587Sobrien	int i;
23685587Sobrien
23785587Sobrien	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
23885587Sobrien		if (isdigit(*p))
23985587Sobrien			n = 16 * n + *p - '0';
24085587Sobrien		else if (*p >= 'a' && *p <= 'f')
24185587Sobrien			n = 16 * n + *p - 'a' + 10;
24285587Sobrien		else if (*p >= 'A' && *p <= 'F')
24385587Sobrien			n = 16 * n + *p - 'A' + 10;
24485587Sobrien	}
24585587Sobrien	*pp = (char *) p;
24685587Sobrien	return n;
24785587Sobrien}
24885587Sobrien
24985587Sobrien#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
25085587Sobrien
25185587Sobrienint quoted(char **pp)	/* pick up next thing after a \\ */
25285587Sobrien			/* and increment *pp */
25385587Sobrien{
25485587Sobrien	char *p = *pp;
25585587Sobrien	int c;
25685587Sobrien
25785587Sobrien	if ((c = *p++) == 't')
25885587Sobrien		c = '\t';
25985587Sobrien	else if (c == 'n')
26085587Sobrien		c = '\n';
26185587Sobrien	else if (c == 'f')
26285587Sobrien		c = '\f';
26385587Sobrien	else if (c == 'r')
26485587Sobrien		c = '\r';
26585587Sobrien	else if (c == 'b')
26685587Sobrien		c = '\b';
26785587Sobrien	else if (c == '\\')
26885587Sobrien		c = '\\';
26985587Sobrien	else if (c == 'x') {	/* hexadecimal goo follows */
27085587Sobrien		c = hexstr(&p);	/* this adds a null if number is invalid */
27185587Sobrien	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
27285587Sobrien		int n = c - '0';
27385587Sobrien		if (isoctdigit(*p)) {
27485587Sobrien			n = 8 * n + *p++ - '0';
27585587Sobrien			if (isoctdigit(*p))
27685587Sobrien				n = 8 * n + *p++ - '0';
27785587Sobrien		}
27885587Sobrien		c = n;
27985587Sobrien	} /* else */
28085587Sobrien		/* c = c; */
28185587Sobrien	*pp = p;
28285587Sobrien	return c;
28385587Sobrien}
28485587Sobrien
285107806Sobrienchar *cclenter(const char *argp)	/* add a character class */
286107806Sobrien{
28785587Sobrien	int i, c, c2;
28885587Sobrien	uschar *p = (uschar *) argp;
28985587Sobrien	uschar *op, *bp;
29085587Sobrien	static uschar *buf = 0;
29185587Sobrien	static int bufsz = 100;
29285587Sobrien
29385587Sobrien	op = p;
29485587Sobrien	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
29585587Sobrien		FATAL("out of space for character class [%.10s...] 1", p);
29685587Sobrien	bp = buf;
29785587Sobrien	for (i = 0; (c = *p++) != 0; ) {
29885587Sobrien		if (c == '\\') {
29985587Sobrien			c = quoted((char **) &p);
30085587Sobrien		} else if (c == '-' && i > 0 && bp[-1] != 0) {
30185587Sobrien			if (*p != 0) {
30285587Sobrien				c = bp[-1];
30385587Sobrien				c2 = *p++;
30485587Sobrien				if (c2 == '\\')
30585587Sobrien					c2 = quoted((char **) &p);
306118194Sru				if (c > c2) {	/* empty; ignore */
30785587Sobrien					bp--;
30885587Sobrien					i--;
30985587Sobrien					continue;
31085587Sobrien				}
311118194Sru				while (c < c2) {
31285587Sobrien					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
31385587Sobrien						FATAL("out of space for character class [%.10s...] 2", p);
314118194Sru					*bp++ = ++c;
31585587Sobrien					i++;
31685587Sobrien				}
31785587Sobrien				continue;
31885587Sobrien			}
31985587Sobrien		}
32085587Sobrien		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
32185587Sobrien			FATAL("out of space for character class [%.10s...] 3", p);
32285587Sobrien		*bp++ = c;
32385587Sobrien		i++;
32485587Sobrien	}
32585587Sobrien	*bp = 0;
32685587Sobrien	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
32785587Sobrien	xfree(op);
32885587Sobrien	return (char *) tostring((char *) buf);
32985587Sobrien}
33085587Sobrien
331107806Sobrienvoid overflo(const char *s)
33285587Sobrien{
33385587Sobrien	FATAL("regular expression too big: %.30s...", s);
33485587Sobrien}
33585587Sobrien
33685587Sobrienvoid cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
33785587Sobrien{
33885587Sobrien	int i;
33985587Sobrien	int *p;
34085587Sobrien
34185587Sobrien	switch (type(v)) {
34285587Sobrien	LEAF
34385587Sobrien		f->re[info(v)].ltype = type(v);
34485587Sobrien		f->re[info(v)].lval.np = right(v);
34585587Sobrien		while (f->accept >= maxsetvec) {	/* guessing here! */
34685587Sobrien			maxsetvec *= 4;
34785587Sobrien			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
34885587Sobrien			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
34985587Sobrien			if (setvec == 0 || tmpset == 0)
35085587Sobrien				overflo("out of space in cfoll()");
35185587Sobrien		}
35285587Sobrien		for (i = 0; i <= f->accept; i++)
35385587Sobrien			setvec[i] = 0;
35485587Sobrien		setcnt = 0;
35585587Sobrien		follow(v);	/* computes setvec and setcnt */
35685587Sobrien		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
35785587Sobrien			overflo("out of space building follow set");
35885587Sobrien		f->re[info(v)].lfollow = p;
35985587Sobrien		*p = setcnt;
36085587Sobrien		for (i = f->accept; i >= 0; i--)
36185587Sobrien			if (setvec[i] == 1)
36285587Sobrien				*++p = i;
36385587Sobrien		break;
36485587Sobrien	UNARY
36585587Sobrien		cfoll(f,left(v));
36685587Sobrien		break;
36785587Sobrien	case CAT:
36885587Sobrien	case OR:
36985587Sobrien		cfoll(f,left(v));
37085587Sobrien		cfoll(f,right(v));
37185587Sobrien		break;
37285587Sobrien	default:	/* can't happen */
37385587Sobrien		FATAL("can't happen: unknown type %d in cfoll", type(v));
37485587Sobrien	}
37585587Sobrien}
37685587Sobrien
37785587Sobrienint first(Node *p)	/* collects initially active leaves of p into setvec */
37885587Sobrien			/* returns 1 if p matches empty string */
37985587Sobrien{
38085587Sobrien	int b, lp;
38185587Sobrien
38285587Sobrien	switch (type(p)) {
38385587Sobrien	LEAF
38485587Sobrien		lp = info(p);	/* look for high-water mark of subscripts */
38585587Sobrien		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
38685587Sobrien			maxsetvec *= 4;
38785587Sobrien			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
38885587Sobrien			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
38985587Sobrien			if (setvec == 0 || tmpset == 0)
39085587Sobrien				overflo("out of space in first()");
39185587Sobrien		}
39285587Sobrien		if (setvec[lp] != 1) {
39385587Sobrien			setvec[lp] = 1;
39485587Sobrien			setcnt++;
39585587Sobrien		}
39685587Sobrien		if (type(p) == CCL && (*(char *) right(p)) == '\0')
39785587Sobrien			return(0);		/* empty CCL */
39885587Sobrien		else return(1);
39985587Sobrien	case PLUS:
40085587Sobrien		if (first(left(p)) == 0) return(0);
40185587Sobrien		return(1);
40285587Sobrien	case STAR:
40385587Sobrien	case QUEST:
40485587Sobrien		first(left(p));
40585587Sobrien		return(0);
40685587Sobrien	case CAT:
40785587Sobrien		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
40885587Sobrien		return(1);
40985587Sobrien	case OR:
41085587Sobrien		b = first(right(p));
41185587Sobrien		if (first(left(p)) == 0 || b == 0) return(0);
41285587Sobrien		return(1);
41385587Sobrien	}
41485587Sobrien	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
41585587Sobrien	return(-1);
41685587Sobrien}
41785587Sobrien
41885587Sobrienvoid follow(Node *v)	/* collects leaves that can follow v into setvec */
41985587Sobrien{
42085587Sobrien	Node *p;
42185587Sobrien
42285587Sobrien	if (type(v) == FINAL)
42385587Sobrien		return;
42485587Sobrien	p = parent(v);
42585587Sobrien	switch (type(p)) {
42685587Sobrien	case STAR:
42785587Sobrien	case PLUS:
42885587Sobrien		first(v);
42985587Sobrien		follow(p);
43085587Sobrien		return;
43185587Sobrien
43285587Sobrien	case OR:
43385587Sobrien	case QUEST:
43485587Sobrien		follow(p);
43585587Sobrien		return;
43685587Sobrien
43785587Sobrien	case CAT:
43885587Sobrien		if (v == left(p)) {	/* v is left child of p */
43985587Sobrien			if (first(right(p)) == 0) {
44085587Sobrien				follow(p);
44185587Sobrien				return;
44285587Sobrien			}
44385587Sobrien		} else		/* v is right child */
44485587Sobrien			follow(p);
44585587Sobrien		return;
44685587Sobrien	}
44785587Sobrien}
44885587Sobrien
449107806Sobrienint member(int c, const char *sarg)	/* is c in s? */
45085587Sobrien{
45185587Sobrien	uschar *s = (uschar *) sarg;
45285587Sobrien
45385587Sobrien	while (*s)
45485587Sobrien		if (c == *s++)
45585587Sobrien			return(1);
45685587Sobrien	return(0);
45785587Sobrien}
45885587Sobrien
459107806Sobrienint match(fa *f, const char *p0)	/* shortest match ? */
46085587Sobrien{
46185587Sobrien	int s, ns;
46285587Sobrien	uschar *p = (uschar *) p0;
46385587Sobrien
46485587Sobrien	s = f->reset ? makeinit(f,0) : f->initstat;
46585587Sobrien	if (f->out[s])
46685587Sobrien		return(1);
46785587Sobrien	do {
468146299Sru		assert(*p < NCHARS);
46985587Sobrien		if ((ns = f->gototab[s][*p]) != 0)
47085587Sobrien			s = ns;
47185587Sobrien		else
47285587Sobrien			s = cgoto(f, s, *p);
47385587Sobrien		if (f->out[s])
47485587Sobrien			return(1);
47585587Sobrien	} while (*p++ != 0);
47685587Sobrien	return(0);
47785587Sobrien}
47885587Sobrien
479107806Sobrienint pmatch(fa *f, const char *p0)	/* longest match, for sub */
48085587Sobrien{
48185587Sobrien	int s, ns;
48285587Sobrien	uschar *p = (uschar *) p0;
48385587Sobrien	uschar *q;
48485587Sobrien	int i, k;
48585587Sobrien
486125601Sru	/* s = f->reset ? makeinit(f,1) : f->initstat; */
487125601Sru	if (f->reset) {
488125601Sru		f->initstat = s = makeinit(f,1);
489125601Sru	} else {
490125601Sru		s = f->initstat;
491125601Sru	}
49285587Sobrien	patbeg = (char *) p;
49385587Sobrien	patlen = -1;
49485587Sobrien	do {
49585587Sobrien		q = p;
49685587Sobrien		do {
49785587Sobrien			if (f->out[s])		/* final state */
49885587Sobrien				patlen = q-p;
499146299Sru			assert(*q < NCHARS);
50085587Sobrien			if ((ns = f->gototab[s][*q]) != 0)
50185587Sobrien				s = ns;
50285587Sobrien			else
50385587Sobrien				s = cgoto(f, s, *q);
50485587Sobrien			if (s == 1) {	/* no transition */
50585587Sobrien				if (patlen >= 0) {
50685587Sobrien					patbeg = (char *) p;
50785587Sobrien					return(1);
50885587Sobrien				}
50985587Sobrien				else
51085587Sobrien					goto nextin;	/* no match */
51185587Sobrien			}
51285587Sobrien		} while (*q++ != 0);
51385587Sobrien		if (f->out[s])
51485587Sobrien			patlen = q-p-1;	/* don't count $ */
51585587Sobrien		if (patlen >= 0) {
51685587Sobrien			patbeg = (char *) p;
51785587Sobrien			return(1);
51885587Sobrien		}
51985587Sobrien	nextin:
52085587Sobrien		s = 2;
52185587Sobrien		if (f->reset) {
52285587Sobrien			for (i = 2; i <= f->curstat; i++)
52385587Sobrien				xfree(f->posns[i]);
52485587Sobrien			k = *f->posns[0];
52585587Sobrien			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
52685587Sobrien				overflo("out of space in pmatch");
52785587Sobrien			for (i = 0; i <= k; i++)
52885587Sobrien				(f->posns[2])[i] = (f->posns[0])[i];
52985587Sobrien			f->initstat = f->curstat = 2;
53085587Sobrien			f->out[2] = f->out[0];
53185587Sobrien			for (i = 0; i < NCHARS; i++)
53285587Sobrien				f->gototab[2][i] = 0;
53385587Sobrien		}
53485587Sobrien	} while (*p++ != 0);
53585587Sobrien	return (0);
53685587Sobrien}
53785587Sobrien
538107806Sobrienint nematch(fa *f, const char *p0)	/* non-empty match, for sub */
53985587Sobrien{
54085587Sobrien	int s, ns;
54185587Sobrien	uschar *p = (uschar *) p0;
54285587Sobrien	uschar *q;
54385587Sobrien	int i, k;
54485587Sobrien
545125601Sru	/* s = f->reset ? makeinit(f,1) : f->initstat; */
546125601Sru	if (f->reset) {
547125601Sru		f->initstat = s = makeinit(f,1);
548125601Sru	} else {
549125601Sru		s = f->initstat;
550125601Sru	}
55185587Sobrien	patlen = -1;
55285587Sobrien	while (*p) {
55385587Sobrien		q = p;
55485587Sobrien		do {
55585587Sobrien			if (f->out[s])		/* final state */
55685587Sobrien				patlen = q-p;
557146299Sru			assert(*q < NCHARS);
55885587Sobrien			if ((ns = f->gototab[s][*q]) != 0)
55985587Sobrien				s = ns;
56085587Sobrien			else
56185587Sobrien				s = cgoto(f, s, *q);
56285587Sobrien			if (s == 1) {	/* no transition */
56385587Sobrien				if (patlen > 0) {
56485587Sobrien					patbeg = (char *) p;
56585587Sobrien					return(1);
56685587Sobrien				} else
56785587Sobrien					goto nnextin;	/* no nonempty match */
56885587Sobrien			}
56985587Sobrien		} while (*q++ != 0);
57085587Sobrien		if (f->out[s])
57185587Sobrien			patlen = q-p-1;	/* don't count $ */
57285587Sobrien		if (patlen > 0 ) {
57385587Sobrien			patbeg = (char *) p;
57485587Sobrien			return(1);
57585587Sobrien		}
57685587Sobrien	nnextin:
57785587Sobrien		s = 2;
57885587Sobrien		if (f->reset) {
57985587Sobrien			for (i = 2; i <= f->curstat; i++)
58085587Sobrien				xfree(f->posns[i]);
58185587Sobrien			k = *f->posns[0];
58285587Sobrien			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
58385587Sobrien				overflo("out of state space");
58485587Sobrien			for (i = 0; i <= k; i++)
58585587Sobrien				(f->posns[2])[i] = (f->posns[0])[i];
58685587Sobrien			f->initstat = f->curstat = 2;
58785587Sobrien			f->out[2] = f->out[0];
58885587Sobrien			for (i = 0; i < NCHARS; i++)
58985587Sobrien				f->gototab[2][i] = 0;
59085587Sobrien		}
59185587Sobrien		p++;
59285587Sobrien	}
59385587Sobrien	return (0);
59485587Sobrien}
59585587Sobrien
596107806SobrienNode *reparse(const char *p)	/* parses regular expression pointed to by p */
59785587Sobrien{			/* uses relex() to scan regular expression */
59885587Sobrien	Node *np;
59985587Sobrien
60085587Sobrien	dprintf( ("reparse <%s>\n", p) );
60185587Sobrien	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
60285587Sobrien	rtok = relex();
603107806Sobrien	/* GNU compatibility: an empty regexp matches anything */
60485587Sobrien	if (rtok == '\0')
605107806Sobrien		/* FATAL("empty regular expression"); previous */
606107806Sobrien		return(op2(ALL, NIL, NIL));
60785587Sobrien	np = regexp();
60885587Sobrien	if (rtok != '\0')
60985587Sobrien		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
61085587Sobrien	return(np);
61185587Sobrien}
61285587Sobrien
61385587SobrienNode *regexp(void)	/* top-level parse of reg expr */
61485587Sobrien{
61585587Sobrien	return (alt(concat(primary())));
61685587Sobrien}
61785587Sobrien
61885587SobrienNode *primary(void)
61985587Sobrien{
62085587Sobrien	Node *np;
62185587Sobrien
62285587Sobrien	switch (rtok) {
62385587Sobrien	case CHAR:
62485587Sobrien		np = op2(CHAR, NIL, itonp(rlxval));
62585587Sobrien		rtok = relex();
62685587Sobrien		return (unary(np));
62785587Sobrien	case ALL:
62885587Sobrien		rtok = relex();
62985587Sobrien		return (unary(op2(ALL, NIL, NIL)));
63085587Sobrien	case DOT:
63185587Sobrien		rtok = relex();
63285587Sobrien		return (unary(op2(DOT, NIL, NIL)));
63385587Sobrien	case CCL:
63485587Sobrien		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
63585587Sobrien		rtok = relex();
63685587Sobrien		return (unary(np));
63785587Sobrien	case NCCL:
63885587Sobrien		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
63985587Sobrien		rtok = relex();
64085587Sobrien		return (unary(np));
64185587Sobrien	case '^':
64285587Sobrien		rtok = relex();
64385587Sobrien		return (unary(op2(CHAR, NIL, itonp(HAT))));
64485587Sobrien	case '$':
64585587Sobrien		rtok = relex();
64685587Sobrien		return (unary(op2(CHAR, NIL, NIL)));
64785587Sobrien	case '(':
64885587Sobrien		rtok = relex();
64985587Sobrien		if (rtok == ')') {	/* special pleading for () */
65085587Sobrien			rtok = relex();
65185587Sobrien			return unary(op2(CCL, NIL, (Node *) tostring("")));
65285587Sobrien		}
65385587Sobrien		np = regexp();
65485587Sobrien		if (rtok == ')') {
65585587Sobrien			rtok = relex();
65685587Sobrien			return (unary(np));
65785587Sobrien		}
65885587Sobrien		else
65985587Sobrien			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
66085587Sobrien	default:
66185587Sobrien		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
66285587Sobrien	}
66385587Sobrien	return 0;	/*NOTREACHED*/
66485587Sobrien}
66585587Sobrien
66685587SobrienNode *concat(Node *np)
66785587Sobrien{
66885587Sobrien	switch (rtok) {
66985587Sobrien	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
67085587Sobrien		return (concat(op2(CAT, np, primary())));
67185587Sobrien	}
67285587Sobrien	return (np);
67385587Sobrien}
67485587Sobrien
67585587SobrienNode *alt(Node *np)
67685587Sobrien{
67785587Sobrien	if (rtok == OR) {
67885587Sobrien		rtok = relex();
67985587Sobrien		return (alt(op2(OR, np, concat(primary()))));
68085587Sobrien	}
68185587Sobrien	return (np);
68285587Sobrien}
68385587Sobrien
68485587SobrienNode *unary(Node *np)
68585587Sobrien{
68685587Sobrien	switch (rtok) {
68785587Sobrien	case STAR:
68885587Sobrien		rtok = relex();
68985587Sobrien		return (unary(op2(STAR, np, NIL)));
69085587Sobrien	case PLUS:
69185587Sobrien		rtok = relex();
69285587Sobrien		return (unary(op2(PLUS, np, NIL)));
69385587Sobrien	case QUEST:
69485587Sobrien		rtok = relex();
69585587Sobrien		return (unary(op2(QUEST, np, NIL)));
69685587Sobrien	default:
69785587Sobrien		return (np);
69885587Sobrien	}
69985587Sobrien}
70085587Sobrien
70190902Sdes/*
70290902Sdes * Character class definitions conformant to the POSIX locale as
70390902Sdes * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
70490902Sdes * and operating character sets are both ASCII (ISO646) or supersets
70590902Sdes * thereof.
70690902Sdes *
70790902Sdes * Note that to avoid overflowing the temporary buffer used in
70890902Sdes * relex(), the expanded character class (prior to range expansion)
70990902Sdes * must be less than twice the size of their full name.
71090902Sdes */
711112336Sobrien
712112336Sobrien/* Because isblank doesn't show up in any of the header files on any
713112336Sobrien * system i use, it's defined here.  if some other locale has a richer
714112336Sobrien * definition of "blank", define HAS_ISBLANK and provide your own
715112336Sobrien * version.
716118194Sru * the parentheses here are an attempt to find a path through the maze
717118194Sru * of macro definition and/or function and/or version provided.  thanks
718118194Sru * to nelson beebe for the suggestion; let's see if it works everywhere.
719112336Sobrien */
720112336Sobrien
721112336Sobrien#ifndef HAS_ISBLANK
722112336Sobrien
723118194Sruint (isblank)(int c)
724112336Sobrien{
725112336Sobrien	return c==' ' || c=='\t';
726112336Sobrien}
727112336Sobrien
728112336Sobrien#endif
729112336Sobrien
73090902Sdesstruct charclass {
73190902Sdes	const char *cc_name;
73290902Sdes	int cc_namelen;
733112336Sobrien	int (*cc_func)(int);
73490902Sdes} charclasses[] = {
735112336Sobrien	{ "alnum",	5,	isalnum },
736112336Sobrien	{ "alpha",	5,	isalpha },
737112336Sobrien	{ "blank",	5,	isblank },
738112336Sobrien	{ "cntrl",	5,	iscntrl },
739112336Sobrien	{ "digit",	5,	isdigit },
740112336Sobrien	{ "graph",	5,	isgraph },
741112336Sobrien	{ "lower",	5,	islower },
742112336Sobrien	{ "print",	5,	isprint },
743112336Sobrien	{ "punct",	5,	ispunct },
744112336Sobrien	{ "space",	5,	isspace },
745112336Sobrien	{ "upper",	5,	isupper },
746112336Sobrien	{ "xdigit",	6,	isxdigit },
74790902Sdes	{ NULL,		0,	NULL },
74890902Sdes};
74990902Sdes
75090902Sdes
75185587Sobrienint relex(void)		/* lexical analyzer for reparse */
75285587Sobrien{
75385587Sobrien	int c, n;
75485587Sobrien	int cflag;
75585587Sobrien	static uschar *buf = 0;
75685587Sobrien	static int bufsz = 100;
75785587Sobrien	uschar *bp;
75890902Sdes	struct charclass *cc;
759112336Sobrien	int i;
76085587Sobrien
76185587Sobrien	switch (c = *prestr++) {
76285587Sobrien	case '|': return OR;
76385587Sobrien	case '*': return STAR;
76485587Sobrien	case '+': return PLUS;
76585587Sobrien	case '?': return QUEST;
76685587Sobrien	case '.': return DOT;
76785587Sobrien	case '\0': prestr--; return '\0';
76885587Sobrien	case '^':
76985587Sobrien	case '$':
77085587Sobrien	case '(':
77185587Sobrien	case ')':
77285587Sobrien		return c;
77385587Sobrien	case '\\':
77485587Sobrien		rlxval = quoted((char **) &prestr);
77585587Sobrien		return CHAR;
77685587Sobrien	default:
77785587Sobrien		rlxval = c;
77885587Sobrien		return CHAR;
77985587Sobrien	case '[':
78085587Sobrien		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
78185587Sobrien			FATAL("out of space in reg expr %.10s..", lastre);
78285587Sobrien		bp = buf;
78385587Sobrien		if (*prestr == '^') {
78485587Sobrien			cflag = 1;
78585587Sobrien			prestr++;
78685587Sobrien		}
78785587Sobrien		else
78885587Sobrien			cflag = 0;
78990902Sdes		n = 2 * strlen((const char *) prestr)+1;
79085587Sobrien		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
79185587Sobrien			FATAL("out of space for reg expr %.10s...", lastre);
79285587Sobrien		for (; ; ) {
79385587Sobrien			if ((c = *prestr++) == '\\') {
79485587Sobrien				*bp++ = '\\';
79585587Sobrien				if ((c = *prestr++) == '\0')
79685587Sobrien					FATAL("nonterminated character class %.20s...", lastre);
79785587Sobrien				*bp++ = c;
79885587Sobrien			/* } else if (c == '\n') { */
79985587Sobrien			/* 	FATAL("newline in character class %.20s...", lastre); */
80090902Sdes			} else if (c == '[' && *prestr == ':') {
80190902Sdes				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
80290902Sdes				for (cc = charclasses; cc->cc_name; cc++)
80390902Sdes					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
80490902Sdes						break;
80590902Sdes				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
80690902Sdes				    prestr[2 + cc->cc_namelen] == ']') {
80790902Sdes					prestr += cc->cc_namelen + 3;
808112336Sobrien					for (i = 0; i < NCHARS; i++) {
809112336Sobrien						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
810112336Sobrien						    FATAL("out of space for reg expr %.10s...", lastre);
811112336Sobrien						if (cc->cc_func(i)) {
812112336Sobrien							*bp++ = i;
813112336Sobrien							n++;
814112336Sobrien						}
815112336Sobrien					}
81690902Sdes				} else
81790902Sdes					*bp++ = c;
81885587Sobrien			} else if (c == '\0') {
81985587Sobrien				FATAL("nonterminated character class %.20s", lastre);
82085587Sobrien			} else if (bp == buf) {	/* 1st char is special */
82185587Sobrien				*bp++ = c;
82285587Sobrien			} else if (c == ']') {
82385587Sobrien				*bp++ = 0;
82485587Sobrien				rlxstr = (uschar *) tostring((char *) buf);
82585587Sobrien				if (cflag == 0)
82685587Sobrien					return CCL;
82785587Sobrien				else
82885587Sobrien					return NCCL;
82985587Sobrien			} else
83085587Sobrien				*bp++ = c;
83185587Sobrien		}
83285587Sobrien	}
83385587Sobrien}
83485587Sobrien
83585587Sobrienint cgoto(fa *f, int s, int c)
83685587Sobrien{
83785587Sobrien	int i, j, k;
83885587Sobrien	int *p, *q;
83985587Sobrien
840146299Sru	assert(c == HAT || c < NCHARS);
84185587Sobrien	while (f->accept >= maxsetvec) {	/* guessing here! */
84285587Sobrien		maxsetvec *= 4;
84385587Sobrien		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
84485587Sobrien		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
84585587Sobrien		if (setvec == 0 || tmpset == 0)
84685587Sobrien			overflo("out of space in cgoto()");
84785587Sobrien	}
84885587Sobrien	for (i = 0; i <= f->accept; i++)
84985587Sobrien		setvec[i] = 0;
85085587Sobrien	setcnt = 0;
85185587Sobrien	/* compute positions of gototab[s,c] into setvec */
85285587Sobrien	p = f->posns[s];
85385587Sobrien	for (i = 1; i <= *p; i++) {
85485587Sobrien		if ((k = f->re[p[i]].ltype) != FINAL) {
85585587Sobrien			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
85685587Sobrien			 || (k == DOT && c != 0 && c != HAT)
85785587Sobrien			 || (k == ALL && c != 0)
85885587Sobrien			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
85985587Sobrien			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
86085587Sobrien				q = f->re[p[i]].lfollow;
86185587Sobrien				for (j = 1; j <= *q; j++) {
86285587Sobrien					if (q[j] >= maxsetvec) {
86385587Sobrien						maxsetvec *= 4;
86485587Sobrien						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
86585587Sobrien						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
86685587Sobrien						if (setvec == 0 || tmpset == 0)
86785587Sobrien							overflo("cgoto overflow");
86885587Sobrien					}
86985587Sobrien					if (setvec[q[j]] == 0) {
87085587Sobrien						setcnt++;
87185587Sobrien						setvec[q[j]] = 1;
87285587Sobrien					}
87385587Sobrien				}
87485587Sobrien			}
87585587Sobrien		}
87685587Sobrien	}
87785587Sobrien	/* determine if setvec is a previous state */
87885587Sobrien	tmpset[0] = setcnt;
87985587Sobrien	j = 1;
88085587Sobrien	for (i = f->accept; i >= 0; i--)
88185587Sobrien		if (setvec[i]) {
88285587Sobrien			tmpset[j++] = i;
88385587Sobrien		}
88485587Sobrien	/* tmpset == previous state? */
88585587Sobrien	for (i = 1; i <= f->curstat; i++) {
88685587Sobrien		p = f->posns[i];
88785587Sobrien		if ((k = tmpset[0]) != p[0])
88885587Sobrien			goto different;
88985587Sobrien		for (j = 1; j <= k; j++)
89085587Sobrien			if (tmpset[j] != p[j])
89185587Sobrien				goto different;
89285587Sobrien		/* setvec is state i */
89385587Sobrien		f->gototab[s][c] = i;
89485587Sobrien		return i;
89585587Sobrien	  different:;
89685587Sobrien	}
89785587Sobrien
89885587Sobrien	/* add tmpset to current set of states */
89985587Sobrien	if (f->curstat >= NSTATES-1) {
90085587Sobrien		f->curstat = 2;
90185587Sobrien		f->reset = 1;
90285587Sobrien		for (i = 2; i < NSTATES; i++)
90385587Sobrien			xfree(f->posns[i]);
90485587Sobrien	} else
90585587Sobrien		++(f->curstat);
90685587Sobrien	for (i = 0; i < NCHARS; i++)
90785587Sobrien		f->gototab[f->curstat][i] = 0;
90885587Sobrien	xfree(f->posns[f->curstat]);
90985587Sobrien	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
91085587Sobrien		overflo("out of space in cgoto");
91185587Sobrien
91285587Sobrien	f->posns[f->curstat] = p;
91385587Sobrien	f->gototab[s][c] = f->curstat;
91485587Sobrien	for (i = 0; i <= setcnt; i++)
91585587Sobrien		p[i] = tmpset[i];
91685587Sobrien	if (setvec[f->accept])
91785587Sobrien		f->out[f->curstat] = 1;
91885587Sobrien	else
91985587Sobrien		f->out[f->curstat] = 0;
92085587Sobrien	return f->curstat;
92185587Sobrien}
92285587Sobrien
92385587Sobrien
92485587Sobrienvoid freefa(fa *f)	/* free a finite automaton */
92585587Sobrien{
92685587Sobrien	int i;
92785587Sobrien
92885587Sobrien	if (f == NULL)
92985587Sobrien		return;
93085587Sobrien	for (i = 0; i <= f->curstat; i++)
93185587Sobrien		xfree(f->posns[i]);
93285587Sobrien	for (i = 0; i <= f->accept; i++) {
93385587Sobrien		xfree(f->re[i].lfollow);
93485587Sobrien		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
93585587Sobrien			xfree((f->re[i].lval.np));
93685587Sobrien	}
93785587Sobrien	xfree(f->restr);
93885587Sobrien	xfree(f);
93985587Sobrien}
940