b.c revision 112336
185587Sobrien/****************************************************************
285587SobrienCopyright (C) Lucent Technologies 1997
385587SobrienAll Rights Reserved
485587Sobrien
585587SobrienPermission to use, copy, modify, and distribute this software and
685587Sobrienits documentation for any purpose and without fee is hereby
785587Sobriengranted, provided that the above copyright notice appear in all
885587Sobriencopies and that both that the copyright notice and this
985587Sobrienpermission notice and warranty disclaimer appear in supporting
1085587Sobriendocumentation, and that the name Lucent Technologies or any of
1185587Sobrienits entities not be used in advertising or publicity pertaining
1285587Sobriento distribution of the software without specific, written prior
1385587Sobrienpermission.
1485587Sobrien
1585587SobrienLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
1685587SobrienINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
1785587SobrienIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
1885587SobrienSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1985587SobrienWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
2085587SobrienIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
2185587SobrienARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
2285587SobrienTHIS SOFTWARE.
2385587Sobrien****************************************************************/
2485587Sobrien
2585587Sobrien/* lasciate ogne speranza, voi ch'entrate. */
2685587Sobrien
2785587Sobrien#define	DEBUG
2885587Sobrien
2985587Sobrien#include <ctype.h>
3085587Sobrien#include <stdio.h>
3185587Sobrien#include <string.h>
3285587Sobrien#include <stdlib.h>
3385587Sobrien#include "awk.h"
3485587Sobrien#include "ytab.h"
3585587Sobrien
3685587Sobrien#define	HAT	(NCHARS-2)	/* matches ^ in regular expr */
3785587Sobrien				/* NCHARS is 2**n */
3885587Sobrien#define MAXLIN 22
3985587Sobrien
4085587Sobrien#define type(v)		(v)->nobj	/* badly overloaded here */
4185587Sobrien#define info(v)		(v)->ntype	/* badly overloaded here */
4285587Sobrien#define left(v)		(v)->narg[0]
4385587Sobrien#define right(v)	(v)->narg[1]
4485587Sobrien#define parent(v)	(v)->nnext
4585587Sobrien
4685587Sobrien#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
4785587Sobrien#define UNARY	case STAR: case PLUS: case QUEST:
4885587Sobrien
4985587Sobrien/* encoding in tree Nodes:
5085587Sobrien	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
5185587Sobrien		left is index, right contains value or pointer to value
5285587Sobrien	unary (STAR, PLUS, QUEST): left is child, right is null
5385587Sobrien	binary (CAT, OR): left and right are children
5485587Sobrien	parent contains pointer to parent
5585587Sobrien*/
5685587Sobrien
5785587Sobrien
5885587Sobrienint	*setvec;
5985587Sobrienint	*tmpset;
6085587Sobrienint	maxsetvec = 0;
6185587Sobrien
6285587Sobrienint	rtok;		/* next token in current re */
6385587Sobrienint	rlxval;
6485587Sobrienstatic uschar	*rlxstr;
6585587Sobrienstatic uschar	*prestr;	/* current position in current re */
6685587Sobrienstatic uschar	*lastre;	/* origin of last re */
6785587Sobrien
6885587Sobrienstatic	int setcnt;
6985587Sobrienstatic	int poscnt;
7085587Sobrien
7185587Sobrienchar	*patbeg;
7285587Sobrienint	patlen;
7385587Sobrien
7485587Sobrien#define	NFA	20	/* cache this many dynamic fa's */
7585587Sobrienfa	*fatab[NFA];
7685587Sobrienint	nfatab	= 0;	/* entries in fatab */
7785587Sobrien
78107806Sobrienfa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
7985587Sobrien{
8085587Sobrien	int i, use, nuse;
8185587Sobrien	fa *pfa;
8285587Sobrien	static int now = 1;
8385587Sobrien
8485587Sobrien	if (setvec == 0) {	/* first time through any RE */
8585587Sobrien		maxsetvec = MAXLIN;
8685587Sobrien		setvec = (int *) malloc(maxsetvec * sizeof(int));
8785587Sobrien		tmpset = (int *) malloc(maxsetvec * sizeof(int));
8885587Sobrien		if (setvec == 0 || tmpset == 0)
8985587Sobrien			overflo("out of space initializing makedfa");
9085587Sobrien	}
9185587Sobrien
9285587Sobrien	if (compile_time)	/* a constant for sure */
9385587Sobrien		return mkdfa(s, anchor);
9485587Sobrien	for (i = 0; i < nfatab; i++)	/* is it there already? */
9585587Sobrien		if (fatab[i]->anchor == anchor
9690902Sdes		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
9785587Sobrien			fatab[i]->use = now++;
9885587Sobrien			return fatab[i];
9985587Sobrien		}
10085587Sobrien	pfa = mkdfa(s, anchor);
10185587Sobrien	if (nfatab < NFA) {	/* room for another */
10285587Sobrien		fatab[nfatab] = pfa;
10385587Sobrien		fatab[nfatab]->use = now++;
10485587Sobrien		nfatab++;
10585587Sobrien		return pfa;
10685587Sobrien	}
10785587Sobrien	use = fatab[0]->use;	/* replace least-recently used */
10885587Sobrien	nuse = 0;
10985587Sobrien	for (i = 1; i < nfatab; i++)
11085587Sobrien		if (fatab[i]->use < use) {
11185587Sobrien			use = fatab[i]->use;
11285587Sobrien			nuse = i;
11385587Sobrien		}
11485587Sobrien	freefa(fatab[nuse]);
11585587Sobrien	fatab[nuse] = pfa;
11685587Sobrien	pfa->use = now++;
11785587Sobrien	return pfa;
11885587Sobrien}
11985587Sobrien
120107806Sobrienfa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
12185587Sobrien				/* anchor = 1 for anchored matches, else 0 */
12285587Sobrien{
12385587Sobrien	Node *p, *p1;
12485587Sobrien	fa *f;
12585587Sobrien
12685587Sobrien	p = reparse(s);
12785587Sobrien	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
12885587Sobrien		/* put ALL STAR in front of reg.  exp. */
12985587Sobrien	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
13085587Sobrien		/* put FINAL after reg.  exp. */
13185587Sobrien
13285587Sobrien	poscnt = 0;
13385587Sobrien	penter(p1);	/* enter parent pointers and leaf indices */
13485587Sobrien	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
13585587Sobrien		overflo("out of space for fa");
13685587Sobrien	f->accept = poscnt-1;	/* penter has computed number of positions in re */
13785587Sobrien	cfoll(f, p1);	/* set up follow sets */
13885587Sobrien	freetr(p1);
13985587Sobrien	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
14085587Sobrien			overflo("out of space in makedfa");
14185587Sobrien	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
14285587Sobrien		overflo("out of space in makedfa");
14385587Sobrien	*f->posns[1] = 0;
14485587Sobrien	f->initstat = makeinit(f, anchor);
14585587Sobrien	f->anchor = anchor;
14685587Sobrien	f->restr = (uschar *) tostring(s);
14785587Sobrien	return f;
14885587Sobrien}
14985587Sobrien
15085587Sobrienint makeinit(fa *f, int anchor)
15185587Sobrien{
15285587Sobrien	int i, k;
15385587Sobrien
15485587Sobrien	f->curstat = 2;
15585587Sobrien	f->out[2] = 0;
15685587Sobrien	f->reset = 0;
15785587Sobrien	k = *(f->re[0].lfollow);
15885587Sobrien	xfree(f->posns[2]);
15985587Sobrien	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
16085587Sobrien		overflo("out of space in makeinit");
16185587Sobrien	for (i=0; i <= k; i++) {
16285587Sobrien		(f->posns[2])[i] = (f->re[0].lfollow)[i];
16385587Sobrien	}
16485587Sobrien	if ((f->posns[2])[1] == f->accept)
16585587Sobrien		f->out[2] = 1;
16685587Sobrien	for (i=0; i < NCHARS; i++)
16785587Sobrien		f->gototab[2][i] = 0;
16885587Sobrien	f->curstat = cgoto(f, 2, HAT);
16985587Sobrien	if (anchor) {
17085587Sobrien		*f->posns[2] = k-1;	/* leave out position 0 */
17185587Sobrien		for (i=0; i < k; i++) {
17285587Sobrien			(f->posns[0])[i] = (f->posns[2])[i];
17385587Sobrien		}
17485587Sobrien
17585587Sobrien		f->out[0] = f->out[2];
17685587Sobrien		if (f->curstat != 2)
17785587Sobrien			--(*f->posns[f->curstat]);
17885587Sobrien	}
17985587Sobrien	return f->curstat;
18085587Sobrien}
18185587Sobrien
18285587Sobrienvoid penter(Node *p)	/* set up parent pointers and leaf indices */
18385587Sobrien{
18485587Sobrien	switch (type(p)) {
18585587Sobrien	LEAF
18685587Sobrien		info(p) = poscnt;
18785587Sobrien		poscnt++;
18885587Sobrien		break;
18985587Sobrien	UNARY
19085587Sobrien		penter(left(p));
19185587Sobrien		parent(left(p)) = p;
19285587Sobrien		break;
19385587Sobrien	case CAT:
19485587Sobrien	case OR:
19585587Sobrien		penter(left(p));
19685587Sobrien		penter(right(p));
19785587Sobrien		parent(left(p)) = p;
19885587Sobrien		parent(right(p)) = p;
19985587Sobrien		break;
20085587Sobrien	default:	/* can't happen */
20185587Sobrien		FATAL("can't happen: unknown type %d in penter", type(p));
20285587Sobrien		break;
20385587Sobrien	}
20485587Sobrien}
20585587Sobrien
20685587Sobrienvoid freetr(Node *p)	/* free parse tree */
20785587Sobrien{
20885587Sobrien	switch (type(p)) {
20985587Sobrien	LEAF
21085587Sobrien		xfree(p);
21185587Sobrien		break;
21285587Sobrien	UNARY
21385587Sobrien		freetr(left(p));
21485587Sobrien		xfree(p);
21585587Sobrien		break;
21685587Sobrien	case CAT:
21785587Sobrien	case OR:
21885587Sobrien		freetr(left(p));
21985587Sobrien		freetr(right(p));
22085587Sobrien		xfree(p);
22185587Sobrien		break;
22285587Sobrien	default:	/* can't happen */
22385587Sobrien		FATAL("can't happen: unknown type %d in freetr", type(p));
22485587Sobrien		break;
22585587Sobrien	}
22685587Sobrien}
22785587Sobrien
22885587Sobrien/* in the parsing of regular expressions, metacharacters like . have */
22985587Sobrien/* to be seen literally;  \056 is not a metacharacter. */
23085587Sobrien
23185587Sobrienint hexstr(char **pp)	/* find and eval hex string at pp, return new p */
23285587Sobrien{			/* only pick up one 8-bit byte (2 chars) */
23385587Sobrien	uschar *p;
23485587Sobrien	int n = 0;
23585587Sobrien	int i;
23685587Sobrien
23785587Sobrien	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
23885587Sobrien		if (isdigit(*p))
23985587Sobrien			n = 16 * n + *p - '0';
24085587Sobrien		else if (*p >= 'a' && *p <= 'f')
24185587Sobrien			n = 16 * n + *p - 'a' + 10;
24285587Sobrien		else if (*p >= 'A' && *p <= 'F')
24385587Sobrien			n = 16 * n + *p - 'A' + 10;
24485587Sobrien	}
24585587Sobrien	*pp = (char *) p;
24685587Sobrien	return n;
24785587Sobrien}
24885587Sobrien
24985587Sobrien#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
25085587Sobrien
25185587Sobrienint quoted(char **pp)	/* pick up next thing after a \\ */
25285587Sobrien			/* and increment *pp */
25385587Sobrien{
25485587Sobrien	char *p = *pp;
25585587Sobrien	int c;
25685587Sobrien
25785587Sobrien	if ((c = *p++) == 't')
25885587Sobrien		c = '\t';
25985587Sobrien	else if (c == 'n')
26085587Sobrien		c = '\n';
26185587Sobrien	else if (c == 'f')
26285587Sobrien		c = '\f';
26385587Sobrien	else if (c == 'r')
26485587Sobrien		c = '\r';
26585587Sobrien	else if (c == 'b')
26685587Sobrien		c = '\b';
26785587Sobrien	else if (c == '\\')
26885587Sobrien		c = '\\';
26985587Sobrien	else if (c == 'x') {	/* hexadecimal goo follows */
27085587Sobrien		c = hexstr(&p);	/* this adds a null if number is invalid */
27185587Sobrien	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
27285587Sobrien		int n = c - '0';
27385587Sobrien		if (isoctdigit(*p)) {
27485587Sobrien			n = 8 * n + *p++ - '0';
27585587Sobrien			if (isoctdigit(*p))
27685587Sobrien				n = 8 * n + *p++ - '0';
27785587Sobrien		}
27885587Sobrien		c = n;
27985587Sobrien	} /* else */
28085587Sobrien		/* c = c; */
28185587Sobrien	*pp = p;
28285587Sobrien	return c;
28385587Sobrien}
28485587Sobrien
285112336Sobrienstatic int collate_range_cmp(int a, int b)
286112336Sobrien{
287112336Sobrien	int r;
288112336Sobrien	static char s[2][2];
289112336Sobrien
290112336Sobrien	if ((uschar)a == (uschar)b)
291112336Sobrien		return 0;
292112336Sobrien	s[0][0] = a;
293112336Sobrien	s[1][0] = b;
294112336Sobrien	if ((r = strcoll(s[0], s[1])) == 0)
295112336Sobrien		r = (uschar)a - (uschar)b;
296112336Sobrien	return r;
297112336Sobrien}
298112336Sobrien
299107806Sobrienchar *cclenter(const char *argp)	/* add a character class */
300107806Sobrien{
30185587Sobrien	int i, c, c2;
302112336Sobrien	int j;
30385587Sobrien	uschar *p = (uschar *) argp;
30485587Sobrien	uschar *op, *bp;
30585587Sobrien	static uschar *buf = 0;
30685587Sobrien	static int bufsz = 100;
30785587Sobrien
30885587Sobrien	op = p;
30985587Sobrien	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
31085587Sobrien		FATAL("out of space for character class [%.10s...] 1", p);
31185587Sobrien	bp = buf;
31285587Sobrien	for (i = 0; (c = *p++) != 0; ) {
31385587Sobrien		if (c == '\\') {
31485587Sobrien			c = quoted((char **) &p);
31585587Sobrien		} else if (c == '-' && i > 0 && bp[-1] != 0) {
31685587Sobrien			if (*p != 0) {
31785587Sobrien				c = bp[-1];
31885587Sobrien				c2 = *p++;
31985587Sobrien				if (c2 == '\\')
32085587Sobrien					c2 = quoted((char **) &p);
321112336Sobrien				if (collate_range_cmp(c, c2) > 0) {	/* empty; ignore */
32285587Sobrien					bp--;
32385587Sobrien					i--;
32485587Sobrien					continue;
32585587Sobrien				}
326112336Sobrien				for (j = 0; j < NCHARS; j++) {
327112336Sobrien					if ((collate_range_cmp(c, j) > 0) ||
328112336Sobrien					    collate_range_cmp(j, c2) > 0)
329112336Sobrien						continue;
33085587Sobrien					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
33185587Sobrien						FATAL("out of space for character class [%.10s...] 2", p);
332112336Sobrien					*bp++ = j;
33385587Sobrien					i++;
33485587Sobrien				}
33585587Sobrien				continue;
33685587Sobrien			}
33785587Sobrien		}
33885587Sobrien		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
33985587Sobrien			FATAL("out of space for character class [%.10s...] 3", p);
34085587Sobrien		*bp++ = c;
34185587Sobrien		i++;
34285587Sobrien	}
34385587Sobrien	*bp = 0;
34485587Sobrien	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
34585587Sobrien	xfree(op);
34685587Sobrien	return (char *) tostring((char *) buf);
34785587Sobrien}
34885587Sobrien
349107806Sobrienvoid overflo(const char *s)
35085587Sobrien{
35185587Sobrien	FATAL("regular expression too big: %.30s...", s);
35285587Sobrien}
35385587Sobrien
35485587Sobrienvoid cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
35585587Sobrien{
35685587Sobrien	int i;
35785587Sobrien	int *p;
35885587Sobrien
35985587Sobrien	switch (type(v)) {
36085587Sobrien	LEAF
36185587Sobrien		f->re[info(v)].ltype = type(v);
36285587Sobrien		f->re[info(v)].lval.np = right(v);
36385587Sobrien		while (f->accept >= maxsetvec) {	/* guessing here! */
36485587Sobrien			maxsetvec *= 4;
36585587Sobrien			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
36685587Sobrien			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
36785587Sobrien			if (setvec == 0 || tmpset == 0)
36885587Sobrien				overflo("out of space in cfoll()");
36985587Sobrien		}
37085587Sobrien		for (i = 0; i <= f->accept; i++)
37185587Sobrien			setvec[i] = 0;
37285587Sobrien		setcnt = 0;
37385587Sobrien		follow(v);	/* computes setvec and setcnt */
37485587Sobrien		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
37585587Sobrien			overflo("out of space building follow set");
37685587Sobrien		f->re[info(v)].lfollow = p;
37785587Sobrien		*p = setcnt;
37885587Sobrien		for (i = f->accept; i >= 0; i--)
37985587Sobrien			if (setvec[i] == 1)
38085587Sobrien				*++p = i;
38185587Sobrien		break;
38285587Sobrien	UNARY
38385587Sobrien		cfoll(f,left(v));
38485587Sobrien		break;
38585587Sobrien	case CAT:
38685587Sobrien	case OR:
38785587Sobrien		cfoll(f,left(v));
38885587Sobrien		cfoll(f,right(v));
38985587Sobrien		break;
39085587Sobrien	default:	/* can't happen */
39185587Sobrien		FATAL("can't happen: unknown type %d in cfoll", type(v));
39285587Sobrien	}
39385587Sobrien}
39485587Sobrien
39585587Sobrienint first(Node *p)	/* collects initially active leaves of p into setvec */
39685587Sobrien			/* returns 1 if p matches empty string */
39785587Sobrien{
39885587Sobrien	int b, lp;
39985587Sobrien
40085587Sobrien	switch (type(p)) {
40185587Sobrien	LEAF
40285587Sobrien		lp = info(p);	/* look for high-water mark of subscripts */
40385587Sobrien		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
40485587Sobrien			maxsetvec *= 4;
40585587Sobrien			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
40685587Sobrien			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
40785587Sobrien			if (setvec == 0 || tmpset == 0)
40885587Sobrien				overflo("out of space in first()");
40985587Sobrien		}
41085587Sobrien		if (setvec[lp] != 1) {
41185587Sobrien			setvec[lp] = 1;
41285587Sobrien			setcnt++;
41385587Sobrien		}
41485587Sobrien		if (type(p) == CCL && (*(char *) right(p)) == '\0')
41585587Sobrien			return(0);		/* empty CCL */
41685587Sobrien		else return(1);
41785587Sobrien	case PLUS:
41885587Sobrien		if (first(left(p)) == 0) return(0);
41985587Sobrien		return(1);
42085587Sobrien	case STAR:
42185587Sobrien	case QUEST:
42285587Sobrien		first(left(p));
42385587Sobrien		return(0);
42485587Sobrien	case CAT:
42585587Sobrien		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
42685587Sobrien		return(1);
42785587Sobrien	case OR:
42885587Sobrien		b = first(right(p));
42985587Sobrien		if (first(left(p)) == 0 || b == 0) return(0);
43085587Sobrien		return(1);
43185587Sobrien	}
43285587Sobrien	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
43385587Sobrien	return(-1);
43485587Sobrien}
43585587Sobrien
43685587Sobrienvoid follow(Node *v)	/* collects leaves that can follow v into setvec */
43785587Sobrien{
43885587Sobrien	Node *p;
43985587Sobrien
44085587Sobrien	if (type(v) == FINAL)
44185587Sobrien		return;
44285587Sobrien	p = parent(v);
44385587Sobrien	switch (type(p)) {
44485587Sobrien	case STAR:
44585587Sobrien	case PLUS:
44685587Sobrien		first(v);
44785587Sobrien		follow(p);
44885587Sobrien		return;
44985587Sobrien
45085587Sobrien	case OR:
45185587Sobrien	case QUEST:
45285587Sobrien		follow(p);
45385587Sobrien		return;
45485587Sobrien
45585587Sobrien	case CAT:
45685587Sobrien		if (v == left(p)) {	/* v is left child of p */
45785587Sobrien			if (first(right(p)) == 0) {
45885587Sobrien				follow(p);
45985587Sobrien				return;
46085587Sobrien			}
46185587Sobrien		} else		/* v is right child */
46285587Sobrien			follow(p);
46385587Sobrien		return;
46485587Sobrien	}
46585587Sobrien}
46685587Sobrien
467107806Sobrienint member(int c, const char *sarg)	/* is c in s? */
46885587Sobrien{
46985587Sobrien	uschar *s = (uschar *) sarg;
47085587Sobrien
47185587Sobrien	while (*s)
47285587Sobrien		if (c == *s++)
47385587Sobrien			return(1);
47485587Sobrien	return(0);
47585587Sobrien}
47685587Sobrien
477107806Sobrienint match(fa *f, const char *p0)	/* shortest match ? */
47885587Sobrien{
47985587Sobrien	int s, ns;
48085587Sobrien	uschar *p = (uschar *) p0;
48185587Sobrien
48285587Sobrien	s = f->reset ? makeinit(f,0) : f->initstat;
48385587Sobrien	if (f->out[s])
48485587Sobrien		return(1);
48585587Sobrien	do {
48685587Sobrien		if ((ns = f->gototab[s][*p]) != 0)
48785587Sobrien			s = ns;
48885587Sobrien		else
48985587Sobrien			s = cgoto(f, s, *p);
49085587Sobrien		if (f->out[s])
49185587Sobrien			return(1);
49285587Sobrien	} while (*p++ != 0);
49385587Sobrien	return(0);
49485587Sobrien}
49585587Sobrien
496107806Sobrienint pmatch(fa *f, const char *p0)	/* longest match, for sub */
49785587Sobrien{
49885587Sobrien	int s, ns;
49985587Sobrien	uschar *p = (uschar *) p0;
50085587Sobrien	uschar *q;
50185587Sobrien	int i, k;
50285587Sobrien
50385587Sobrien	s = f->reset ? makeinit(f,1) : f->initstat;
50485587Sobrien	patbeg = (char *) p;
50585587Sobrien	patlen = -1;
50685587Sobrien	do {
50785587Sobrien		q = p;
50885587Sobrien		do {
50985587Sobrien			if (f->out[s])		/* final state */
51085587Sobrien				patlen = q-p;
51185587Sobrien			if ((ns = f->gototab[s][*q]) != 0)
51285587Sobrien				s = ns;
51385587Sobrien			else
51485587Sobrien				s = cgoto(f, s, *q);
51585587Sobrien			if (s == 1) {	/* no transition */
51685587Sobrien				if (patlen >= 0) {
51785587Sobrien					patbeg = (char *) p;
51885587Sobrien					return(1);
51985587Sobrien				}
52085587Sobrien				else
52185587Sobrien					goto nextin;	/* no match */
52285587Sobrien			}
52385587Sobrien		} while (*q++ != 0);
52485587Sobrien		if (f->out[s])
52585587Sobrien			patlen = q-p-1;	/* don't count $ */
52685587Sobrien		if (patlen >= 0) {
52785587Sobrien			patbeg = (char *) p;
52885587Sobrien			return(1);
52985587Sobrien		}
53085587Sobrien	nextin:
53185587Sobrien		s = 2;
53285587Sobrien		if (f->reset) {
53385587Sobrien			for (i = 2; i <= f->curstat; i++)
53485587Sobrien				xfree(f->posns[i]);
53585587Sobrien			k = *f->posns[0];
53685587Sobrien			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
53785587Sobrien				overflo("out of space in pmatch");
53885587Sobrien			for (i = 0; i <= k; i++)
53985587Sobrien				(f->posns[2])[i] = (f->posns[0])[i];
54085587Sobrien			f->initstat = f->curstat = 2;
54185587Sobrien			f->out[2] = f->out[0];
54285587Sobrien			for (i = 0; i < NCHARS; i++)
54385587Sobrien				f->gototab[2][i] = 0;
54485587Sobrien		}
54585587Sobrien	} while (*p++ != 0);
54685587Sobrien	return (0);
54785587Sobrien}
54885587Sobrien
549107806Sobrienint nematch(fa *f, const char *p0)	/* non-empty match, for sub */
55085587Sobrien{
55185587Sobrien	int s, ns;
55285587Sobrien	uschar *p = (uschar *) p0;
55385587Sobrien	uschar *q;
55485587Sobrien	int i, k;
55585587Sobrien
55685587Sobrien	s = f->reset ? makeinit(f,1) : f->initstat;
55785587Sobrien	patlen = -1;
55885587Sobrien	while (*p) {
55985587Sobrien		q = p;
56085587Sobrien		do {
56185587Sobrien			if (f->out[s])		/* final state */
56285587Sobrien				patlen = q-p;
56385587Sobrien			if ((ns = f->gototab[s][*q]) != 0)
56485587Sobrien				s = ns;
56585587Sobrien			else
56685587Sobrien				s = cgoto(f, s, *q);
56785587Sobrien			if (s == 1) {	/* no transition */
56885587Sobrien				if (patlen > 0) {
56985587Sobrien					patbeg = (char *) p;
57085587Sobrien					return(1);
57185587Sobrien				} else
57285587Sobrien					goto nnextin;	/* no nonempty match */
57385587Sobrien			}
57485587Sobrien		} while (*q++ != 0);
57585587Sobrien		if (f->out[s])
57685587Sobrien			patlen = q-p-1;	/* don't count $ */
57785587Sobrien		if (patlen > 0 ) {
57885587Sobrien			patbeg = (char *) p;
57985587Sobrien			return(1);
58085587Sobrien		}
58185587Sobrien	nnextin:
58285587Sobrien		s = 2;
58385587Sobrien		if (f->reset) {
58485587Sobrien			for (i = 2; i <= f->curstat; i++)
58585587Sobrien				xfree(f->posns[i]);
58685587Sobrien			k = *f->posns[0];
58785587Sobrien			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
58885587Sobrien				overflo("out of state space");
58985587Sobrien			for (i = 0; i <= k; i++)
59085587Sobrien				(f->posns[2])[i] = (f->posns[0])[i];
59185587Sobrien			f->initstat = f->curstat = 2;
59285587Sobrien			f->out[2] = f->out[0];
59385587Sobrien			for (i = 0; i < NCHARS; i++)
59485587Sobrien				f->gototab[2][i] = 0;
59585587Sobrien		}
59685587Sobrien		p++;
59785587Sobrien	}
59885587Sobrien	return (0);
59985587Sobrien}
60085587Sobrien
601107806SobrienNode *reparse(const char *p)	/* parses regular expression pointed to by p */
60285587Sobrien{			/* uses relex() to scan regular expression */
60385587Sobrien	Node *np;
60485587Sobrien
60585587Sobrien	dprintf( ("reparse <%s>\n", p) );
60685587Sobrien	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
60785587Sobrien	rtok = relex();
608107806Sobrien	/* GNU compatibility: an empty regexp matches anything */
60985587Sobrien	if (rtok == '\0')
610107806Sobrien		/* FATAL("empty regular expression"); previous */
611107806Sobrien		return(op2(ALL, NIL, NIL));
61285587Sobrien	np = regexp();
61385587Sobrien	if (rtok != '\0')
61485587Sobrien		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
61585587Sobrien	return(np);
61685587Sobrien}
61785587Sobrien
61885587SobrienNode *regexp(void)	/* top-level parse of reg expr */
61985587Sobrien{
62085587Sobrien	return (alt(concat(primary())));
62185587Sobrien}
62285587Sobrien
62385587SobrienNode *primary(void)
62485587Sobrien{
62585587Sobrien	Node *np;
62685587Sobrien
62785587Sobrien	switch (rtok) {
62885587Sobrien	case CHAR:
62985587Sobrien		np = op2(CHAR, NIL, itonp(rlxval));
63085587Sobrien		rtok = relex();
63185587Sobrien		return (unary(np));
63285587Sobrien	case ALL:
63385587Sobrien		rtok = relex();
63485587Sobrien		return (unary(op2(ALL, NIL, NIL)));
63585587Sobrien	case DOT:
63685587Sobrien		rtok = relex();
63785587Sobrien		return (unary(op2(DOT, NIL, NIL)));
63885587Sobrien	case CCL:
63985587Sobrien		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
64085587Sobrien		rtok = relex();
64185587Sobrien		return (unary(np));
64285587Sobrien	case NCCL:
64385587Sobrien		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
64485587Sobrien		rtok = relex();
64585587Sobrien		return (unary(np));
64685587Sobrien	case '^':
64785587Sobrien		rtok = relex();
64885587Sobrien		return (unary(op2(CHAR, NIL, itonp(HAT))));
64985587Sobrien	case '$':
65085587Sobrien		rtok = relex();
65185587Sobrien		return (unary(op2(CHAR, NIL, NIL)));
65285587Sobrien	case '(':
65385587Sobrien		rtok = relex();
65485587Sobrien		if (rtok == ')') {	/* special pleading for () */
65585587Sobrien			rtok = relex();
65685587Sobrien			return unary(op2(CCL, NIL, (Node *) tostring("")));
65785587Sobrien		}
65885587Sobrien		np = regexp();
65985587Sobrien		if (rtok == ')') {
66085587Sobrien			rtok = relex();
66185587Sobrien			return (unary(np));
66285587Sobrien		}
66385587Sobrien		else
66485587Sobrien			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
66585587Sobrien	default:
66685587Sobrien		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
66785587Sobrien	}
66885587Sobrien	return 0;	/*NOTREACHED*/
66985587Sobrien}
67085587Sobrien
67185587SobrienNode *concat(Node *np)
67285587Sobrien{
67385587Sobrien	switch (rtok) {
67485587Sobrien	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
67585587Sobrien		return (concat(op2(CAT, np, primary())));
67685587Sobrien	}
67785587Sobrien	return (np);
67885587Sobrien}
67985587Sobrien
68085587SobrienNode *alt(Node *np)
68185587Sobrien{
68285587Sobrien	if (rtok == OR) {
68385587Sobrien		rtok = relex();
68485587Sobrien		return (alt(op2(OR, np, concat(primary()))));
68585587Sobrien	}
68685587Sobrien	return (np);
68785587Sobrien}
68885587Sobrien
68985587SobrienNode *unary(Node *np)
69085587Sobrien{
69185587Sobrien	switch (rtok) {
69285587Sobrien	case STAR:
69385587Sobrien		rtok = relex();
69485587Sobrien		return (unary(op2(STAR, np, NIL)));
69585587Sobrien	case PLUS:
69685587Sobrien		rtok = relex();
69785587Sobrien		return (unary(op2(PLUS, np, NIL)));
69885587Sobrien	case QUEST:
69985587Sobrien		rtok = relex();
70085587Sobrien		return (unary(op2(QUEST, np, NIL)));
70185587Sobrien	default:
70285587Sobrien		return (np);
70385587Sobrien	}
70485587Sobrien}
70585587Sobrien
70690902Sdes/*
70790902Sdes * Character class definitions conformant to the POSIX locale as
70890902Sdes * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
70990902Sdes * and operating character sets are both ASCII (ISO646) or supersets
71090902Sdes * thereof.
71190902Sdes *
71290902Sdes * Note that to avoid overflowing the temporary buffer used in
71390902Sdes * relex(), the expanded character class (prior to range expansion)
71490902Sdes * must be less than twice the size of their full name.
71590902Sdes */
716112336Sobrien
717112336Sobrien/* Because isblank doesn't show up in any of the header files on any
718112336Sobrien * system i use, it's defined here.  if some other locale has a richer
719112336Sobrien * definition of "blank", define HAS_ISBLANK and provide your own
720112336Sobrien * version.
721112336Sobrien */
722112336Sobrien
723112336Sobrien#ifndef HAS_ISBLANK
724112336Sobrien
725112336Sobrienint isblank(int c)
726112336Sobrien{
727112336Sobrien	return c==' ' || c=='\t';
728112336Sobrien}
729112336Sobrien
730112336Sobrien#endif
731112336Sobrien
73290902Sdesstruct charclass {
73390902Sdes	const char *cc_name;
73490902Sdes	int cc_namelen;
735112336Sobrien	int (*cc_func)(int);
73690902Sdes} charclasses[] = {
737112336Sobrien	{ "alnum",	5,	isalnum },
738112336Sobrien	{ "alpha",	5,	isalpha },
739112336Sobrien	{ "blank",	5,	isblank },
740112336Sobrien	{ "cntrl",	5,	iscntrl },
741112336Sobrien	{ "digit",	5,	isdigit },
742112336Sobrien	{ "graph",	5,	isgraph },
743112336Sobrien	{ "lower",	5,	islower },
744112336Sobrien	{ "print",	5,	isprint },
745112336Sobrien	{ "punct",	5,	ispunct },
746112336Sobrien	{ "space",	5,	isspace },
747112336Sobrien	{ "upper",	5,	isupper },
748112336Sobrien	{ "xdigit",	6,	isxdigit },
74990902Sdes	{ NULL,		0,	NULL },
75090902Sdes};
75190902Sdes
75290902Sdes
75385587Sobrienint relex(void)		/* lexical analyzer for reparse */
75485587Sobrien{
75585587Sobrien	int c, n;
75685587Sobrien	int cflag;
75785587Sobrien	static uschar *buf = 0;
75885587Sobrien	static int bufsz = 100;
75985587Sobrien	uschar *bp;
76090902Sdes	struct charclass *cc;
761112336Sobrien	int i;
76285587Sobrien
76385587Sobrien	switch (c = *prestr++) {
76485587Sobrien	case '|': return OR;
76585587Sobrien	case '*': return STAR;
76685587Sobrien	case '+': return PLUS;
76785587Sobrien	case '?': return QUEST;
76885587Sobrien	case '.': return DOT;
76985587Sobrien	case '\0': prestr--; return '\0';
77085587Sobrien	case '^':
77185587Sobrien	case '$':
77285587Sobrien	case '(':
77385587Sobrien	case ')':
77485587Sobrien		return c;
77585587Sobrien	case '\\':
77685587Sobrien		rlxval = quoted((char **) &prestr);
77785587Sobrien		return CHAR;
77885587Sobrien	default:
77985587Sobrien		rlxval = c;
78085587Sobrien		return CHAR;
78185587Sobrien	case '[':
78285587Sobrien		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
78385587Sobrien			FATAL("out of space in reg expr %.10s..", lastre);
78485587Sobrien		bp = buf;
78585587Sobrien		if (*prestr == '^') {
78685587Sobrien			cflag = 1;
78785587Sobrien			prestr++;
78885587Sobrien		}
78985587Sobrien		else
79085587Sobrien			cflag = 0;
79190902Sdes		n = 2 * strlen((const char *) prestr)+1;
79285587Sobrien		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
79385587Sobrien			FATAL("out of space for reg expr %.10s...", lastre);
79485587Sobrien		for (; ; ) {
79585587Sobrien			if ((c = *prestr++) == '\\') {
79685587Sobrien				*bp++ = '\\';
79785587Sobrien				if ((c = *prestr++) == '\0')
79885587Sobrien					FATAL("nonterminated character class %.20s...", lastre);
79985587Sobrien				*bp++ = c;
80085587Sobrien			/* } else if (c == '\n') { */
80185587Sobrien			/* 	FATAL("newline in character class %.20s...", lastre); */
80290902Sdes			} else if (c == '[' && *prestr == ':') {
80390902Sdes				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
80490902Sdes				for (cc = charclasses; cc->cc_name; cc++)
80590902Sdes					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
80690902Sdes						break;
80790902Sdes				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
80890902Sdes				    prestr[2 + cc->cc_namelen] == ']') {
80990902Sdes					prestr += cc->cc_namelen + 3;
810112336Sobrien					for (i = 0; i < NCHARS; i++) {
811112336Sobrien						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
812112336Sobrien						    FATAL("out of space for reg expr %.10s...", lastre);
813112336Sobrien						if (cc->cc_func(i)) {
814112336Sobrien							*bp++ = i;
815112336Sobrien							n++;
816112336Sobrien						}
817112336Sobrien					}
81890902Sdes				} else
81990902Sdes					*bp++ = c;
82085587Sobrien			} else if (c == '\0') {
82185587Sobrien				FATAL("nonterminated character class %.20s", lastre);
82285587Sobrien			} else if (bp == buf) {	/* 1st char is special */
82385587Sobrien				*bp++ = c;
82485587Sobrien			} else if (c == ']') {
82585587Sobrien				*bp++ = 0;
82685587Sobrien				rlxstr = (uschar *) tostring((char *) buf);
82785587Sobrien				if (cflag == 0)
82885587Sobrien					return CCL;
82985587Sobrien				else
83085587Sobrien					return NCCL;
83185587Sobrien			} else
83285587Sobrien				*bp++ = c;
83385587Sobrien		}
83485587Sobrien	}
83585587Sobrien}
83685587Sobrien
83785587Sobrienint cgoto(fa *f, int s, int c)
83885587Sobrien{
83985587Sobrien	int i, j, k;
84085587Sobrien	int *p, *q;
84185587Sobrien
84285587Sobrien	if (c < 0 || c > 255)
84385587Sobrien		FATAL("can't happen: neg char %d in cgoto", c);
84485587Sobrien	while (f->accept >= maxsetvec) {	/* guessing here! */
84585587Sobrien		maxsetvec *= 4;
84685587Sobrien		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
84785587Sobrien		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
84885587Sobrien		if (setvec == 0 || tmpset == 0)
84985587Sobrien			overflo("out of space in cgoto()");
85085587Sobrien	}
85185587Sobrien	for (i = 0; i <= f->accept; i++)
85285587Sobrien		setvec[i] = 0;
85385587Sobrien	setcnt = 0;
85485587Sobrien	/* compute positions of gototab[s,c] into setvec */
85585587Sobrien	p = f->posns[s];
85685587Sobrien	for (i = 1; i <= *p; i++) {
85785587Sobrien		if ((k = f->re[p[i]].ltype) != FINAL) {
85885587Sobrien			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
85985587Sobrien			 || (k == DOT && c != 0 && c != HAT)
86085587Sobrien			 || (k == ALL && c != 0)
86185587Sobrien			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
86285587Sobrien			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
86385587Sobrien				q = f->re[p[i]].lfollow;
86485587Sobrien				for (j = 1; j <= *q; j++) {
86585587Sobrien					if (q[j] >= maxsetvec) {
86685587Sobrien						maxsetvec *= 4;
86785587Sobrien						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
86885587Sobrien						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
86985587Sobrien						if (setvec == 0 || tmpset == 0)
87085587Sobrien							overflo("cgoto overflow");
87185587Sobrien					}
87285587Sobrien					if (setvec[q[j]] == 0) {
87385587Sobrien						setcnt++;
87485587Sobrien						setvec[q[j]] = 1;
87585587Sobrien					}
87685587Sobrien				}
87785587Sobrien			}
87885587Sobrien		}
87985587Sobrien	}
88085587Sobrien	/* determine if setvec is a previous state */
88185587Sobrien	tmpset[0] = setcnt;
88285587Sobrien	j = 1;
88385587Sobrien	for (i = f->accept; i >= 0; i--)
88485587Sobrien		if (setvec[i]) {
88585587Sobrien			tmpset[j++] = i;
88685587Sobrien		}
88785587Sobrien	/* tmpset == previous state? */
88885587Sobrien	for (i = 1; i <= f->curstat; i++) {
88985587Sobrien		p = f->posns[i];
89085587Sobrien		if ((k = tmpset[0]) != p[0])
89185587Sobrien			goto different;
89285587Sobrien		for (j = 1; j <= k; j++)
89385587Sobrien			if (tmpset[j] != p[j])
89485587Sobrien				goto different;
89585587Sobrien		/* setvec is state i */
89685587Sobrien		f->gototab[s][c] = i;
89785587Sobrien		return i;
89885587Sobrien	  different:;
89985587Sobrien	}
90085587Sobrien
90185587Sobrien	/* add tmpset to current set of states */
90285587Sobrien	if (f->curstat >= NSTATES-1) {
90385587Sobrien		f->curstat = 2;
90485587Sobrien		f->reset = 1;
90585587Sobrien		for (i = 2; i < NSTATES; i++)
90685587Sobrien			xfree(f->posns[i]);
90785587Sobrien	} else
90885587Sobrien		++(f->curstat);
90985587Sobrien	for (i = 0; i < NCHARS; i++)
91085587Sobrien		f->gototab[f->curstat][i] = 0;
91185587Sobrien	xfree(f->posns[f->curstat]);
91285587Sobrien	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
91385587Sobrien		overflo("out of space in cgoto");
91485587Sobrien
91585587Sobrien	f->posns[f->curstat] = p;
91685587Sobrien	f->gototab[s][c] = f->curstat;
91785587Sobrien	for (i = 0; i <= setcnt; i++)
91885587Sobrien		p[i] = tmpset[i];
91985587Sobrien	if (setvec[f->accept])
92085587Sobrien		f->out[f->curstat] = 1;
92185587Sobrien	else
92285587Sobrien		f->out[f->curstat] = 0;
92385587Sobrien	return f->curstat;
92485587Sobrien}
92585587Sobrien
92685587Sobrien
92785587Sobrienvoid freefa(fa *f)	/* free a finite automaton */
92885587Sobrien{
92985587Sobrien	int i;
93085587Sobrien
93185587Sobrien	if (f == NULL)
93285587Sobrien		return;
93385587Sobrien	for (i = 0; i <= f->curstat; i++)
93485587Sobrien		xfree(f->posns[i]);
93585587Sobrien	for (i = 0; i <= f->accept; i++) {
93685587Sobrien		xfree(f->re[i].lfollow);
93785587Sobrien		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
93885587Sobrien			xfree((f->re[i].lval.np));
93985587Sobrien	}
94085587Sobrien	xfree(f->restr);
94185587Sobrien	xfree(f);
94285587Sobrien}
943