b.c revision 107806
145405Smsmith/****************************************************************
245405SmsmithCopyright (C) Lucent Technologies 1997
345405SmsmithAll Rights Reserved
445405Smsmith
545405SmsmithPermission to use, copy, modify, and distribute this software and
645405Smsmithits documentation for any purpose and without fee is hereby
745405Smsmithgranted, provided that the above copyright notice appear in all
845405Smsmithcopies and that both that the copyright notice and this
945405Smsmithpermission notice and warranty disclaimer appear in supporting
1045405Smsmithdocumentation, and that the name Lucent Technologies or any of
1145405Smsmithits entities not be used in advertising or publicity pertaining
1245405Smsmithto distribution of the software without specific, written prior
1345405Smsmithpermission.
1445405Smsmith
1545405SmsmithLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
1645405SmsmithINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
1745405SmsmithIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
1845405SmsmithSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1945405SmsmithWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
2045405SmsmithIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
2145405SmsmithARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
2245405SmsmithTHIS SOFTWARE.
2345405Smsmith****************************************************************/
2445405Smsmith
2545405Smsmith/* lasciate ogne speranza, voi ch'entrate. */
2645405Smsmith
27115683Sobrien#define	DEBUG
28115683Sobrien
29115683Sobrien#include <ctype.h>
3045405Smsmith#include <stdio.h>
3145405Smsmith#include <string.h>
3245405Smsmith#include <stdlib.h>
3345405Smsmith#include "awk.h"
3445405Smsmith#include "ytab.h"
3576078Sjhb
36106842Smdodd#define	HAT	(NCHARS-2)	/* matches ^ in regular expr */
3745405Smsmith				/* NCHARS is 2**n */
3845405Smsmith#define MAXLIN 22
3945405Smsmith
4045405Smsmith#define type(v)		(v)->nobj	/* badly overloaded here */
4145405Smsmith#define info(v)		(v)->ntype	/* badly overloaded here */
4245405Smsmith#define left(v)		(v)->narg[0]
4345405Smsmith#define right(v)	(v)->narg[1]
4445405Smsmith#define parent(v)	(v)->nnext
4545405Smsmith
4645405Smsmith#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
4745405Smsmith#define UNARY	case STAR: case PLUS: case QUEST:
4845405Smsmith
4945405Smsmith/* encoding in tree Nodes:
50177070Sjhb	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
5145405Smsmith		left is index, right contains value or pointer to value
52177070Sjhb	unary (STAR, PLUS, QUEST): left is child, right is null
53177070Sjhb	binary (CAT, OR): left and right are children
54177070Sjhb	parent contains pointer to parent
55177070Sjhb*/
5645405Smsmith
57177070Sjhb
58177070Sjhbint	*setvec;
59177070Sjhbint	*tmpset;
60177070Sjhbint	maxsetvec = 0;
61177070Sjhb
6245405Smsmithint	rtok;		/* next token in current re */
63177070Sjhbint	rlxval;
64177070Sjhbstatic uschar	*rlxstr;
6545405Smsmithstatic uschar	*prestr;	/* current position in current re */
66177070Sjhbstatic uschar	*lastre;	/* origin of last re */
67106842Smdodd
68121307Ssilbystatic	int setcnt;
69177070Sjhbstatic	int poscnt;
70106842Smdodd
71177070Sjhbchar	*patbeg;
72177070Sjhbint	patlen;
73177070Sjhb
74177070Sjhb#define	NFA	20	/* cache this many dynamic fa's */
7545405Smsmithfa	*fatab[NFA];
7645405Smsmithint	nfatab	= 0;	/* entries in fatab */
77177070Sjhb
78177070Sjhbfa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
79177070Sjhb{
8045405Smsmith	int i, use, nuse;
8145405Smsmith	fa *pfa;
8246215Smsmith	static int now = 1;
83177070Sjhb
8446215Smsmith	if (setvec == 0) {	/* first time through any RE */
85177070Sjhb		maxsetvec = MAXLIN;
86177070Sjhb		setvec = (int *) malloc(maxsetvec * sizeof(int));
87177070Sjhb		tmpset = (int *) malloc(maxsetvec * sizeof(int));
88177070Sjhb		if (setvec == 0 || tmpset == 0)
89177070Sjhb			overflo("out of space initializing makedfa");
90177070Sjhb	}
91177070Sjhb
92177070Sjhb	if (compile_time)	/* a constant for sure */
93177070Sjhb		return mkdfa(s, anchor);
94177070Sjhb	for (i = 0; i < nfatab; i++)	/* is it there already? */
95177070Sjhb		if (fatab[i]->anchor == anchor
96177070Sjhb		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
97177070Sjhb			fatab[i]->use = now++;
98177070Sjhb			return fatab[i];
9945405Smsmith		}
10045405Smsmith	pfa = mkdfa(s, anchor);
10145405Smsmith	if (nfatab < NFA) {	/* room for another */
102177070Sjhb		fatab[nfatab] = pfa;
103177070Sjhb		fatab[nfatab]->use = now++;
104177070Sjhb		nfatab++;
105177070Sjhb		return pfa;
106177070Sjhb	}
107177070Sjhb	use = fatab[0]->use;	/* replace least-recently used */
108177070Sjhb	nuse = 0;
10945405Smsmith	for (i = 1; i < nfatab; i++)
11045405Smsmith		if (fatab[i]->use < use) {
111177070Sjhb			use = fatab[i]->use;
11294683Sdwmalone			nuse = i;
11394683Sdwmalone		}
114177070Sjhb	freefa(fatab[nuse]);
115177070Sjhb	fatab[nuse] = pfa;
116177070Sjhb	pfa->use = now++;
11794683Sdwmalone	return pfa;
118177070Sjhb}
119177070Sjhb
12094683Sdwmalonefa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
12194683Sdwmalone				/* anchor = 1 for anchored matches, else 0 */
122177070Sjhb{
12394683Sdwmalone	Node *p, *p1;
12448925Smsmith	fa *f;
12594683Sdwmalone
126177070Sjhb	p = reparse(s);
127177070Sjhb	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
128177070Sjhb		/* put ALL STAR in front of reg.  exp. */
12994683Sdwmalone	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
13094683Sdwmalone		/* put FINAL after reg.  exp. */
13194683Sdwmalone
13294683Sdwmalone	poscnt = 0;
13394683Sdwmalone	penter(p1);	/* enter parent pointers and leaf indices */
134177070Sjhb	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
135177070Sjhb		overflo("out of space for fa");
13694683Sdwmalone	f->accept = poscnt-1;	/* penter has computed number of positions in re */
13745405Smsmith	cfoll(f, p1);	/* set up follow sets */
13845405Smsmith	freetr(p1);
13945405Smsmith	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
14045405Smsmith			overflo("out of space in makedfa");
14145405Smsmith	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
142177070Sjhb		overflo("out of space in makedfa");
14345405Smsmith	*f->posns[1] = 0;
144177070Sjhb	f->initstat = makeinit(f, anchor);
145177070Sjhb	f->anchor = anchor;
146177070Sjhb	f->restr = (uschar *) tostring(s);
147177070Sjhb	return f;
148177070Sjhb}
149177070Sjhb
150177070Sjhbint makeinit(fa *f, int anchor)
151177070Sjhb{
15245405Smsmith	int i, k;
15345405Smsmith
15445405Smsmith	f->curstat = 2;
155177070Sjhb	f->out[2] = 0;
156177070Sjhb	f->reset = 0;
157177070Sjhb	k = *(f->re[0].lfollow);
158177070Sjhb	xfree(f->posns[2]);
15945405Smsmith	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
16045405Smsmith		overflo("out of space in makeinit");
16145405Smsmith	for (i=0; i <= k; i++) {
16245405Smsmith		(f->posns[2])[i] = (f->re[0].lfollow)[i];
163177070Sjhb	}
164177070Sjhb	if ((f->posns[2])[1] == f->accept)
165177070Sjhb		f->out[2] = 1;
16645405Smsmith	for (i=0; i < NCHARS; i++)
167177070Sjhb		f->gototab[2][i] = 0;
16845405Smsmith	f->curstat = cgoto(f, 2, HAT);
169177070Sjhb	if (anchor) {
170177070Sjhb		*f->posns[2] = k-1;	/* leave out position 0 */
171177070Sjhb		for (i=0; i < k; i++) {
172177070Sjhb			(f->posns[0])[i] = (f->posns[2])[i];
173177070Sjhb		}
174177070Sjhb
175177070Sjhb		f->out[0] = f->out[2];
176177070Sjhb		if (f->curstat != 2)
177177070Sjhb			--(*f->posns[f->curstat]);
178177070Sjhb	}
179177070Sjhb	return f->curstat;
180177070Sjhb}
181177070Sjhb
182177070Sjhbvoid penter(Node *p)	/* set up parent pointers and leaf indices */
183177070Sjhb{
184177070Sjhb	switch (type(p)) {
185177070Sjhb	LEAF
186177070Sjhb		info(p) = poscnt;
187177070Sjhb		poscnt++;
188177070Sjhb		break;
189177070Sjhb	UNARY
190177070Sjhb		penter(left(p));
191177070Sjhb		parent(left(p)) = p;
192177070Sjhb		break;
193177070Sjhb	case CAT:
194177070Sjhb	case OR:
195177070Sjhb		penter(left(p));
196177070Sjhb		penter(right(p));
197177070Sjhb		parent(left(p)) = p;
198177070Sjhb		parent(right(p)) = p;
199177070Sjhb		break;
200177070Sjhb	default:	/* can't happen */
201177070Sjhb		FATAL("can't happen: unknown type %d in penter", type(p));
202177070Sjhb		break;
203177070Sjhb	}
204177070Sjhb}
205177070Sjhb
206177070Sjhbvoid freetr(Node *p)	/* free parse tree */
20745405Smsmith{
208177070Sjhb	switch (type(p)) {
209177070Sjhb	LEAF
210177070Sjhb		xfree(p);
211177070Sjhb		break;
212177070Sjhb	UNARY
21345405Smsmith		freetr(left(p));
214177070Sjhb		xfree(p);
215177070Sjhb		break;
216177070Sjhb	case CAT:
217177070Sjhb	case OR:
218177070Sjhb		freetr(left(p));
219177070Sjhb		freetr(right(p));
220177070Sjhb		xfree(p);
221177070Sjhb		break;
222177070Sjhb	default:	/* can't happen */
223177070Sjhb		FATAL("can't happen: unknown type %d in freetr", type(p));
224177070Sjhb		break;
225177070Sjhb	}
226177070Sjhb}
227177070Sjhb
228177070Sjhb/* in the parsing of regular expressions, metacharacters like . have */
229177070Sjhb/* to be seen literally;  \056 is not a metacharacter. */
23045405Smsmith
23145405Smsmithint hexstr(char **pp)	/* find and eval hex string at pp, return new p */
23245405Smsmith{			/* only pick up one 8-bit byte (2 chars) */
23345405Smsmith	uschar *p;
23445405Smsmith	int n = 0;
23545405Smsmith	int i;
23645405Smsmith
23745405Smsmith	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
23845405Smsmith		if (isdigit(*p))
239177070Sjhb			n = 16 * n + *p - '0';
24045405Smsmith		else if (*p >= 'a' && *p <= 'f')
241177070Sjhb			n = 16 * n + *p - 'a' + 10;
24245405Smsmith		else if (*p >= 'A' && *p <= 'F')
243177070Sjhb			n = 16 * n + *p - 'A' + 10;
244177070Sjhb	}
245177070Sjhb	*pp = (char *) p;
246177070Sjhb	return n;
247177070Sjhb}
248177070Sjhb
249177070Sjhb#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
25045405Smsmith
25145405Smsmithint quoted(char **pp)	/* pick up next thing after a \\ */
25294683Sdwmalone			/* and increment *pp */
25394683Sdwmalone{
25494683Sdwmalone	char *p = *pp;
25594683Sdwmalone	int c;
25694683Sdwmalone
25794683Sdwmalone	if ((c = *p++) == 't')
258177070Sjhb		c = '\t';
259177070Sjhb	else if (c == 'n')
26094683Sdwmalone		c = '\n';
26194683Sdwmalone	else if (c == 'f')
26245405Smsmith		c = '\f';
26346215Smsmith	else if (c == 'r')
26446215Smsmith		c = '\r';
26546215Smsmith	else if (c == 'b')
26646215Smsmith		c = '\b';
26745405Smsmith	else if (c == '\\')
26848925Smsmith		c = '\\';
26945405Smsmith	else if (c == 'x') {	/* hexadecimal goo follows */
27045405Smsmith		c = hexstr(&p);	/* this adds a null if number is invalid */
27145405Smsmith	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
272177070Sjhb		int n = c - '0';
273177070Sjhb		if (isoctdigit(*p)) {
274177070Sjhb			n = 8 * n + *p++ - '0';
275177070Sjhb			if (isoctdigit(*p))
276177070Sjhb				n = 8 * n + *p++ - '0';
277177070Sjhb		}
278177070Sjhb		c = n;
27948925Smsmith	} /* else */
280177070Sjhb		/* c = c; */
281177070Sjhb	*pp = p;
282177070Sjhb	return c;
28348925Smsmith}
28446215Smsmith
28546215Smsmithstatic int collate_range_cmp(int a, int b)
28646215Smsmith{
28746215Smsmith	int r;
288177070Sjhb	static char s[2][2];
289177070Sjhb
29046215Smsmith	if ((uschar)a == (uschar)b)
29148925Smsmith		return 0;
29248925Smsmith	s[0][0] = a;
29346215Smsmith	s[1][0] = b;
294177070Sjhb	if ((r = strcoll(s[0], s[1])) == 0)
295177070Sjhb		r = (uschar)a - (uschar)b;
296177070Sjhb	return r;
297177070Sjhb}
298177070Sjhb
29946215Smsmithchar *cclenter(const char *argp)	/* add a character class */
300177070Sjhb{
30146215Smsmith	int i, c, c2;
302177070Sjhb	int j;
303177070Sjhb	uschar *p = (uschar *) argp;
304177070Sjhb	uschar *op, *bp;
305177070Sjhb	static uschar *buf = 0;
30645405Smsmith	static int bufsz = 100;
307177070Sjhb
308177070Sjhb	op = p;
309177070Sjhb	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
310177070Sjhb		FATAL("out of space for character class [%.10s...] 1", p);
311177070Sjhb	bp = buf;
312177070Sjhb	for (i = 0; (c = *p++) != 0; ) {
313177070Sjhb		if (c == '\\') {
314177070Sjhb			c = quoted((char **) &p);
315177070Sjhb		} else if (c == '-' && i > 0 && bp[-1] != 0) {
316177070Sjhb			if (*p != 0) {
317177070Sjhb				c = bp[-1];
318177070Sjhb				c2 = *p++;
319177070Sjhb				if (c2 == '\\')
320177070Sjhb					c2 = quoted((char **) &p);
321177070Sjhb				if (collate_range_cmp(c, c2) > 0) {	/* empty; ignore */
322177070Sjhb					bp--;
323177070Sjhb					i--;
324177070Sjhb					continue;
325177070Sjhb				}
326177070Sjhb				for (j = 0; j < NCHARS; j++) {
327177070Sjhb					if ((collate_range_cmp(c, j) > 0) ||
328177070Sjhb					    collate_range_cmp(j, c2) > 0)
329177070Sjhb						continue;
330177070Sjhb					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
331177070Sjhb						FATAL("out of space for character class [%.10s...] 2", p);
332177070Sjhb					*bp++ = j;
333177070Sjhb					i++;
334177070Sjhb				}
335177070Sjhb				continue;
336177070Sjhb			}
337177070Sjhb		}
338177070Sjhb		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
339177070Sjhb			FATAL("out of space for character class [%.10s...] 3", p);
340177070Sjhb		*bp++ = c;
341177070Sjhb		i++;
342177070Sjhb	}
343177070Sjhb	*bp = 0;
344177070Sjhb	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
345177070Sjhb	xfree(op);
346177070Sjhb	return (char *) tostring((char *) buf);
347177070Sjhb}
348177070Sjhb
349177070Sjhbvoid overflo(const char *s)
350177070Sjhb{
351177070Sjhb	FATAL("regular expression too big: %.30s...", s);
352177070Sjhb}
353177070Sjhb
35445405Smsmithvoid cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
355177070Sjhb{
356177070Sjhb	int i;
357177070Sjhb	int *p;
358177070Sjhb
359177070Sjhb	switch (type(v)) {
360177070Sjhb	LEAF
361177070Sjhb		f->re[info(v)].ltype = type(v);
362177070Sjhb		f->re[info(v)].lval.np = right(v);
363177070Sjhb		while (f->accept >= maxsetvec) {	/* guessing here! */
364177070Sjhb			maxsetvec *= 4;
365177070Sjhb			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
366177070Sjhb			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
367177070Sjhb			if (setvec == 0 || tmpset == 0)
368177070Sjhb				overflo("out of space in cfoll()");
369177070Sjhb		}
370177070Sjhb		for (i = 0; i <= f->accept; i++)
371177070Sjhb			setvec[i] = 0;
372177070Sjhb		setcnt = 0;
373177070Sjhb		follow(v);	/* computes setvec and setcnt */
374177070Sjhb		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
375177070Sjhb			overflo("out of space building follow set");
376177070Sjhb		f->re[info(v)].lfollow = p;
37745405Smsmith		*p = setcnt;
37845405Smsmith		for (i = f->accept; i >= 0; i--)
379177070Sjhb			if (setvec[i] == 1)
380177070Sjhb				*++p = i;
381177070Sjhb		break;
382177070Sjhb	UNARY
383177070Sjhb		cfoll(f,left(v));
384177070Sjhb		break;
385177070Sjhb	case CAT:
386177070Sjhb	case OR:
387177070Sjhb		cfoll(f,left(v));
388177070Sjhb		cfoll(f,right(v));
389177070Sjhb		break;
39045405Smsmith	default:	/* can't happen */
39145405Smsmith		FATAL("can't happen: unknown type %d in cfoll", type(v));
39245405Smsmith	}
39345405Smsmith}
39445405Smsmith
39545405Smsmithint first(Node *p)	/* collects initially active leaves of p into setvec */
39645405Smsmith			/* returns 1 if p matches empty string */
39745405Smsmith{
398177070Sjhb	int b, lp;
399177070Sjhb
400177070Sjhb	switch (type(p)) {
401177070Sjhb	LEAF
402177070Sjhb		lp = info(p);	/* look for high-water mark of subscripts */
403177070Sjhb		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
404177070Sjhb			maxsetvec *= 4;
405177070Sjhb			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
406177070Sjhb			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
40745405Smsmith			if (setvec == 0 || tmpset == 0)
40845405Smsmith				overflo("out of space in first()");
40945405Smsmith		}
410177070Sjhb		if (setvec[lp] != 1) {
411177070Sjhb			setvec[lp] = 1;
41245405Smsmith			setcnt++;
413177070Sjhb		}
414177070Sjhb		if (type(p) == CCL && (*(char *) right(p)) == '\0')
415177070Sjhb			return(0);		/* empty CCL */
41645405Smsmith		else return(1);
417177070Sjhb	case PLUS:
418177070Sjhb		if (first(left(p)) == 0) return(0);
41945405Smsmith		return(1);
42045405Smsmith	case STAR:
42145405Smsmith	case QUEST:
42245405Smsmith		first(left(p));
423177070Sjhb		return(0);
42445405Smsmith	case CAT:
425177070Sjhb		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
426177070Sjhb		return(1);
427177070Sjhb	case OR:
428177070Sjhb		b = first(right(p));
42945405Smsmith		if (first(left(p)) == 0 || b == 0) return(0);
430177070Sjhb		return(1);
431177070Sjhb	}
432177070Sjhb	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
433177070Sjhb	return(-1);
434177070Sjhb}
435177070Sjhb
436177070Sjhbvoid follow(Node *v)	/* collects leaves that can follow v into setvec */
437177070Sjhb{
438103346Sdwmalone	Node *p;
439177070Sjhb
440177070Sjhb	if (type(v) == FINAL)
441177070Sjhb		return;
442103346Sdwmalone	p = parent(v);
443103346Sdwmalone	switch (type(p)) {
444177070Sjhb	case STAR:
44545405Smsmith	case PLUS:
44645405Smsmith		first(v);
44745405Smsmith		follow(p);
44845405Smsmith		return;
44945405Smsmith
45045405Smsmith	case OR:
45145405Smsmith	case QUEST:
45245405Smsmith		follow(p);
453177070Sjhb		return;
454177070Sjhb
45545405Smsmith	case CAT:
456177070Sjhb		if (v == left(p)) {	/* v is left child of p */
457177070Sjhb			if (first(right(p)) == 0) {
458177070Sjhb				follow(p);
459177070Sjhb				return;
460177070Sjhb			}
461177070Sjhb		} else		/* v is right child */
462177070Sjhb			follow(p);
463177070Sjhb		return;
464177070Sjhb	}
465177070Sjhb}
466177070Sjhb
467177070Sjhbint member(int c, const char *sarg)	/* is c in s? */
468177070Sjhb{
469177070Sjhb	uschar *s = (uschar *) sarg;
470177070Sjhb
471177070Sjhb	while (*s)
472177070Sjhb		if (c == *s++)
473177070Sjhb			return(1);
474177070Sjhb	return(0);
475177070Sjhb}
476177070Sjhb
477177070Sjhbint match(fa *f, const char *p0)	/* shortest match ? */
478177070Sjhb{
479177070Sjhb	int s, ns;
480177070Sjhb	uschar *p = (uschar *) p0;
481177070Sjhb
482177070Sjhb	s = f->reset ? makeinit(f,0) : f->initstat;
483177070Sjhb	if (f->out[s])
484177070Sjhb		return(1);
485177070Sjhb	do {
486177070Sjhb		if ((ns = f->gototab[s][*p]) != 0)
487177070Sjhb			s = ns;
488177070Sjhb		else
489177070Sjhb			s = cgoto(f, s, *p);
490177070Sjhb		if (f->out[s])
491177070Sjhb			return(1);
492177070Sjhb	} while (*p++ != 0);
493177070Sjhb	return(0);
494177070Sjhb}
495177070Sjhb
496177070Sjhbint pmatch(fa *f, const char *p0)	/* longest match, for sub */
497177070Sjhb{
498177070Sjhb	int s, ns;
499177070Sjhb	uschar *p = (uschar *) p0;
500177070Sjhb	uschar *q;
50145405Smsmith	int i, k;
50245405Smsmith
503177070Sjhb	s = f->reset ? makeinit(f,1) : f->initstat;
504177070Sjhb	patbeg = (char *) p;
505177070Sjhb	patlen = -1;
506177070Sjhb	do {
507177070Sjhb		q = p;
508177070Sjhb		do {
509177070Sjhb			if (f->out[s])		/* final state */
510177070Sjhb				patlen = q-p;
511177070Sjhb			if ((ns = f->gototab[s][*q]) != 0)
512177070Sjhb				s = ns;
51345405Smsmith			else
51445405Smsmith				s = cgoto(f, s, *q);
51545405Smsmith			if (s == 1) {	/* no transition */
51645405Smsmith				if (patlen >= 0) {
51745405Smsmith					patbeg = (char *) p;
51845405Smsmith					return(1);
51945405Smsmith				}
52045405Smsmith				else
521177070Sjhb					goto nextin;	/* no match */
522177070Sjhb			}
52345405Smsmith		} while (*q++ != 0);
524177070Sjhb		if (f->out[s])
525177070Sjhb			patlen = q-p-1;	/* don't count $ */
526177070Sjhb		if (patlen >= 0) {
527177070Sjhb			patbeg = (char *) p;
528177070Sjhb			return(1);
529177070Sjhb		}
530177070Sjhb	nextin:
531177070Sjhb		s = 2;
532177070Sjhb		if (f->reset) {
53345405Smsmith			for (i = 2; i <= f->curstat; i++)
534177070Sjhb				xfree(f->posns[i]);
53545405Smsmith			k = *f->posns[0];
536177070Sjhb			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
537177070Sjhb				overflo("out of space in pmatch");
538177070Sjhb			for (i = 0; i <= k; i++)
539177070Sjhb				(f->posns[2])[i] = (f->posns[0])[i];
540177070Sjhb			f->initstat = f->curstat = 2;
541177070Sjhb			f->out[2] = f->out[0];
542177070Sjhb			for (i = 0; i < NCHARS; i++)
543177070Sjhb				f->gototab[2][i] = 0;
544177070Sjhb		}
545177070Sjhb	} while (*p++ != 0);
546177070Sjhb	return (0);
547177070Sjhb}
548177070Sjhb
549177070Sjhbint nematch(fa *f, const char *p0)	/* non-empty match, for sub */
550177070Sjhb{
551177070Sjhb	int s, ns;
552177070Sjhb	uschar *p = (uschar *) p0;
553177070Sjhb	uschar *q;
554177070Sjhb	int i, k;
555177070Sjhb
556177070Sjhb	s = f->reset ? makeinit(f,1) : f->initstat;
557177070Sjhb	patlen = -1;
558177070Sjhb	while (*p) {
559177070Sjhb		q = p;
560177070Sjhb		do {
56145405Smsmith			if (f->out[s])		/* final state */
56245405Smsmith				patlen = q-p;
563177070Sjhb			if ((ns = f->gototab[s][*q]) != 0)
564177070Sjhb				s = ns;
56545405Smsmith			else
566177070Sjhb				s = cgoto(f, s, *q);
567177070Sjhb			if (s == 1) {	/* no transition */
568177070Sjhb				if (patlen > 0) {
56945405Smsmith					patbeg = (char *) p;
57045405Smsmith					return(1);
57145405Smsmith				} else
572177070Sjhb					goto nnextin;	/* no nonempty match */
573177070Sjhb			}
57445405Smsmith		} while (*q++ != 0);
57545405Smsmith		if (f->out[s])
57645405Smsmith			patlen = q-p-1;	/* don't count $ */
57745405Smsmith		if (patlen > 0 ) {
578177070Sjhb			patbeg = (char *) p;
579177070Sjhb			return(1);
58045405Smsmith		}
581177070Sjhb	nnextin:
582177070Sjhb		s = 2;
58345405Smsmith		if (f->reset) {
584177070Sjhb			for (i = 2; i <= f->curstat; i++)
585177070Sjhb				xfree(f->posns[i]);
586177070Sjhb			k = *f->posns[0];
587177070Sjhb			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
588177070Sjhb				overflo("out of state space");
589177070Sjhb			for (i = 0; i <= k; i++)
590177070Sjhb				(f->posns[2])[i] = (f->posns[0])[i];
59145405Smsmith			f->initstat = f->curstat = 2;
592177070Sjhb			f->out[2] = f->out[0];
59345405Smsmith			for (i = 0; i < NCHARS; i++)
594177070Sjhb				f->gototab[2][i] = 0;
595177070Sjhb		}
596177070Sjhb		p++;
597177070Sjhb	}
598177070Sjhb	return (0);
59945405Smsmith}
600177070Sjhb
601177070SjhbNode *reparse(const char *p)	/* parses regular expression pointed to by p */
602177070Sjhb{			/* uses relex() to scan regular expression */
60345405Smsmith	Node *np;
604177070Sjhb
60545405Smsmith	dprintf( ("reparse <%s>\n", p) );
606177070Sjhb	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
607177070Sjhb	rtok = relex();
608177070Sjhb	/* GNU compatibility: an empty regexp matches anything */
609177070Sjhb	if (rtok == '\0')
610177070Sjhb		/* FATAL("empty regular expression"); previous */
611177070Sjhb		return(op2(ALL, NIL, NIL));
612177070Sjhb	np = regexp();
613177070Sjhb	if (rtok != '\0')
614177070Sjhb		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
615177070Sjhb	return(np);
616177070Sjhb}
617177070Sjhb
618177070SjhbNode *regexp(void)	/* top-level parse of reg expr */
619177070Sjhb{
620177070Sjhb	return (alt(concat(primary())));
621177070Sjhb}
622177070Sjhb
623177070SjhbNode *primary(void)
624177070Sjhb{
625177070Sjhb	Node *np;
62645405Smsmith
627177070Sjhb	switch (rtok) {
628177070Sjhb	case CHAR:
629177070Sjhb		np = op2(CHAR, NIL, itonp(rlxval));
630177070Sjhb		rtok = relex();
631177070Sjhb		return (unary(np));
632177070Sjhb	case ALL:
633177070Sjhb		rtok = relex();
634177070Sjhb		return (unary(op2(ALL, NIL, NIL)));
635177070Sjhb	case DOT:
636177070Sjhb		rtok = relex();
637177070Sjhb		return (unary(op2(DOT, NIL, NIL)));
63845405Smsmith	case CCL:
63945405Smsmith		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
64045405Smsmith		rtok = relex();
64146215Smsmith		return (unary(np));
64246215Smsmith	case NCCL:
64346215Smsmith		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
64445405Smsmith		rtok = relex();
64546215Smsmith		return (unary(np));
64646215Smsmith	case '^':
647177070Sjhb		rtok = relex();
648177070Sjhb		return (unary(op2(CHAR, NIL, itonp(HAT))));
649177070Sjhb	case '$':
65046215Smsmith		rtok = relex();
65146215Smsmith		return (unary(op2(CHAR, NIL, NIL)));
65246215Smsmith	case '(':
65345405Smsmith		rtok = relex();
65445405Smsmith		if (rtok == ')') {	/* special pleading for () */
655177070Sjhb			rtok = relex();
656177124Sjhb			return unary(op2(CCL, NIL, (Node *) tostring("")));
657177124Sjhb		}
658177124Sjhb		np = regexp();
659177124Sjhb		if (rtok == ')') {
660177124Sjhb			rtok = relex();
661177124Sjhb			return (unary(np));
662177124Sjhb		}
663177124Sjhb		else
664177124Sjhb			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
665177124Sjhb	default:
66645405Smsmith		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
667177070Sjhb	}
668	return 0;	/*NOTREACHED*/
669}
670
671Node *concat(Node *np)
672{
673	switch (rtok) {
674	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
675		return (concat(op2(CAT, np, primary())));
676	}
677	return (np);
678}
679
680Node *alt(Node *np)
681{
682	if (rtok == OR) {
683		rtok = relex();
684		return (alt(op2(OR, np, concat(primary()))));
685	}
686	return (np);
687}
688
689Node *unary(Node *np)
690{
691	switch (rtok) {
692	case STAR:
693		rtok = relex();
694		return (unary(op2(STAR, np, NIL)));
695	case PLUS:
696		rtok = relex();
697		return (unary(op2(PLUS, np, NIL)));
698	case QUEST:
699		rtok = relex();
700		return (unary(op2(QUEST, np, NIL)));
701	default:
702		return (np);
703	}
704}
705
706/*
707 * Character class definitions conformant to the POSIX locale as
708 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
709 * and operating character sets are both ASCII (ISO646) or supersets
710 * thereof.
711 *
712 * Note that to avoid overflowing the temporary buffer used in
713 * relex(), the expanded character class (prior to range expansion)
714 * must be less than twice the size of their full name.
715 */
716
717struct charclass {
718	const char *cc_name;
719	int cc_namelen;
720	int (*cc_func)(int);
721} charclasses[] = {
722	{ "alnum",	5,	isalnum },
723	{ "alpha",	5,	isalpha },
724	{ "blank",	5,	isblank },
725	{ "cntrl",	5,	iscntrl },
726	{ "digit",	5,	isdigit },
727	{ "graph",	5,	isgraph },
728	{ "lower",	5,	islower },
729	{ "print",	5,	isprint },
730	{ "punct",	5,	ispunct },
731	{ "space",	5,	isspace },
732	{ "upper",	5,	isupper },
733	{ "xdigit",	6,	isxdigit },
734	{ NULL,		0,	NULL },
735};
736
737
738int relex(void)		/* lexical analyzer for reparse */
739{
740	int c, n;
741	int cflag;
742	static uschar *buf = 0;
743	static int bufsz = 100;
744	uschar *bp;
745	struct charclass *cc;
746	int i;
747
748	switch (c = *prestr++) {
749	case '|': return OR;
750	case '*': return STAR;
751	case '+': return PLUS;
752	case '?': return QUEST;
753	case '.': return DOT;
754	case '\0': prestr--; return '\0';
755	case '^':
756	case '$':
757	case '(':
758	case ')':
759		return c;
760	case '\\':
761		rlxval = quoted((char **) &prestr);
762		return CHAR;
763	default:
764		rlxval = c;
765		return CHAR;
766	case '[':
767		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
768			FATAL("out of space in reg expr %.10s..", lastre);
769		bp = buf;
770		if (*prestr == '^') {
771			cflag = 1;
772			prestr++;
773		}
774		else
775			cflag = 0;
776		n = 2 * strlen((const char *) prestr)+1;
777		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
778			FATAL("out of space for reg expr %.10s...", lastre);
779		for (; ; ) {
780			if ((c = *prestr++) == '\\') {
781				*bp++ = '\\';
782				if ((c = *prestr++) == '\0')
783					FATAL("nonterminated character class %.20s...", lastre);
784				*bp++ = c;
785			/* } else if (c == '\n') { */
786			/* 	FATAL("newline in character class %.20s...", lastre); */
787			} else if (c == '[' && *prestr == ':') {
788				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
789				for (cc = charclasses; cc->cc_name; cc++)
790					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
791						break;
792				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
793				    prestr[2 + cc->cc_namelen] == ']') {
794					prestr += cc->cc_namelen + 3;
795					for (i = 0; i < NCHARS; i++) {
796						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
797						    FATAL("out of space for reg expr %.10s...", lastre);
798						if (cc->cc_func(i)) {
799							*bp++ = i;
800							n++;
801						}
802					}
803				} else
804					*bp++ = c;
805			} else if (c == '\0') {
806				FATAL("nonterminated character class %.20s", lastre);
807			} else if (bp == buf) {	/* 1st char is special */
808				*bp++ = c;
809			} else if (c == ']') {
810				*bp++ = 0;
811				rlxstr = (uschar *) tostring((char *) buf);
812				if (cflag == 0)
813					return CCL;
814				else
815					return NCCL;
816			} else
817				*bp++ = c;
818		}
819	}
820}
821
822int cgoto(fa *f, int s, int c)
823{
824	int i, j, k;
825	int *p, *q;
826
827	if (c < 0 || c > 255)
828		FATAL("can't happen: neg char %d in cgoto", c);
829	while (f->accept >= maxsetvec) {	/* guessing here! */
830		maxsetvec *= 4;
831		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
832		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
833		if (setvec == 0 || tmpset == 0)
834			overflo("out of space in cgoto()");
835	}
836	for (i = 0; i <= f->accept; i++)
837		setvec[i] = 0;
838	setcnt = 0;
839	/* compute positions of gototab[s,c] into setvec */
840	p = f->posns[s];
841	for (i = 1; i <= *p; i++) {
842		if ((k = f->re[p[i]].ltype) != FINAL) {
843			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
844			 || (k == DOT && c != 0 && c != HAT)
845			 || (k == ALL && c != 0)
846			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
847			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
848				q = f->re[p[i]].lfollow;
849				for (j = 1; j <= *q; j++) {
850					if (q[j] >= maxsetvec) {
851						maxsetvec *= 4;
852						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
853						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
854						if (setvec == 0 || tmpset == 0)
855							overflo("cgoto overflow");
856					}
857					if (setvec[q[j]] == 0) {
858						setcnt++;
859						setvec[q[j]] = 1;
860					}
861				}
862			}
863		}
864	}
865	/* determine if setvec is a previous state */
866	tmpset[0] = setcnt;
867	j = 1;
868	for (i = f->accept; i >= 0; i--)
869		if (setvec[i]) {
870			tmpset[j++] = i;
871		}
872	/* tmpset == previous state? */
873	for (i = 1; i <= f->curstat; i++) {
874		p = f->posns[i];
875		if ((k = tmpset[0]) != p[0])
876			goto different;
877		for (j = 1; j <= k; j++)
878			if (tmpset[j] != p[j])
879				goto different;
880		/* setvec is state i */
881		f->gototab[s][c] = i;
882		return i;
883	  different:;
884	}
885
886	/* add tmpset to current set of states */
887	if (f->curstat >= NSTATES-1) {
888		f->curstat = 2;
889		f->reset = 1;
890		for (i = 2; i < NSTATES; i++)
891			xfree(f->posns[i]);
892	} else
893		++(f->curstat);
894	for (i = 0; i < NCHARS; i++)
895		f->gototab[f->curstat][i] = 0;
896	xfree(f->posns[f->curstat]);
897	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
898		overflo("out of space in cgoto");
899
900	f->posns[f->curstat] = p;
901	f->gototab[s][c] = f->curstat;
902	for (i = 0; i <= setcnt; i++)
903		p[i] = tmpset[i];
904	if (setvec[f->accept])
905		f->out[f->curstat] = 1;
906	else
907		f->out[f->curstat] = 0;
908	return f->curstat;
909}
910
911
912void freefa(fa *f)	/* free a finite automaton */
913{
914	int i;
915
916	if (f == NULL)
917		return;
918	for (i = 0; i <= f->curstat; i++)
919		xfree(f->posns[i]);
920	for (i = 0; i <= f->accept; i++) {
921		xfree(f->re[i].lfollow);
922		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
923			xfree((f->re[i].lval.np));
924	}
925	xfree(f->restr);
926	xfree(f);
927}
928