1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD$");
29
30#define	DEBUG
31
32#include <ctype.h>
33#include <limits.h>
34#include <stdio.h>
35#include <string.h>
36#include <stdlib.h>
37#include "awk.h"
38#include "ytab.h"
39
40#define MAXLIN 22
41
42#define type(v)		(v)->nobj	/* badly overloaded here */
43#define info(v)		(v)->ntype	/* badly overloaded here */
44#define left(v)		(v)->narg[0]
45#define right(v)	(v)->narg[1]
46#define parent(v)	(v)->nnext
47
48#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49#define ELEAF	case EMPTYRE:		/* empty string in regexp */
50#define UNARY	case STAR: case PLUS: case QUEST:
51
52/* encoding in tree Nodes:
53	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
54		left is index, right contains value or pointer to value
55	unary (STAR, PLUS, QUEST): left is child, right is null
56	binary (CAT, OR): left and right are children
57	parent contains pointer to parent
58*/
59
60
61int	*setvec;
62int	*tmpset;
63int	maxsetvec = 0;
64
65int	rtok;		/* next token in current re */
66int	rlxval;
67static uschar	*rlxstr;
68static uschar	*prestr;	/* current position in current re */
69static uschar	*lastre;	/* origin of last re */
70static uschar	*lastatom;	/* origin of last Atom */
71static uschar	*starttok;
72static uschar 	*basestr;	/* starts with original, replaced during
73				   repetition processing */
74static uschar 	*firstbasestr;
75
76static	int setcnt;
77static	int poscnt;
78
79char	*patbeg;
80int	patlen;
81
82#define	NFA	20	/* cache this many dynamic fa's */
83fa	*fatab[NFA];
84int	nfatab	= 0;	/* entries in fatab */
85
86fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
87{
88	int i, use, nuse;
89	fa *pfa;
90	static int now = 1;
91
92	if (setvec == NULL) {	/* first time through any RE */
93		maxsetvec = MAXLIN;
94		setvec = (int *) malloc(maxsetvec * sizeof(int));
95		tmpset = (int *) malloc(maxsetvec * sizeof(int));
96		if (setvec == NULL || tmpset == NULL)
97			overflo("out of space initializing makedfa");
98	}
99
100	if (compile_time)	/* a constant for sure */
101		return mkdfa(s, anchor);
102	for (i = 0; i < nfatab; i++)	/* is it there already? */
103		if (fatab[i]->anchor == anchor
104		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
105			fatab[i]->use = now++;
106			return fatab[i];
107		}
108	pfa = mkdfa(s, anchor);
109	if (nfatab < NFA) {	/* room for another */
110		fatab[nfatab] = pfa;
111		fatab[nfatab]->use = now++;
112		nfatab++;
113		return pfa;
114	}
115	use = fatab[0]->use;	/* replace least-recently used */
116	nuse = 0;
117	for (i = 1; i < nfatab; i++)
118		if (fatab[i]->use < use) {
119			use = fatab[i]->use;
120			nuse = i;
121		}
122	freefa(fatab[nuse]);
123	fatab[nuse] = pfa;
124	pfa->use = now++;
125	return pfa;
126}
127
128fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
129				/* anchor = 1 for anchored matches, else 0 */
130{
131	Node *p, *p1;
132	fa *f;
133
134	firstbasestr = (uschar *) s;
135	basestr = firstbasestr;
136	p = reparse(s);
137	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
138		/* put ALL STAR in front of reg.  exp. */
139	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
140		/* put FINAL after reg.  exp. */
141
142	poscnt = 0;
143	penter(p1);	/* enter parent pointers and leaf indices */
144	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
145		overflo("out of space for fa");
146	f->accept = poscnt-1;	/* penter has computed number of positions in re */
147	cfoll(f, p1);	/* set up follow sets */
148	freetr(p1);
149	if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
150			overflo("out of space in makedfa");
151	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
152		overflo("out of space in makedfa");
153	*f->posns[1] = 0;
154	f->initstat = makeinit(f, anchor);
155	f->anchor = anchor;
156	f->restr = (uschar *) tostring(s);
157	if (firstbasestr != basestr) {
158		if (basestr)
159			xfree(basestr);
160	}
161	return f;
162}
163
164int makeinit(fa *f, int anchor)
165{
166	int i, k;
167
168	f->curstat = 2;
169	f->out[2] = 0;
170	f->reset = 0;
171	k = *(f->re[0].lfollow);
172	xfree(f->posns[2]);
173	if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
174		overflo("out of space in makeinit");
175	for (i=0; i <= k; i++) {
176		(f->posns[2])[i] = (f->re[0].lfollow)[i];
177	}
178	if ((f->posns[2])[1] == f->accept)
179		f->out[2] = 1;
180	for (i=0; i < NCHARS; i++)
181		f->gototab[2][i] = 0;
182	f->curstat = cgoto(f, 2, HAT);
183	if (anchor) {
184		*f->posns[2] = k-1;	/* leave out position 0 */
185		for (i=0; i < k; i++) {
186			(f->posns[0])[i] = (f->posns[2])[i];
187		}
188
189		f->out[0] = f->out[2];
190		if (f->curstat != 2)
191			--(*f->posns[f->curstat]);
192	}
193	return f->curstat;
194}
195
196void penter(Node *p)	/* set up parent pointers and leaf indices */
197{
198	switch (type(p)) {
199	ELEAF
200	LEAF
201		info(p) = poscnt;
202		poscnt++;
203		break;
204	UNARY
205		penter(left(p));
206		parent(left(p)) = p;
207		break;
208	case CAT:
209	case OR:
210		penter(left(p));
211		penter(right(p));
212		parent(left(p)) = p;
213		parent(right(p)) = p;
214		break;
215	default:	/* can't happen */
216		FATAL("can't happen: unknown type %d in penter", type(p));
217		break;
218	}
219}
220
221void freetr(Node *p)	/* free parse tree */
222{
223	switch (type(p)) {
224	ELEAF
225	LEAF
226		xfree(p);
227		break;
228	UNARY
229		freetr(left(p));
230		xfree(p);
231		break;
232	case CAT:
233	case OR:
234		freetr(left(p));
235		freetr(right(p));
236		xfree(p);
237		break;
238	default:	/* can't happen */
239		FATAL("can't happen: unknown type %d in freetr", type(p));
240		break;
241	}
242}
243
244/* in the parsing of regular expressions, metacharacters like . have */
245/* to be seen literally;  \056 is not a metacharacter. */
246
247int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
248{			/* only pick up one 8-bit byte (2 chars) */
249	uschar *p;
250	int n = 0;
251	int i;
252
253	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
254		if (isdigit(*p))
255			n = 16 * n + *p - '0';
256		else if (*p >= 'a' && *p <= 'f')
257			n = 16 * n + *p - 'a' + 10;
258		else if (*p >= 'A' && *p <= 'F')
259			n = 16 * n + *p - 'A' + 10;
260	}
261	*pp = (uschar *) p;
262	return n;
263}
264
265#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
266
267int quoted(uschar **pp)	/* pick up next thing after a \\ */
268			/* and increment *pp */
269{
270	uschar *p = *pp;
271	int c;
272
273	if ((c = *p++) == 't')
274		c = '\t';
275	else if (c == 'n')
276		c = '\n';
277	else if (c == 'f')
278		c = '\f';
279	else if (c == 'r')
280		c = '\r';
281	else if (c == 'b')
282		c = '\b';
283	else if (c == '\\')
284		c = '\\';
285	else if (c == 'x') {	/* hexadecimal goo follows */
286		c = hexstr(&p);	/* this adds a null if number is invalid */
287	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
288		int n = c - '0';
289		if (isoctdigit(*p)) {
290			n = 8 * n + *p++ - '0';
291			if (isoctdigit(*p))
292				n = 8 * n + *p++ - '0';
293		}
294		c = n;
295	} /* else */
296		/* c = c; */
297	*pp = p;
298	return c;
299}
300
301static int collate_range_cmp(int a, int b)
302{
303	static char s[2][2];
304
305	if ((uschar)a == (uschar)b)
306		return 0;
307	s[0][0] = a;
308	s[1][0] = b;
309	return (strcoll(s[0], s[1]));
310}
311
312char *cclenter(const char *argp)	/* add a character class */
313{
314	int i, c, c2;
315	int j;
316	uschar *p = (uschar *) argp;
317	uschar *op, *bp;
318	static uschar *buf = NULL;
319	static int bufsz = 100;
320
321	op = p;
322	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
323		FATAL("out of space for character class [%.10s...] 1", p);
324	bp = buf;
325	for (i = 0; (c = *p++) != 0; ) {
326		if (c == '\\') {
327			c = quoted(&p);
328		} else if (c == '-' && i > 0 && bp[-1] != 0) {
329			if (*p != 0) {
330				c = bp[-1];
331				c2 = *p++;
332				if (c2 == '\\')
333					c2 = quoted(&p);
334				if (collate_range_cmp(c, c2) > 0) {
335					bp--;
336					i--;
337					continue;
338				}
339				for (j = 0; j < NCHARS; j++) {
340					if ((collate_range_cmp(c, j) > 0) ||
341					    collate_range_cmp(j, c2) > 0)
342						continue;
343					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
344						FATAL("out of space for character class [%.10s...] 2", p);
345					*bp++ = j;
346					i++;
347				}
348				continue;
349			}
350		}
351		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
352			FATAL("out of space for character class [%.10s...] 3", p);
353		*bp++ = c;
354		i++;
355	}
356	*bp = 0;
357	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
358	xfree(op);
359	return (char *) tostring((char *) buf);
360}
361
362void overflo(const char *s)
363{
364	FATAL("regular expression too big: %.30s...", s);
365}
366
367void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
368{
369	int i;
370	int *p;
371
372	switch (type(v)) {
373	ELEAF
374	LEAF
375		f->re[info(v)].ltype = type(v);
376		f->re[info(v)].lval.np = right(v);
377		while (f->accept >= maxsetvec) {	/* guessing here! */
378			maxsetvec *= 4;
379			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
380			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
381			if (setvec == NULL || tmpset == NULL)
382				overflo("out of space in cfoll()");
383		}
384		for (i = 0; i <= f->accept; i++)
385			setvec[i] = 0;
386		setcnt = 0;
387		follow(v);	/* computes setvec and setcnt */
388		if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
389			overflo("out of space building follow set");
390		f->re[info(v)].lfollow = p;
391		*p = setcnt;
392		for (i = f->accept; i >= 0; i--)
393			if (setvec[i] == 1)
394				*++p = i;
395		break;
396	UNARY
397		cfoll(f,left(v));
398		break;
399	case CAT:
400	case OR:
401		cfoll(f,left(v));
402		cfoll(f,right(v));
403		break;
404	default:	/* can't happen */
405		FATAL("can't happen: unknown type %d in cfoll", type(v));
406	}
407}
408
409int first(Node *p)	/* collects initially active leaves of p into setvec */
410			/* returns 0 if p matches empty string */
411{
412	int b, lp;
413
414	switch (type(p)) {
415	ELEAF
416	LEAF
417		lp = info(p);	/* look for high-water mark of subscripts */
418		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
419			maxsetvec *= 4;
420			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
421			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
422			if (setvec == NULL || tmpset == NULL)
423				overflo("out of space in first()");
424		}
425		if (type(p) == EMPTYRE) {
426			setvec[lp] = 0;
427			return(0);
428		}
429		if (setvec[lp] != 1) {
430			setvec[lp] = 1;
431			setcnt++;
432		}
433		if (type(p) == CCL && (*(char *) right(p)) == '\0')
434			return(0);		/* empty CCL */
435		else return(1);
436	case PLUS:
437		if (first(left(p)) == 0) return(0);
438		return(1);
439	case STAR:
440	case QUEST:
441		first(left(p));
442		return(0);
443	case CAT:
444		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
445		return(1);
446	case OR:
447		b = first(right(p));
448		if (first(left(p)) == 0 || b == 0) return(0);
449		return(1);
450	}
451	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
452	return(-1);
453}
454
455void follow(Node *v)	/* collects leaves that can follow v into setvec */
456{
457	Node *p;
458
459	if (type(v) == FINAL)
460		return;
461	p = parent(v);
462	switch (type(p)) {
463	case STAR:
464	case PLUS:
465		first(v);
466		follow(p);
467		return;
468
469	case OR:
470	case QUEST:
471		follow(p);
472		return;
473
474	case CAT:
475		if (v == left(p)) {	/* v is left child of p */
476			if (first(right(p)) == 0) {
477				follow(p);
478				return;
479			}
480		} else		/* v is right child */
481			follow(p);
482		return;
483	}
484}
485
486int member(int c, const char *sarg)	/* is c in s? */
487{
488	uschar *s = (uschar *) sarg;
489
490	while (*s)
491		if (c == *s++)
492			return(1);
493	return(0);
494}
495
496int match(fa *f, const char *p0)	/* shortest match ? */
497{
498	int s, ns;
499	uschar *p = (uschar *) p0;
500
501	s = f->reset ? makeinit(f,0) : f->initstat;
502	if (f->out[s])
503		return(1);
504	do {
505		/* assert(*p < NCHARS); */
506		if ((ns = f->gototab[s][*p]) != 0)
507			s = ns;
508		else
509			s = cgoto(f, s, *p);
510		if (f->out[s])
511			return(1);
512	} while (*p++ != 0);
513	return(0);
514}
515
516int pmatch(fa *f, const char *p0)	/* longest match, for sub */
517{
518	int s, ns;
519	uschar *p = (uschar *) p0;
520	uschar *q;
521	int i, k;
522
523	/* s = f->reset ? makeinit(f,1) : f->initstat; */
524	if (f->reset) {
525		f->initstat = s = makeinit(f,1);
526	} else {
527		s = f->initstat;
528	}
529	patbeg = (char *) p;
530	patlen = -1;
531	do {
532		q = p;
533		do {
534			if (f->out[s])		/* final state */
535				patlen = q-p;
536			/* assert(*q < NCHARS); */
537			if ((ns = f->gototab[s][*q]) != 0)
538				s = ns;
539			else
540				s = cgoto(f, s, *q);
541			if (s == 1) {	/* no transition */
542				if (patlen >= 0) {
543					patbeg = (char *) p;
544					return(1);
545				}
546				else
547					goto nextin;	/* no match */
548			}
549		} while (*q++ != 0);
550		if (f->out[s])
551			patlen = q-p-1;	/* don't count $ */
552		if (patlen >= 0) {
553			patbeg = (char *) p;
554			return(1);
555		}
556	nextin:
557		s = 2;
558		if (f->reset) {
559			for (i = 2; i <= f->curstat; i++)
560				xfree(f->posns[i]);
561			k = *f->posns[0];
562			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
563				overflo("out of space in pmatch");
564			for (i = 0; i <= k; i++)
565				(f->posns[2])[i] = (f->posns[0])[i];
566			f->initstat = f->curstat = 2;
567			f->out[2] = f->out[0];
568			for (i = 0; i < NCHARS; i++)
569				f->gototab[2][i] = 0;
570		}
571	} while (*p++ != 0);
572	return (0);
573}
574
575int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
576{
577	int s, ns;
578	uschar *p = (uschar *) p0;
579	uschar *q;
580	int i, k;
581
582	/* s = f->reset ? makeinit(f,1) : f->initstat; */
583	if (f->reset) {
584		f->initstat = s = makeinit(f,1);
585	} else {
586		s = f->initstat;
587	}
588	patlen = -1;
589	while (*p) {
590		q = p;
591		do {
592			if (f->out[s])		/* final state */
593				patlen = q-p;
594			/* assert(*q < NCHARS); */
595			if ((ns = f->gototab[s][*q]) != 0)
596				s = ns;
597			else
598				s = cgoto(f, s, *q);
599			if (s == 1) {	/* no transition */
600				if (patlen > 0) {
601					patbeg = (char *) p;
602					return(1);
603				} else
604					goto nnextin;	/* no nonempty match */
605			}
606		} while (*q++ != 0);
607		if (f->out[s])
608			patlen = q-p-1;	/* don't count $ */
609		if (patlen > 0 ) {
610			patbeg = (char *) p;
611			return(1);
612		}
613	nnextin:
614		s = 2;
615		if (f->reset) {
616			for (i = 2; i <= f->curstat; i++)
617				xfree(f->posns[i]);
618			k = *f->posns[0];
619			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
620				overflo("out of state space");
621			for (i = 0; i <= k; i++)
622				(f->posns[2])[i] = (f->posns[0])[i];
623			f->initstat = f->curstat = 2;
624			f->out[2] = f->out[0];
625			for (i = 0; i < NCHARS; i++)
626				f->gototab[2][i] = 0;
627		}
628		p++;
629	}
630	return (0);
631}
632
633Node *reparse(const char *p)	/* parses regular expression pointed to by p */
634{			/* uses relex() to scan regular expression */
635	Node *np;
636
637	dprintf( ("reparse <%s>\n", p) );
638	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
639	rtok = relex();
640	/* GNU compatibility: an empty regexp matches anything */
641	if (rtok == '\0') {
642		/* FATAL("empty regular expression"); previous */
643		return(op2(EMPTYRE, NIL, NIL));
644	}
645	np = regexp();
646	if (rtok != '\0')
647		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
648	return(np);
649}
650
651Node *regexp(void)	/* top-level parse of reg expr */
652{
653	return (alt(concat(primary())));
654}
655
656Node *primary(void)
657{
658	Node *np;
659	int savelastatom;
660
661	switch (rtok) {
662	case CHAR:
663		lastatom = starttok;
664		np = op2(CHAR, NIL, itonp(rlxval));
665		rtok = relex();
666		return (unary(np));
667	case ALL:
668		rtok = relex();
669		return (unary(op2(ALL, NIL, NIL)));
670	case EMPTYRE:
671		rtok = relex();
672		return (unary(op2(EMPTYRE, NIL, NIL)));
673	case DOT:
674		lastatom = starttok;
675		rtok = relex();
676		return (unary(op2(DOT, NIL, NIL)));
677	case CCL:
678		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
679		lastatom = starttok;
680		rtok = relex();
681		return (unary(np));
682	case NCCL:
683		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
684		lastatom = starttok;
685		rtok = relex();
686		return (unary(np));
687	case '^':
688		rtok = relex();
689		return (unary(op2(CHAR, NIL, itonp(HAT))));
690	case '$':
691		rtok = relex();
692		return (unary(op2(CHAR, NIL, NIL)));
693	case '(':
694		lastatom = starttok;
695		savelastatom = starttok - basestr; /* Retain over recursion */
696		rtok = relex();
697		if (rtok == ')') {	/* special pleading for () */
698			rtok = relex();
699			return unary(op2(CCL, NIL, (Node *) tostring("")));
700		}
701		np = regexp();
702		if (rtok == ')') {
703			lastatom = basestr + savelastatom; /* Restore */
704			rtok = relex();
705			return (unary(np));
706		}
707		else
708			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
709	default:
710		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
711	}
712	return 0;	/*NOTREACHED*/
713}
714
715Node *concat(Node *np)
716{
717	switch (rtok) {
718	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
719		return (concat(op2(CAT, np, primary())));
720	case EMPTYRE:
721		rtok = relex();
722		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
723				primary())));
724	}
725	return (np);
726}
727
728Node *alt(Node *np)
729{
730	if (rtok == OR) {
731		rtok = relex();
732		return (alt(op2(OR, np, concat(primary()))));
733	}
734	return (np);
735}
736
737Node *unary(Node *np)
738{
739	switch (rtok) {
740	case STAR:
741		rtok = relex();
742		return (unary(op2(STAR, np, NIL)));
743	case PLUS:
744		rtok = relex();
745		return (unary(op2(PLUS, np, NIL)));
746	case QUEST:
747		rtok = relex();
748		return (unary(op2(QUEST, np, NIL)));
749	default:
750		return (np);
751	}
752}
753
754/*
755 * Character class definitions conformant to the POSIX locale as
756 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
757 * and operating character sets are both ASCII (ISO646) or supersets
758 * thereof.
759 *
760 * Note that to avoid overflowing the temporary buffer used in
761 * relex(), the expanded character class (prior to range expansion)
762 * must be less than twice the size of their full name.
763 */
764
765/* Because isblank doesn't show up in any of the header files on any
766 * system i use, it's defined here.  if some other locale has a richer
767 * definition of "blank", define HAS_ISBLANK and provide your own
768 * version.
769 * the parentheses here are an attempt to find a path through the maze
770 * of macro definition and/or function and/or version provided.  thanks
771 * to nelson beebe for the suggestion; let's see if it works everywhere.
772 */
773
774/* #define HAS_ISBLANK */
775#ifndef HAS_ISBLANK
776
777int (xisblank)(int c)
778{
779	return c==' ' || c=='\t';
780}
781
782#endif
783
784struct charclass {
785	const char *cc_name;
786	int cc_namelen;
787	int (*cc_func)(int);
788} charclasses[] = {
789	{ "alnum",	5,	isalnum },
790	{ "alpha",	5,	isalpha },
791#ifndef HAS_ISBLANK
792	{ "blank",	5,	xisblank },
793#else
794	{ "blank",	5,	isblank },
795#endif
796	{ "cntrl",	5,	iscntrl },
797	{ "digit",	5,	isdigit },
798	{ "graph",	5,	isgraph },
799	{ "lower",	5,	islower },
800	{ "print",	5,	isprint },
801	{ "punct",	5,	ispunct },
802	{ "space",	5,	isspace },
803	{ "upper",	5,	isupper },
804	{ "xdigit",	6,	isxdigit },
805	{ NULL,		0,	NULL },
806};
807
808#define REPEAT_SIMPLE		0
809#define REPEAT_PLUS_APPENDED	1
810#define REPEAT_WITH_Q		2
811#define REPEAT_ZERO		3
812
813static int
814replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
815	       int atomlen, int firstnum, int secondnum, int special_case)
816{
817	int i, j;
818	uschar *buf = 0;
819	int ret = 1;
820	int init_q = (firstnum==0);		/* first added char will be ? */
821	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
822	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
823	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
824	int size = prefix_length +  suffix_length;
825
826	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
827		size += atomlen*(firstnum-1);
828	}
829
830	/* Adjust size of buffer for special cases */
831	if (special_case == REPEAT_PLUS_APPENDED) {
832		size++;		/* for the final + */
833	} else if (special_case == REPEAT_WITH_Q) {
834		size += init_q + (atomlen+1)* n_q_reps;
835	} else if (special_case == REPEAT_ZERO) {
836		size += 2;	/* just a null ERE: () */
837	}
838	if ((buf = (uschar *) malloc(size+1)) == NULL)
839		FATAL("out of space in reg expr %.10s..", lastre);
840	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
841	j = prefix_length;
842	if (special_case == REPEAT_ZERO) {
843		j -= atomlen;
844		buf[j++] = '(';
845		buf[j++] = ')';
846	}
847	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
848		memcpy(&buf[j], atom, atomlen);
849		j += atomlen;
850	}
851	if (special_case == REPEAT_PLUS_APPENDED) {
852		buf[j++] = '+';
853	} else if (special_case == REPEAT_WITH_Q) {
854		if (init_q) buf[j++] = '?';
855		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
856			memcpy(&buf[j], atom, atomlen);
857			j += atomlen;
858			buf[j++] = '?';
859		}
860	}
861	memcpy(&buf[j], reptok+reptoklen, suffix_length);
862	if (special_case == REPEAT_ZERO) {
863		buf[j+suffix_length] = '\0';
864	} else {
865		buf[size] = '\0';
866	}
867	/* free old basestr */
868	if (firstbasestr != basestr) {
869		if (basestr)
870			xfree(basestr);
871	}
872	basestr = buf;
873	prestr  = buf + prefix_length;
874	if (special_case == REPEAT_ZERO) {
875		prestr  -= atomlen;
876		ret++;
877	}
878	return ret;
879}
880
881static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
882		  int atomlen, int firstnum, int secondnum)
883{
884	/*
885	   In general, the repetition specifier or "bound" is replaced here
886	   by an equivalent ERE string, repeating the immediately previous atom
887	   and appending ? and + as needed. Note that the first copy of the
888	   atom is left in place, except in the special_case of a zero-repeat
889	   (i.e., {0}).
890	 */
891	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
892		if (firstnum < 2) {
893			/* 0 or 1: should be handled before you get here */
894			FATAL("internal error");
895		} else {
896			return replace_repeat(reptok, reptoklen, atom, atomlen,
897				firstnum, secondnum, REPEAT_PLUS_APPENDED);
898		}
899	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
900		if (firstnum == 0) {	/* {0} or {0,0} */
901			/* This case is unusual because the resulting
902			   replacement string might actually be SMALLER than
903			   the original ERE */
904			return replace_repeat(reptok, reptoklen, atom, atomlen,
905					firstnum, secondnum, REPEAT_ZERO);
906		} else {		/* (firstnum >= 1) */
907			return replace_repeat(reptok, reptoklen, atom, atomlen,
908					firstnum, secondnum, REPEAT_SIMPLE);
909		}
910	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
911		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
912		return replace_repeat(reptok, reptoklen, atom, atomlen,
913					firstnum, secondnum, REPEAT_WITH_Q);
914	} else {	/* Error - shouldn't be here (n>m) */
915		FATAL("internal error");
916	}
917	return 0;
918}
919
920int relex(void)		/* lexical analyzer for reparse */
921{
922	int c, n;
923	int cflag;
924	static uschar *buf = NULL;
925	static int bufsz = 100;
926	uschar *bp;
927	struct charclass *cc;
928	int i;
929	int num, m, commafound, digitfound;
930	const uschar *startreptok;
931
932rescan:
933	starttok = prestr;
934
935	switch (c = *prestr++) {
936	case '|': return OR;
937	case '*': return STAR;
938	case '+': return PLUS;
939	case '?': return QUEST;
940	case '.': return DOT;
941	case '\0': prestr--; return '\0';
942	case '^':
943	case '$':
944	case '(':
945	case ')':
946		return c;
947	case '\\':
948		rlxval = quoted(&prestr);
949		return CHAR;
950	default:
951		rlxval = c;
952		return CHAR;
953	case '[':
954		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
955			FATAL("out of space in reg expr %.10s..", lastre);
956		bp = buf;
957		if (*prestr == '^') {
958			cflag = 1;
959			prestr++;
960		}
961		else
962			cflag = 0;
963		n = 2 * strlen((const char *) prestr)+1;
964		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
965			FATAL("out of space for reg expr %.10s...", lastre);
966		for (; ; ) {
967			if ((c = *prestr++) == '\\') {
968				*bp++ = '\\';
969				if ((c = *prestr++) == '\0')
970					FATAL("nonterminated character class %.20s...", lastre);
971				*bp++ = c;
972			/* } else if (c == '\n') { */
973			/* 	FATAL("newline in character class %.20s...", lastre); */
974			} else if (c == '[' && *prestr == ':') {
975				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
976				for (cc = charclasses; cc->cc_name; cc++)
977					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
978						break;
979				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
980				    prestr[2 + cc->cc_namelen] == ']') {
981					prestr += cc->cc_namelen + 3;
982					/*
983					 * BUG: We begin at 1, instead of 0, since we
984					 * would otherwise prematurely terminate the
985					 * string for classes like [[:cntrl:]]. This
986					 * means that we can't match the NUL character,
987					 * not without first adapting the entire
988					 * program to track each string's length.
989					 */
990					for (i = 1; i <= UCHAR_MAX; i++) {
991						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
992						    FATAL("out of space for reg expr %.10s...", lastre);
993						if (cc->cc_func(i)) {
994							*bp++ = i;
995							n++;
996						}
997					}
998				} else
999					*bp++ = c;
1000			} else if (c == '[' && *prestr == '.') {
1001				char collate_char;
1002				prestr++;
1003				collate_char = *prestr++;
1004				if (*prestr == '.' && prestr[1] == ']') {
1005					prestr += 2;
1006					/* Found it: map via locale TBD: for
1007					   now, simply return this char.  This
1008					   is sufficient to pass conformance
1009					   test awk.ex 156
1010					 */
1011					if (*prestr == ']') {
1012						prestr++;
1013						rlxval = collate_char;
1014						return CHAR;
1015					}
1016				}
1017			} else if (c == '[' && *prestr == '=') {
1018				char equiv_char;
1019				prestr++;
1020				equiv_char = *prestr++;
1021				if (*prestr == '=' && prestr[1] == ']') {
1022					prestr += 2;
1023					/* Found it: map via locale TBD: for now
1024					   simply return this char. This is
1025					   sufficient to pass conformance test
1026					   awk.ex 156
1027					 */
1028					if (*prestr == ']') {
1029						prestr++;
1030						rlxval = equiv_char;
1031						return CHAR;
1032					}
1033				}
1034			} else if (c == '\0') {
1035				FATAL("nonterminated character class %.20s", lastre);
1036			} else if (bp == buf) {	/* 1st char is special */
1037				*bp++ = c;
1038			} else if (c == ']') {
1039				*bp++ = 0;
1040				rlxstr = (uschar *) tostring((char *) buf);
1041				if (cflag == 0)
1042					return CCL;
1043				else
1044					return NCCL;
1045			} else
1046				*bp++ = c;
1047		}
1048		break;
1049	case '{':
1050		if (isdigit(*(prestr))) {
1051			num = 0;	/* Process as a repetition */
1052			n = -1; m = -1;
1053			commafound = 0;
1054			digitfound = 0;
1055			startreptok = prestr-1;
1056			/* Remember start of previous atom here ? */
1057		} else {        	/* just a { char, not a repetition */
1058			rlxval = c;
1059			return CHAR;
1060                }
1061		for (; ; ) {
1062			if ((c = *prestr++) == '}') {
1063				if (commafound) {
1064					if (digitfound) { /* {n,m} */
1065						m = num;
1066						if (m<n)
1067							FATAL("illegal repetition expression: class %.20s",
1068								lastre);
1069						if ((n==0) && (m==1)) {
1070							return QUEST;
1071						}
1072					} else {	/* {n,} */
1073						if (n==0) return STAR;
1074						if (n==1) return PLUS;
1075					}
1076				} else {
1077					if (digitfound) { /* {n} same as {n,n} */
1078						n = num;
1079						m = num;
1080					} else {	/* {} */
1081						FATAL("illegal repetition expression: class %.20s",
1082							lastre);
1083					}
1084				}
1085				if (repeat(starttok, prestr-starttok, lastatom,
1086					   startreptok - lastatom, n, m) > 0) {
1087					if ((n==0) && (m==0)) {
1088						return EMPTYRE;
1089					}
1090					/* must rescan input for next token */
1091					goto rescan;
1092				}
1093				/* Failed to replace: eat up {...} characters
1094				   and treat like just PLUS */
1095				return PLUS;
1096			} else if (c == '\0') {
1097				FATAL("nonterminated character class %.20s",
1098					lastre);
1099			} else if (isdigit(c)) {
1100				num = 10 * num + c - '0';
1101				digitfound = 1;
1102			} else if (c == ',') {
1103				if (commafound)
1104					FATAL("illegal repetition expression: class %.20s",
1105						lastre);
1106				/* looking for {n,} or {n,m} */
1107				commafound = 1;
1108				n = num;
1109				digitfound = 0; /* reset */
1110				num = 0;
1111			} else {
1112				FATAL("illegal repetition expression: class %.20s",
1113					lastre);
1114			}
1115		}
1116		break;
1117	}
1118}
1119
1120int cgoto(fa *f, int s, int c)
1121{
1122	int i, j, k;
1123	int *p, *q;
1124
1125	assert(c == HAT || c < NCHARS);
1126	while (f->accept >= maxsetvec) {	/* guessing here! */
1127		maxsetvec *= 4;
1128		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1129		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1130		if (setvec == NULL || tmpset == NULL)
1131			overflo("out of space in cgoto()");
1132	}
1133	for (i = 0; i <= f->accept; i++)
1134		setvec[i] = 0;
1135	setcnt = 0;
1136	/* compute positions of gototab[s,c] into setvec */
1137	p = f->posns[s];
1138	for (i = 1; i <= *p; i++) {
1139		if ((k = f->re[p[i]].ltype) != FINAL) {
1140			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1141			 || (k == DOT && c != 0 && c != HAT)
1142			 || (k == ALL && c != 0)
1143			 || (k == EMPTYRE && c != 0)
1144			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1145			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1146				q = f->re[p[i]].lfollow;
1147				for (j = 1; j <= *q; j++) {
1148					if (q[j] >= maxsetvec) {
1149						maxsetvec *= 4;
1150						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1151						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1152						if (setvec == NULL || tmpset == NULL)
1153							overflo("cgoto overflow");
1154					}
1155					if (setvec[q[j]] == 0) {
1156						setcnt++;
1157						setvec[q[j]] = 1;
1158					}
1159				}
1160			}
1161		}
1162	}
1163	/* determine if setvec is a previous state */
1164	tmpset[0] = setcnt;
1165	j = 1;
1166	for (i = f->accept; i >= 0; i--)
1167		if (setvec[i]) {
1168			tmpset[j++] = i;
1169		}
1170	/* tmpset == previous state? */
1171	for (i = 1; i <= f->curstat; i++) {
1172		p = f->posns[i];
1173		if ((k = tmpset[0]) != p[0])
1174			goto different;
1175		for (j = 1; j <= k; j++)
1176			if (tmpset[j] != p[j])
1177				goto different;
1178		/* setvec is state i */
1179		f->gototab[s][c] = i;
1180		return i;
1181	  different:;
1182	}
1183
1184	/* add tmpset to current set of states */
1185	if (f->curstat >= NSTATES-1) {
1186		f->curstat = 2;
1187		f->reset = 1;
1188		for (i = 2; i < NSTATES; i++)
1189			xfree(f->posns[i]);
1190	} else
1191		++(f->curstat);
1192	for (i = 0; i < NCHARS; i++)
1193		f->gototab[f->curstat][i] = 0;
1194	xfree(f->posns[f->curstat]);
1195	if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1196		overflo("out of space in cgoto");
1197
1198	f->posns[f->curstat] = p;
1199	f->gototab[s][c] = f->curstat;
1200	for (i = 0; i <= setcnt; i++)
1201		p[i] = tmpset[i];
1202	if (setvec[f->accept])
1203		f->out[f->curstat] = 1;
1204	else
1205		f->out[f->curstat] = 0;
1206	return f->curstat;
1207}
1208
1209
1210void freefa(fa *f)	/* free a finite automaton */
1211{
1212	int i;
1213
1214	if (f == NULL)
1215		return;
1216	for (i = 0; i <= f->curstat; i++)
1217		xfree(f->posns[i]);
1218	for (i = 0; i <= f->accept; i++) {
1219		xfree(f->re[i].lfollow);
1220		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1221			xfree((f->re[i].lval.np));
1222	}
1223	xfree(f->restr);
1224	xfree(f);
1225}
1226