1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#if HAVE_NBTOOL_CONFIG_H
28#include "nbtool_config.h"
29#endif
30
31#define	DEBUG
32
33#include <ctype.h>
34#include <limits.h>
35#include <stdio.h>
36#include <string.h>
37#include <stdlib.h>
38#include "awk.h"
39#include "awkgram.h"
40
41#define MAXLIN 22
42
43#define type(v)		(v)->nobj	/* badly overloaded here */
44#define info(v)		(v)->ntype	/* badly overloaded here */
45#define left(v)		(v)->narg[0]
46#define right(v)	(v)->narg[1]
47#define parent(v)	(v)->nnext
48
49#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50#define ELEAF	case EMPTYRE:		/* empty string in regexp */
51#define UNARY	case STAR: case PLUS: case QUEST:
52
53/* encoding in tree Nodes:
54	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55		left is index, right contains value or pointer to value
56	unary (STAR, PLUS, QUEST): left is child, right is null
57	binary (CAT, OR): left and right are children
58	parent contains pointer to parent
59*/
60
61
62int	*setvec;
63int	*tmpset;
64int	maxsetvec = 0;
65
66int	rtok;		/* next token in current re */
67int	rlxval;
68static const uschar	*rlxstr;
69static const uschar	*prestr;	/* current position in current re */
70static const uschar	*lastre;	/* origin of last re */
71static const uschar	*lastatom;	/* origin of last Atom */
72static const uschar	*starttok;
73static const uschar 	*basestr;	/* starts with original, replaced during
74				   repetition processing */
75static const uschar 	*firstbasestr;
76
77static	int setcnt;
78static	int poscnt;
79
80const char	*patbeg;
81int	patlen;
82
83#define	NFA	128	/* cache this many dynamic fa's */
84fa	*fatab[NFA];
85int	nfatab	= 0;	/* entries in fatab */
86
87static int *
88intalloc(size_t n, const char *f)
89{
90	void *p = calloc(n, sizeof(int));
91	if (p == NULL)
92		overflo(f);
93	return p;
94}
95
96static void
97resizesetvec(const char *f)
98{
99	if (maxsetvec == 0)
100		maxsetvec = MAXLIN;
101	else
102		maxsetvec *= 4;
103	setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
104	tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
105	if (setvec == NULL || tmpset == NULL)
106		overflo(f);
107}
108
109static void
110resize_state(fa *f, int state)
111{
112	void *p;
113	int i, new_count;
114
115	if (++state < f->state_count)
116		return;
117
118	new_count = state + 10; /* needs to be tuned */
119
120	p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
121	if (p == NULL)
122		goto out;
123	f->gototab = p;
124
125	p = realloc(f->out, new_count * sizeof(f->out[0]));
126	if (p == NULL)
127		goto out;
128	f->out = p;
129
130	p = realloc(f->posns, new_count * sizeof(f->posns[0]));
131	if (p == NULL)
132		goto out;
133	f->posns = p;
134
135	for (i = f->state_count; i < new_count; ++i) {
136		f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
137		if (f->gototab[i] == NULL)
138			goto out;
139		f->out[i]  = 0;
140		f->posns[i] = NULL;
141	}
142	f->state_count = new_count;
143	return;
144out:
145	overflo(__func__);
146}
147
148fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
149{
150	int i, use, nuse;
151	fa *pfa;
152	static int now = 1;
153
154	if (setvec == NULL) {	/* first time through any RE */
155		resizesetvec(__func__);
156	}
157
158	if (compile_time != RUNNING)	/* a constant for sure */
159		return mkdfa(s, anchor);
160	for (i = 0; i < nfatab; i++)	/* is it there already? */
161		if (fatab[i]->anchor == anchor
162		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
163			fatab[i]->use = now++;
164			return fatab[i];
165		}
166	pfa = mkdfa(s, anchor);
167	if (nfatab < NFA) {	/* room for another */
168		fatab[nfatab] = pfa;
169		fatab[nfatab]->use = now++;
170		nfatab++;
171		return pfa;
172	}
173	use = fatab[0]->use;	/* replace least-recently used */
174	nuse = 0;
175	for (i = 1; i < nfatab; i++)
176		if (fatab[i]->use < use) {
177			use = fatab[i]->use;
178			nuse = i;
179		}
180	freefa(fatab[nuse]);
181	fatab[nuse] = pfa;
182	pfa->use = now++;
183	return pfa;
184}
185
186fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
187				/* anchor = true for anchored matches, else false */
188{
189	Node *p, *p1;
190	fa *f;
191
192	firstbasestr = (const uschar *) s;
193	basestr = firstbasestr;
194	p = reparse(s);
195	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
196		/* put ALL STAR in front of reg.  exp. */
197	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
198		/* put FINAL after reg.  exp. */
199
200	poscnt = 0;
201	penter(p1);	/* enter parent pointers and leaf indices */
202	if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
203		overflo(__func__);
204	f->accept = poscnt-1;	/* penter has computed number of positions in re */
205	cfoll(f, p1);	/* set up follow sets */
206	freetr(p1);
207	resize_state(f, 1);
208	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
209	f->posns[1] = intalloc(1, __func__);
210	*f->posns[1] = 0;
211	f->initstat = makeinit(f, anchor);
212	f->anchor = anchor;
213	f->restr = (uschar *) tostring(s);
214	if (firstbasestr != basestr) {
215		if (basestr)
216			xfree(basestr);
217	}
218	return f;
219}
220
221int makeinit(fa *f, bool anchor)
222{
223	int i, k;
224
225	f->curstat = 2;
226	f->out[2] = 0;
227	k = *(f->re[0].lfollow);
228	xfree(f->posns[2]);
229	f->posns[2] = intalloc(k + 1,  __func__);
230	for (i = 0; i <= k; i++) {
231		(f->posns[2])[i] = (f->re[0].lfollow)[i];
232	}
233	if ((f->posns[2])[1] == f->accept)
234		f->out[2] = 1;
235	for (i = 0; i < NCHARS; i++)
236		f->gototab[2][i] = 0;
237	f->curstat = cgoto(f, 2, HAT);
238	if (anchor) {
239		*f->posns[2] = k-1;	/* leave out position 0 */
240		for (i = 0; i < k; i++) {
241			(f->posns[0])[i] = (f->posns[2])[i];
242		}
243
244		f->out[0] = f->out[2];
245		if (f->curstat != 2)
246			--(*f->posns[f->curstat]);
247	}
248	return f->curstat;
249}
250
251void penter(Node *p)	/* set up parent pointers and leaf indices */
252{
253	switch (type(p)) {
254	ELEAF
255	LEAF
256		info(p) = poscnt;
257		poscnt++;
258		break;
259	UNARY
260		penter(left(p));
261		parent(left(p)) = p;
262		break;
263	case CAT:
264	case OR:
265		penter(left(p));
266		penter(right(p));
267		parent(left(p)) = p;
268		parent(right(p)) = p;
269		break;
270	case ZERO:
271		break;
272	default:	/* can't happen */
273		FATAL("can't happen: unknown type %d in penter", type(p));
274		break;
275	}
276}
277
278void freetr(Node *p)	/* free parse tree */
279{
280	switch (type(p)) {
281	ELEAF
282	LEAF
283		xfree(p);
284		break;
285	UNARY
286	case ZERO:
287		freetr(left(p));
288		xfree(p);
289		break;
290	case CAT:
291	case OR:
292		freetr(left(p));
293		freetr(right(p));
294		xfree(p);
295		break;
296	default:	/* can't happen */
297		FATAL("can't happen: unknown type %d in freetr", type(p));
298		break;
299	}
300}
301
302/* in the parsing of regular expressions, metacharacters like . have */
303/* to be seen literally;  \056 is not a metacharacter. */
304
305int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
306{			/* only pick up one 8-bit byte (2 chars) */
307	const uschar *p;
308	int n = 0;
309	int i;
310
311	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
312		if (isdigit(*p))
313			n = 16 * n + *p - '0';
314		else if (*p >= 'a' && *p <= 'f')
315			n = 16 * n + *p - 'a' + 10;
316		else if (*p >= 'A' && *p <= 'F')
317			n = 16 * n + *p - 'A' + 10;
318	}
319	*pp = p;
320	return n;
321}
322
323#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
324
325int quoted(const uschar **pp)	/* pick up next thing after a \\ */
326			/* and increment *pp */
327{
328	const uschar *p = *pp;
329	int c;
330
331	if ((c = *p++) == 't')
332		c = '\t';
333	else if (c == 'n')
334		c = '\n';
335	else if (c == 'f')
336		c = '\f';
337	else if (c == 'r')
338		c = '\r';
339	else if (c == 'b')
340		c = '\b';
341	else if (c == 'v')
342		c = '\v';
343	else if (c == 'a')
344		c = '\a';
345	else if (c == '\\')
346		c = '\\';
347	else if (c == 'x') {	/* hexadecimal goo follows */
348		c = hexstr(&p);	/* this adds a null if number is invalid */
349	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
350		int n = c - '0';
351		if (isoctdigit(*p)) {
352			n = 8 * n + *p++ - '0';
353			if (isoctdigit(*p))
354				n = 8 * n + *p++ - '0';
355		}
356		c = n;
357	} /* else */
358		/* c = c; */
359	*pp = p;
360	return c;
361}
362
363char *cclenter(const char *argp)	/* add a character class */
364{
365	int i, c, c2;
366	const uschar *op, *p = (const uschar *) argp;
367	uschar *bp;
368	static uschar *buf = NULL;
369	static int bufsz = 100;
370
371	op = p;
372	if (buf == NULL && (buf = malloc(bufsz)) == NULL)
373		FATAL("out of space for character class [%.10s...] 1", p);
374	bp = buf;
375	for (i = 0; (c = *p++) != 0; ) {
376		if (c == '\\') {
377			c = quoted(&p);
378		} else if (c == '-' && i > 0 && bp[-1] != 0) {
379			if (*p != 0) {
380				c = bp[-1];
381				c2 = *p++;
382				if (c2 == '\\')
383					c2 = quoted(&p);
384				if (c > c2) {	/* empty; ignore */
385					bp--;
386					i--;
387					continue;
388				}
389				while (c < c2) {
390					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
391						FATAL("out of space for character class [%.10s...] 2", p);
392					*bp++ = ++c;
393					i++;
394				}
395				continue;
396			}
397		}
398		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
399			FATAL("out of space for character class [%.10s...] 3", p);
400		*bp++ = c;
401		i++;
402	}
403	*bp = 0;
404	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
405	xfree(op);
406	return (char *) tostring((char *) buf);
407}
408
409void overflo(const char *s)
410{
411	FATAL("regular expression too big: out of space in %.30s...", s);
412}
413
414void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
415{
416	int i;
417	int *p;
418
419	switch (type(v)) {
420	ELEAF
421	LEAF
422		f->re[info(v)].ltype = type(v);
423		f->re[info(v)].lval.np = right(v);
424		while (f->accept >= maxsetvec) {	/* guessing here! */
425			resizesetvec(__func__);
426		}
427		for (i = 0; i <= f->accept; i++)
428			setvec[i] = 0;
429		setcnt = 0;
430		follow(v);	/* computes setvec and setcnt */
431		p = intalloc(setcnt + 1, __func__);
432		f->re[info(v)].lfollow = p;
433		*p = setcnt;
434		for (i = f->accept; i >= 0; i--)
435			if (setvec[i] == 1)
436				*++p = i;
437		break;
438	UNARY
439		cfoll(f,left(v));
440		break;
441	case CAT:
442	case OR:
443		cfoll(f,left(v));
444		cfoll(f,right(v));
445		break;
446	case ZERO:
447		break;
448	default:	/* can't happen */
449		FATAL("can't happen: unknown type %d in cfoll", type(v));
450	}
451}
452
453int first(Node *p)	/* collects initially active leaves of p into setvec */
454			/* returns 0 if p matches empty string */
455{
456	int b, lp;
457
458	switch (type(p)) {
459	ELEAF
460	LEAF
461		lp = info(p);	/* look for high-water mark of subscripts */
462		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
463			resizesetvec(__func__);
464		}
465		if (type(p) == EMPTYRE) {
466			setvec[lp] = 0;
467			return(0);
468		}
469		if (setvec[lp] != 1) {
470			setvec[lp] = 1;
471			setcnt++;
472		}
473		if (type(p) == CCL && (*(char *) right(p)) == '\0')
474			return(0);		/* empty CCL */
475		return(1);
476	case PLUS:
477		if (first(left(p)) == 0)
478			return(0);
479		return(1);
480	case STAR:
481	case QUEST:
482		first(left(p));
483		return(0);
484	case CAT:
485		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
486		return(1);
487	case OR:
488		b = first(right(p));
489		if (first(left(p)) == 0 || b == 0) return(0);
490		return(1);
491	case ZERO:
492		return 0;
493	}
494	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
495	return(-1);
496}
497
498void follow(Node *v)	/* collects leaves that can follow v into setvec */
499{
500	Node *p;
501
502	if (type(v) == FINAL)
503		return;
504	p = parent(v);
505	switch (type(p)) {
506	case STAR:
507	case PLUS:
508		first(v);
509		follow(p);
510		return;
511
512	case OR:
513	case QUEST:
514		follow(p);
515		return;
516
517	case CAT:
518		if (v == left(p)) {	/* v is left child of p */
519			if (first(right(p)) == 0) {
520				follow(p);
521				return;
522			}
523		} else		/* v is right child */
524			follow(p);
525		return;
526	}
527}
528
529int member(int c, const char *sarg)	/* is c in s? */
530{
531	const uschar *s = (const uschar *) sarg;
532
533	while (*s)
534		if (c == *s++)
535			return(1);
536	return(0);
537}
538
539int match(fa *f, const char *p0)	/* shortest match ? */
540{
541	int s, ns;
542	const uschar *p = (const uschar *) p0;
543
544	s = f->initstat;
545	assert (s < f->state_count);
546
547	if (f->out[s])
548		return(1);
549	do {
550		/* assert(*p < NCHARS); */
551		if ((ns = f->gototab[s][*p]) != 0)
552			s = ns;
553		else
554			s = cgoto(f, s, *p);
555		if (f->out[s])
556			return(1);
557	} while (*p++ != 0);
558	return(0);
559}
560
561int pmatch(fa *f, const char *p0)	/* longest match, for sub */
562{
563	int s, ns;
564	const uschar *p = (const uschar *) p0;
565	const uschar *q;
566
567	s = f->initstat;
568	assert(s < f->state_count);
569
570	patbeg = (const char *)p;
571	patlen = -1;
572	do {
573		q = p;
574		do {
575			if (f->out[s])		/* final state */
576				patlen = q-p;
577			/* assert(*q < NCHARS); */
578			if ((ns = f->gototab[s][*q]) != 0)
579				s = ns;
580			else
581				s = cgoto(f, s, *q);
582
583			assert(s < f->state_count);
584
585			if (s == 1) {	/* no transition */
586				if (patlen >= 0) {
587					patbeg = (const char *) p;
588					return(1);
589				}
590				else
591					goto nextin;	/* no match */
592			}
593		} while (*q++ != 0);
594		if (f->out[s])
595			patlen = q-p-1;	/* don't count $ */
596		if (patlen >= 0) {
597			patbeg = (const char *) p;
598			return(1);
599		}
600	nextin:
601		s = 2;
602	} while (*p++);
603	return (0);
604}
605
606int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
607{
608	int s, ns;
609	const uschar *p = (const uschar *) p0;
610	const uschar *q;
611
612	s = f->initstat;
613	assert(s < f->state_count);
614
615	patbeg = (const char *)p;
616	patlen = -1;
617	while (*p) {
618		q = p;
619		do {
620			if (f->out[s])		/* final state */
621				patlen = q-p;
622			/* assert(*q < NCHARS); */
623			if ((ns = f->gototab[s][*q]) != 0)
624				s = ns;
625			else
626				s = cgoto(f, s, *q);
627			if (s == 1) {	/* no transition */
628				if (patlen > 0) {
629					patbeg = (const char *) p;
630					return(1);
631				} else
632					goto nnextin;	/* no nonempty match */
633			}
634		} while (*q++ != 0);
635		if (f->out[s])
636			patlen = q-p-1;	/* don't count $ */
637		if (patlen > 0 ) {
638			patbeg = (const char *) p;
639			return(1);
640		}
641	nnextin:
642		s = 2;
643		p++;
644	}
645	return (0);
646}
647
648
649/*
650 * NAME
651 *     fnematch
652 *
653 * DESCRIPTION
654 *     A stream-fed version of nematch which transfers characters to a
655 *     null-terminated buffer. All characters up to and including the last
656 *     character of the matching text or EOF are placed in the buffer. If
657 *     a match is found, patbeg and patlen are set appropriately.
658 *
659 * RETURN VALUES
660 *     false    No match found.
661 *     true     Match found.
662 */
663
664bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
665{
666	char *buf = *pbuf;
667	int bufsize = *pbufsize;
668	int c, i, j, k, ns, s;
669
670	s = pfa->initstat;
671	patlen = 0;
672
673	/*
674	 * All indices relative to buf.
675	 * i <= j <= k <= bufsize
676	 *
677	 * i: origin of active substring
678	 * j: current character
679	 * k: destination of next getc()
680	 */
681	i = -1, k = 0;
682        do {
683		j = i++;
684		do {
685			if (++j == k) {
686				if (k == bufsize)
687					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
688						FATAL("stream '%.30s...' too long", buf);
689				buf[k++] = (c = getc(f)) != EOF ? c : 0;
690			}
691			c = buf[j];
692			/* assert(c < NCHARS); */
693
694			if ((ns = pfa->gototab[s][c]) != 0)
695				s = ns;
696			else
697				s = cgoto(pfa, s, c);
698
699			if (pfa->out[s]) {	/* final state */
700				patlen = j - i + 1;
701				if (c == 0)	/* don't count $ */
702					patlen--;
703			}
704		} while (buf[j] && s != 1);
705		s = 2;
706	} while (buf[i] && !patlen);
707
708	/* adjbuf() may have relocated a resized buffer. Inform the world. */
709	*pbuf = buf;
710	*pbufsize = bufsize;
711
712	if (patlen) {
713		patbeg = (char *) buf + i;
714		/*
715		 * Under no circumstances is the last character fed to
716		 * the automaton part of the match. It is EOF's nullbyte,
717		 * or it sent the automaton into a state with no further
718		 * transitions available (s==1), or both. Room for a
719		 * terminating nullbyte is guaranteed.
720		 *
721		 * ungetc any chars after the end of matching text
722		 * (except for EOF's nullbyte, if present) and null
723		 * terminate the buffer.
724		 */
725		do
726			if (buf[--k] && ungetc(buf[k], f) == EOF)
727				FATAL("unable to ungetc '%c'", buf[k]);
728		while (k > i + patlen);
729		buf[k] = '\0';
730		return true;
731	}
732	else
733		return false;
734}
735
736Node *reparse(const char *p)	/* parses regular expression pointed to by p */
737{			/* uses relex() to scan regular expression */
738	Node *np;
739
740	dprintf( ("reparse <%s>\n", p) );
741	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
742	rtok = relex();
743	/* GNU compatibility: an empty regexp matches anything */
744	if (rtok == '\0') {
745		/* FATAL("empty regular expression"); previous */
746		return(op2(EMPTYRE, NIL, NIL));
747	}
748	np = regexp();
749	if (rtok != '\0')
750		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
751	return(np);
752}
753
754Node *regexp(void)	/* top-level parse of reg expr */
755{
756	return (alt(concat(primary())));
757}
758
759Node *primary(void)
760{
761	Node *np;
762	int savelastatom;
763
764	switch (rtok) {
765	case CHAR:
766		lastatom = starttok;
767		np = op2(CHAR, NIL, itonp(rlxval));
768		rtok = relex();
769		return (unary(np));
770	case ALL:
771		rtok = relex();
772		return (unary(op2(ALL, NIL, NIL)));
773	case EMPTYRE:
774		rtok = relex();
775		return (unary(op2(EMPTYRE, NIL, NIL)));
776	case DOT:
777		lastatom = starttok;
778		rtok = relex();
779		return (unary(op2(DOT, NIL, NIL)));
780	case CCL:
781		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
782		lastatom = starttok;
783		rtok = relex();
784		return (unary(np));
785	case NCCL:
786		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
787		lastatom = starttok;
788		rtok = relex();
789		return (unary(np));
790	case '^':
791		rtok = relex();
792		return (unary(op2(CHAR, NIL, itonp(HAT))));
793	case '$':
794		rtok = relex();
795		return (unary(op2(CHAR, NIL, NIL)));
796	case '(':
797		lastatom = starttok;
798		savelastatom = starttok - basestr; /* Retain over recursion */
799		rtok = relex();
800		if (rtok == ')') {	/* special pleading for () */
801			rtok = relex();
802			return unary(op2(CCL, NIL, (Node *) tostring("")));
803		}
804		np = regexp();
805		if (rtok == ')') {
806			lastatom = basestr + savelastatom; /* Restore */
807			rtok = relex();
808			return (unary(np));
809		}
810		else
811			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
812	default:
813		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
814	}
815	return 0;	/*NOTREACHED*/
816}
817
818Node *concat(Node *np)
819{
820	switch (rtok) {
821	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
822		return (concat(op2(CAT, np, primary())));
823	case EMPTYRE:
824		rtok = relex();
825		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
826				primary())));
827	}
828	return (np);
829}
830
831Node *alt(Node *np)
832{
833	if (rtok == OR) {
834		rtok = relex();
835		return (alt(op2(OR, np, concat(primary()))));
836	}
837	return (np);
838}
839
840Node *unary(Node *np)
841{
842	switch (rtok) {
843	case STAR:
844		rtok = relex();
845		return (unary(op2(STAR, np, NIL)));
846	case PLUS:
847		rtok = relex();
848		return (unary(op2(PLUS, np, NIL)));
849	case QUEST:
850		rtok = relex();
851		return (unary(op2(QUEST, np, NIL)));
852	case ZERO:
853		rtok = relex();
854		return (unary(op2(ZERO, np, NIL)));
855	default:
856		return (np);
857	}
858}
859
860/*
861 * Character class definitions conformant to the POSIX locale as
862 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
863 * and operating character sets are both ASCII (ISO646) or supersets
864 * thereof.
865 *
866 * Note that to avoid overflowing the temporary buffer used in
867 * relex(), the expanded character class (prior to range expansion)
868 * must be less than twice the size of their full name.
869 */
870
871/* Because isblank doesn't show up in any of the header files on any
872 * system i use, it's defined here.  if some other locale has a richer
873 * definition of "blank", define HAS_ISBLANK and provide your own
874 * version.
875 * the parentheses here are an attempt to find a path through the maze
876 * of macro definition and/or function and/or version provided.  thanks
877 * to nelson beebe for the suggestion; let's see if it works everywhere.
878 */
879
880/* #define HAS_ISBLANK */
881#ifndef HAS_ISBLANK
882
883int (xisblank)(int c)
884{
885	return c==' ' || c=='\t';
886}
887
888#endif
889
890static const struct charclass {
891	const char *cc_name;
892	int cc_namelen;
893	int (*cc_func)(int);
894} charclasses[] = {
895	{ "alnum",	5,	isalnum },
896	{ "alpha",	5,	isalpha },
897#ifndef HAS_ISBLANK
898	{ "blank",	5,	xisblank },
899#else
900	{ "blank",	5,	isblank },
901#endif
902	{ "cntrl",	5,	iscntrl },
903	{ "digit",	5,	isdigit },
904	{ "graph",	5,	isgraph },
905	{ "lower",	5,	islower },
906	{ "print",	5,	isprint },
907	{ "punct",	5,	ispunct },
908	{ "space",	5,	isspace },
909	{ "upper",	5,	isupper },
910	{ "xdigit",	6,	isxdigit },
911	{ NULL,		0,	NULL },
912};
913
914#define REPEAT_SIMPLE		0
915#define REPEAT_PLUS_APPENDED	1
916#define REPEAT_WITH_Q		2
917#define REPEAT_ZERO		3
918
919static int
920replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
921	       int atomlen, int firstnum, int secondnum, int special_case)
922{
923	int i, j;
924	uschar *buf = 0;
925	int ret = 1;
926	int init_q = (firstnum == 0);		/* first added char will be ? */
927	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
928	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
929	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
930	int size = prefix_length +  suffix_length;
931
932	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
933		size += atomlen*(firstnum-1);
934	}
935
936	/* Adjust size of buffer for special cases */
937	if (special_case == REPEAT_PLUS_APPENDED) {
938		size++;		/* for the final + */
939	} else if (special_case == REPEAT_WITH_Q) {
940		size += init_q + (atomlen+1)* n_q_reps;
941	} else if (special_case == REPEAT_ZERO) {
942		size += 2;	/* just a null ERE: () */
943	}
944	if ((buf = malloc(size + 1)) == NULL)
945		FATAL("out of space in reg expr %.10s..", lastre);
946	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
947	j = prefix_length;
948	if (special_case == REPEAT_ZERO) {
949		j -= atomlen;
950		buf[j++] = '(';
951		buf[j++] = ')';
952	}
953	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
954		memcpy(&buf[j], atom, atomlen);
955		j += atomlen;
956	}
957	if (special_case == REPEAT_PLUS_APPENDED) {
958		buf[j++] = '+';
959	} else if (special_case == REPEAT_WITH_Q) {
960		if (init_q)
961			buf[j++] = '?';
962		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
963			memcpy(&buf[j], atom, atomlen);
964			j += atomlen;
965			buf[j++] = '?';
966		}
967	}
968	memcpy(&buf[j], reptok+reptoklen, suffix_length);
969	if (special_case == REPEAT_ZERO) {
970		buf[j+suffix_length] = '\0';
971	} else {
972		buf[size] = '\0';
973	}
974	/* free old basestr */
975	if (firstbasestr != basestr) {
976		if (basestr)
977			xfree(basestr);
978	}
979	basestr = buf;
980	prestr  = buf + prefix_length;
981	if (special_case == REPEAT_ZERO) {
982		prestr  -= atomlen;
983		ret++;
984	}
985	return ret;
986}
987
988static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
989		  int atomlen, int firstnum, int secondnum)
990{
991	/*
992	   In general, the repetition specifier or "bound" is replaced here
993	   by an equivalent ERE string, repeating the immediately previous atom
994	   and appending ? and + as needed. Note that the first copy of the
995	   atom is left in place, except in the special_case of a zero-repeat
996	   (i.e., {0}).
997	 */
998	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
999		if (firstnum < 2) {
1000			/* 0 or 1: should be handled before you get here */
1001			FATAL("internal error");
1002		} else {
1003			return replace_repeat(reptok, reptoklen, atom, atomlen,
1004				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1005		}
1006	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1007		if (firstnum == 0) {	/* {0} or {0,0} */
1008			/* This case is unusual because the resulting
1009			   replacement string might actually be SMALLER than
1010			   the original ERE */
1011			return replace_repeat(reptok, reptoklen, atom, atomlen,
1012					firstnum, secondnum, REPEAT_ZERO);
1013		} else {		/* (firstnum >= 1) */
1014			return replace_repeat(reptok, reptoklen, atom, atomlen,
1015					firstnum, secondnum, REPEAT_SIMPLE);
1016		}
1017	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1018		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1019		return replace_repeat(reptok, reptoklen, atom, atomlen,
1020					firstnum, secondnum, REPEAT_WITH_Q);
1021	} else {	/* Error - shouldn't be here (n>m) */
1022		FATAL("internal error");
1023	}
1024	return 0;
1025}
1026
1027int relex(void)		/* lexical analyzer for reparse */
1028{
1029	int c, n;
1030	int cflag;
1031	static uschar *buf = NULL;
1032	static int bufsz = 100;
1033	uschar *bp;
1034	const struct charclass *cc;
1035	int i;
1036	int num, m;
1037	bool commafound, digitfound;
1038	const uschar *startreptok;
1039	static int parens = 0;
1040
1041rescan:
1042	starttok = prestr;
1043
1044	switch (c = *prestr++) {
1045	case '|': return OR;
1046	case '*': return STAR;
1047	case '+': return PLUS;
1048	case '?': return QUEST;
1049	case '.': return DOT;
1050	case '\0': prestr--; return '\0';
1051	case '^':
1052	case '$':
1053		return c;
1054	case '(':
1055		parens++;
1056		return c;
1057	case ')':
1058		if (parens) {
1059			parens--;
1060			return c;
1061		}
1062		/* unmatched close parenthesis; per POSIX, treat as literal */
1063		rlxval = c;
1064		return CHAR;
1065	case '\\':
1066		rlxval = quoted(&prestr);
1067		return CHAR;
1068	default:
1069		rlxval = c;
1070		return CHAR;
1071	case '[':
1072		if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1073			FATAL("out of space in reg expr %.10s..", lastre);
1074		bp = buf;
1075		if (*prestr == '^') {
1076			cflag = 1;
1077			prestr++;
1078		}
1079		else
1080			cflag = 0;
1081		n = 2 * strlen((const char *) prestr)+1;
1082		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1083			FATAL("out of space for reg expr %.10s...", lastre);
1084		for (; ; ) {
1085			if ((c = *prestr++) == '\\') {
1086				*bp++ = '\\';
1087				if ((c = *prestr++) == '\0')
1088					FATAL("nonterminated character class %.20s...", lastre);
1089				*bp++ = c;
1090			/* } else if (c == '\n') { */
1091			/* 	FATAL("newline in character class %.20s...", lastre); */
1092			} else if (c == '[' && *prestr == ':') {
1093				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1094				for (cc = charclasses; cc->cc_name; cc++)
1095					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1096						break;
1097				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1098				    prestr[2 + cc->cc_namelen] == ']') {
1099					prestr += cc->cc_namelen + 3;
1100					/*
1101					 * BUG: We begin at 1, instead of 0, since we
1102					 * would otherwise prematurely terminate the
1103					 * string for classes like [[:cntrl:]]. This
1104					 * means that we can't match the NUL character,
1105					 * not without first adapting the entire
1106					 * program to track each string's length.
1107					 */
1108					for (i = 1; i <= UCHAR_MAX; i++) {
1109						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1110						    FATAL("out of space for reg expr %.10s...", lastre);
1111						if (cc->cc_func(i)) {
1112							*bp++ = i;
1113							n++;
1114						}
1115					}
1116				} else
1117					*bp++ = c;
1118			} else if (c == '[' && *prestr == '.') {
1119				char collate_char;
1120				prestr++;
1121				collate_char = *prestr++;
1122				if (*prestr == '.' && prestr[1] == ']') {
1123					prestr += 2;
1124					/* Found it: map via locale TBD: for
1125					   now, simply return this char.  This
1126					   is sufficient to pass conformance
1127					   test awk.ex 156
1128					 */
1129					if (*prestr == ']') {
1130						prestr++;
1131						rlxval = collate_char;
1132						return CHAR;
1133					}
1134				}
1135			} else if (c == '[' && *prestr == '=') {
1136				char equiv_char;
1137				prestr++;
1138				equiv_char = *prestr++;
1139				if (*prestr == '=' && prestr[1] == ']') {
1140					prestr += 2;
1141					/* Found it: map via locale TBD: for now
1142					   simply return this char. This is
1143					   sufficient to pass conformance test
1144					   awk.ex 156
1145					 */
1146					if (*prestr == ']') {
1147						prestr++;
1148						rlxval = equiv_char;
1149						return CHAR;
1150					}
1151				}
1152			} else if (c == '\0') {
1153				FATAL("nonterminated character class %.20s", lastre);
1154			} else if (bp == buf) {	/* 1st char is special */
1155				*bp++ = c;
1156			} else if (c == ']') {
1157				*bp++ = 0;
1158				rlxstr = (uschar *) tostring((char *) buf);
1159				if (cflag == 0)
1160					return CCL;
1161				else
1162					return NCCL;
1163			} else
1164				*bp++ = c;
1165		}
1166		break;
1167	case '{':
1168		if (isdigit(*(prestr))) {
1169			num = 0;	/* Process as a repetition */
1170			n = -1; m = -1;
1171			commafound = false;
1172			digitfound = false;
1173			startreptok = prestr-1;
1174			/* Remember start of previous atom here ? */
1175		} else {        	/* just a { char, not a repetition */
1176			rlxval = c;
1177			return CHAR;
1178                }
1179		for (; ; ) {
1180			if ((c = *prestr++) == '}') {
1181				if (commafound) {
1182					if (digitfound) { /* {n,m} */
1183						m = num;
1184						if (m < n)
1185							FATAL("illegal repetition expression: class %.20s",
1186								lastre);
1187						if (n == 0 && m == 1) {
1188							return QUEST;
1189						}
1190					} else {	/* {n,} */
1191						if (n == 0)
1192							return STAR;
1193						else if (n == 1)
1194							return PLUS;
1195					}
1196				} else {
1197					if (digitfound) { /* {n} same as {n,n} */
1198						n = num;
1199						m = num;
1200					} else {	/* {} */
1201						FATAL("illegal repetition expression: class %.20s",
1202							lastre);
1203					}
1204				}
1205				if (repeat(starttok, prestr-starttok, lastatom,
1206					   startreptok - lastatom, n, m) > 0) {
1207					if (n == 0 && m == 0) {
1208						return ZERO;
1209					}
1210					/* must rescan input for next token */
1211					goto rescan;
1212				}
1213				/* Failed to replace: eat up {...} characters
1214				   and treat like just PLUS */
1215				return PLUS;
1216			} else if (c == '\0') {
1217				FATAL("nonterminated character class %.20s",
1218					lastre);
1219			} else if (isdigit(c)) {
1220				num = 10 * num + c - '0';
1221				digitfound = true;
1222			} else if (c == ',') {
1223				if (commafound)
1224					FATAL("illegal repetition expression: class %.20s",
1225						lastre);
1226				/* looking for {n,} or {n,m} */
1227				commafound = true;
1228				n = num;
1229				digitfound = false; /* reset */
1230				num = 0;
1231			} else {
1232				FATAL("illegal repetition expression: class %.20s",
1233					lastre);
1234			}
1235		}
1236		break;
1237	}
1238}
1239
1240int cgoto(fa *f, int s, int c)
1241{
1242	int *p, *q;
1243	int i, j, k;
1244
1245	assert(c == HAT || c < NCHARS);
1246	while (f->accept >= maxsetvec) {	/* guessing here! */
1247		resizesetvec(__func__);
1248	}
1249	for (i = 0; i <= f->accept; i++)
1250		setvec[i] = 0;
1251	setcnt = 0;
1252	resize_state(f, s);
1253	/* compute positions of gototab[s,c] into setvec */
1254	p = f->posns[s];
1255	for (i = 1; i <= *p; i++) {
1256		if ((k = f->re[p[i]].ltype) != FINAL) {
1257			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1258			 || (k == DOT && c != 0 && c != HAT)
1259			 || (k == ALL && c != 0)
1260			 || (k == EMPTYRE && c != 0)
1261			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1262			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1263				q = f->re[p[i]].lfollow;
1264				for (j = 1; j <= *q; j++) {
1265					if (q[j] >= maxsetvec) {
1266						resizesetvec(__func__);
1267					}
1268					if (setvec[q[j]] == 0) {
1269						setcnt++;
1270						setvec[q[j]] = 1;
1271					}
1272				}
1273			}
1274		}
1275	}
1276	/* determine if setvec is a previous state */
1277	tmpset[0] = setcnt;
1278	j = 1;
1279	for (i = f->accept; i >= 0; i--)
1280		if (setvec[i]) {
1281			tmpset[j++] = i;
1282		}
1283	resize_state(f, f->curstat > s ? f->curstat : s);
1284	/* tmpset == previous state? */
1285	for (i = 1; i <= f->curstat; i++) {
1286		p = f->posns[i];
1287		if ((k = tmpset[0]) != p[0])
1288			goto different;
1289		for (j = 1; j <= k; j++)
1290			if (tmpset[j] != p[j])
1291				goto different;
1292		/* setvec is state i */
1293		if (c != HAT)
1294			f->gototab[s][c] = i;
1295		return i;
1296	  different:;
1297	}
1298
1299	/* add tmpset to current set of states */
1300	++(f->curstat);
1301	resize_state(f, f->curstat);
1302	for (i = 0; i < NCHARS; i++)
1303		f->gototab[f->curstat][i] = 0;
1304	xfree(f->posns[f->curstat]);
1305	p = intalloc(setcnt + 1, __func__);
1306
1307	f->posns[f->curstat] = p;
1308	if (c != HAT)
1309		f->gototab[s][c] = f->curstat;
1310	for (i = 0; i <= setcnt; i++)
1311		p[i] = tmpset[i];
1312	if (setvec[f->accept])
1313		f->out[f->curstat] = 1;
1314	else
1315		f->out[f->curstat] = 0;
1316	return f->curstat;
1317}
1318
1319
1320void freefa(fa *f)	/* free a finite automaton */
1321{
1322	int i;
1323
1324	if (f == NULL)
1325		return;
1326	for (i = 0; i < f->state_count; i++)
1327		xfree(f->gototab[i])
1328	for (i = 0; i <= f->curstat; i++)
1329		xfree(f->posns[i]);
1330	for (i = 0; i <= f->accept; i++) {
1331		xfree(f->re[i].lfollow);
1332		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1333			xfree(f->re[i].lval.np);
1334	}
1335	xfree(f->restr);
1336	xfree(f->out);
1337	xfree(f->posns);
1338	xfree(f->gototab);
1339	xfree(f);
1340}
1341