1/*	$OpenBSD: b.c,v 1.53 2024/06/03 00:55:05 millert Exp $	*/
2/****************************************************************
3Copyright (C) Lucent Technologies 1997
4All Rights Reserved
5
6Permission to use, copy, modify, and distribute this software and
7its documentation for any purpose and without fee is hereby
8granted, provided that the above copyright notice appear in all
9copies and that both that the copyright notice and this
10permission notice and warranty disclaimer appear in supporting
11documentation, and that the name Lucent Technologies or any of
12its entities not be used in advertising or publicity pertaining
13to distribution of the software without specific, written prior
14permission.
15
16LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23THIS SOFTWARE.
24****************************************************************/
25
26/* lasciate ogne speranza, voi ch'intrate. */
27
28#define	DEBUG
29
30#include <ctype.h>
31#include <limits.h>
32#include <stdio.h>
33#include <string.h>
34#include <stdlib.h>
35#include "awk.h"
36#include "awkgram.tab.h"
37
38#define MAXLIN 22
39
40#define type(v)		(v)->nobj	/* badly overloaded here */
41#define info(v)		(v)->ntype	/* badly overloaded here */
42#define left(v)		(v)->narg[0]
43#define right(v)	(v)->narg[1]
44#define parent(v)	(v)->nnext
45
46#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47#define ELEAF	case EMPTYRE:		/* empty string in regexp */
48#define UNARY	case STAR: case PLUS: case QUEST:
49
50/* encoding in tree Nodes:
51	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52		left is index, right contains value or pointer to value
53	unary (STAR, PLUS, QUEST): left is child, right is null
54	binary (CAT, OR): left and right are children
55	parent contains pointer to parent
56*/
57
58
59int	*setvec;
60int	*tmpset;
61int	maxsetvec = 0;
62
63int	rtok;		/* next token in current re */
64int	rlxval;
65static const uschar	*rlxstr;
66static const uschar	*prestr;	/* current position in current re */
67static const uschar	*lastre;	/* origin of last re */
68static const uschar	*lastatom;	/* origin of last Atom */
69static const uschar	*starttok;
70static const uschar 	*basestr;	/* starts with original, replaced during
71				   repetition processing */
72static const uschar 	*firstbasestr;
73
74static	int setcnt;
75static	int poscnt;
76
77const char	*patbeg;
78int	patlen;
79
80#define	NFA	128	/* cache this many dynamic fa's */
81fa	*fatab[NFA];
82int	nfatab	= 0;	/* entries in fatab */
83
84/* utf-8 mechanism:
85
86   For most of Awk, utf-8 strings just "work", since they look like
87   null-terminated sequences of 8-bit bytes.
88
89   Functions like length(), index(), and substr() have to operate
90   in units of utf-8 characters.  The u8_* functions in run.c
91   handle this.
92
93   Regular expressions are more complicated, since the basic
94   mechanism of the goto table used 8-bit byte indices into the
95   gototab entries to compute the next state.  Unicode is a lot
96   bigger, so the gototab entries are now structs with a character
97   and a next state. These are sorted by code point and binary
98   searched.
99
100   Throughout the RE mechanism in b.c, utf-8 characters are
101   converted to their utf-32 value.  This mostly shows up in
102   cclenter, which expands character class ranges like a-z and now
103   alpha-omega.  The size of a gototab array is still about 256.
104   This should be dynamic, but for now things work ok for a single
105   code page of Unicode, which is the most likely case.
106
107   The code changes are localized in run.c and b.c.  I have added a
108   handful of functions to somewhat better hide the implementation,
109   but a lot more could be done.
110
111 */
112
113static int entry_cmp(const void *l, const void *r);
114static int get_gototab(fa*, int, int);
115static int set_gototab(fa*, int, int, int);
116static void clear_gototab(fa*, int);
117
118static int *
119intalloc(size_t n, const char *f)
120{
121	int *p = (int *) calloc(n, sizeof(int));
122	if (p == NULL)
123		overflo(f);
124	return p;
125}
126
127static void
128allocsetvec(const char *f)
129{
130	maxsetvec = MAXLIN;
131	setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
132	tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
133	if (setvec == NULL || tmpset == NULL)
134		overflo(f);
135}
136
137static void
138resizesetvec(const char *f)
139{
140	setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
141	tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
142	if (setvec == NULL || tmpset == NULL)
143		overflo(f);
144	maxsetvec *= 4;
145}
146
147static void
148resize_state(fa *f, int state)
149{
150	gtt *p;
151	uschar *p2;
152	int **p3;
153	int i, new_count;
154
155	if (++state < f->state_count)
156		return;
157
158	new_count = state + 10; /* needs to be tuned */
159
160	p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
161	if (p == NULL)
162		goto out;
163	f->gototab = p;
164
165	p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
166	if (p2 == NULL)
167		goto out;
168	f->out = p2;
169
170	p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
171	if (p3 == NULL)
172		goto out;
173	f->posns = p3;
174
175	for (i = f->state_count; i < new_count; ++i) {
176		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
177		if (f->gototab[i].entries == NULL)
178			goto out;
179		f->gototab[i].allocated = NCHARS;
180		f->gototab[i].inuse = 0;
181		f->out[i] = 0;
182		f->posns[i] = NULL;
183	}
184	f->state_count = new_count;
185	return;
186out:
187	overflo(__func__);
188}
189
190fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
191{
192	int i, use, nuse;
193	fa *pfa;
194	static int now = 1;
195
196	if (setvec == NULL) {	/* first time through any RE */
197		allocsetvec(__func__);
198	}
199
200	if (compile_time != RUNNING)	/* a constant for sure */
201		return mkdfa(s, anchor);
202	for (i = 0; i < nfatab; i++)	/* is it there already? */
203		if (fatab[i]->anchor == anchor
204		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
205			fatab[i]->use = now++;
206			return fatab[i];
207		}
208	pfa = mkdfa(s, anchor);
209	if (nfatab < NFA) {	/* room for another */
210		fatab[nfatab] = pfa;
211		fatab[nfatab]->use = now++;
212		nfatab++;
213		return pfa;
214	}
215	use = fatab[0]->use;	/* replace least-recently used */
216	nuse = 0;
217	for (i = 1; i < nfatab; i++)
218		if (fatab[i]->use < use) {
219			use = fatab[i]->use;
220			nuse = i;
221		}
222	freefa(fatab[nuse]);
223	fatab[nuse] = pfa;
224	pfa->use = now++;
225	return pfa;
226}
227
228fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
229				/* anchor = true for anchored matches, else false */
230{
231	Node *p, *p1;
232	fa *f;
233
234	firstbasestr = (const uschar *) s;
235	basestr = firstbasestr;
236	p = reparse(s);
237	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
238		/* put ALL STAR in front of reg.  exp. */
239	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
240		/* put FINAL after reg.  exp. */
241
242	poscnt = 0;
243	penter(p1);	/* enter parent pointers and leaf indices */
244	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
245		overflo(__func__);
246	f->accept = poscnt-1;	/* penter has computed number of positions in re */
247	cfoll(f, p1);	/* set up follow sets */
248	freetr(p1);
249	resize_state(f, 1);
250	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
251	f->posns[1] = intalloc(1, __func__);
252	*f->posns[1] = 0;
253	f->initstat = makeinit(f, anchor);
254	f->anchor = anchor;
255	f->restr = (uschar *) tostring(s);
256	if (firstbasestr != basestr) {
257		if (basestr)
258			xfree(basestr);
259	}
260	return f;
261}
262
263int makeinit(fa *f, bool anchor)
264{
265	int i, k;
266
267	f->curstat = 2;
268	f->out[2] = 0;
269	k = *(f->re[0].lfollow);
270	xfree(f->posns[2]);
271	f->posns[2] = intalloc(k + 1,  __func__);
272	for (i = 0; i <= k; i++) {
273		(f->posns[2])[i] = (f->re[0].lfollow)[i];
274	}
275	if ((f->posns[2])[1] == f->accept)
276		f->out[2] = 1;
277	clear_gototab(f, 2);
278	f->curstat = cgoto(f, 2, HAT);
279	if (anchor) {
280		*f->posns[2] = k-1;	/* leave out position 0 */
281		for (i = 0; i < k; i++) {
282			(f->posns[0])[i] = (f->posns[2])[i];
283		}
284
285		f->out[0] = f->out[2];
286		if (f->curstat != 2)
287			--(*f->posns[f->curstat]);
288	}
289	return f->curstat;
290}
291
292void penter(Node *p)	/* set up parent pointers and leaf indices */
293{
294	switch (type(p)) {
295	ELEAF
296	LEAF
297		info(p) = poscnt;
298		poscnt++;
299		break;
300	UNARY
301		penter(left(p));
302		parent(left(p)) = p;
303		break;
304	case CAT:
305	case OR:
306		penter(left(p));
307		penter(right(p));
308		parent(left(p)) = p;
309		parent(right(p)) = p;
310		break;
311	case ZERO:
312		break;
313	default:	/* can't happen */
314		FATAL("can't happen: unknown type %d in penter", type(p));
315		break;
316	}
317}
318
319void freetr(Node *p)	/* free parse tree */
320{
321	switch (type(p)) {
322	ELEAF
323	LEAF
324		xfree(p);
325		break;
326	UNARY
327	case ZERO:
328		freetr(left(p));
329		xfree(p);
330		break;
331	case CAT:
332	case OR:
333		freetr(left(p));
334		freetr(right(p));
335		xfree(p);
336		break;
337	default:	/* can't happen */
338		FATAL("can't happen: unknown type %d in freetr", type(p));
339		break;
340	}
341}
342
343/* in the parsing of regular expressions, metacharacters like . have */
344/* to be seen literally;  \056 is not a metacharacter. */
345
346static int
347hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
348{			/* only pick up one 8-bit byte (2 chars) */
349	const uschar *p;
350	int n = 0;
351	int i;
352
353	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
354		if (isdigit(*p))
355			n = 16 * n + *p - '0';
356		else if (*p >= 'a' && *p <= 'f')
357			n = 16 * n + *p - 'a' + 10;
358		else if (*p >= 'A' && *p <= 'F')
359			n = 16 * n + *p - 'A' + 10;
360	}
361	*pp = p;
362	return n;
363}
364
365#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
366
367int quoted(const uschar **pp)	/* pick up next thing after a \\ */
368			/* and increment *pp */
369{
370	const uschar *p = *pp;
371	int c;
372
373/* BUG: should advance by utf-8 char even if makes no sense */
374
375	if ((c = *p++) == 't') {
376		c = '\t';
377	} else if (c == 'n') {
378		c = '\n';
379	} else if (c == 'f') {
380		c = '\f';
381	} else if (c == 'r') {
382		c = '\r';
383	} else if (c == 'b') {
384		c = '\b';
385	} else if (c == 'v') {
386		c = '\v';
387	} else if (c == 'a') {
388		c = '\a';
389	} else if (c == '\\') {
390		c = '\\';
391	} else if (c == 'x') {	/* 2 hex digits follow */
392		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
393	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
394		c = hexstr(&p, 8);
395	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
396		int n = c - '0';
397		if (isoctdigit(*p)) {
398			n = 8 * n + *p++ - '0';
399			if (isoctdigit(*p))
400				n = 8 * n + *p++ - '0';
401		}
402		c = n;
403	} /* else */
404		/* c = c; */
405	*pp = p;
406	return c;
407}
408
409int *cclenter(const char *argp)	/* add a character class */
410{
411	int i, c, c2;
412	int n;
413	const uschar *p = (const uschar *) argp;
414	int *bp, *retp;
415	static int *buf = NULL;
416	static int bufsz = 100;
417
418	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
419		FATAL("out of space for character class [%.10s...] 1", p);
420	bp = buf;
421	for (i = 0; *p != 0; ) {
422		n = u8_rune(&c, (const char *) p);
423		p += n;
424		if (c == '\\') {
425			c = quoted(&p);
426		} else if (c == '-' && i > 0 && bp[-1] != 0) {
427			if (*p != 0) {
428				c = bp[-1];
429				/* c2 = *p++; */
430				n = u8_rune(&c2, (const char *) p);
431				p += n;
432				if (c2 == '\\')
433					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
434				if (c > c2) {	/* empty; ignore */
435					bp--;
436					i--;
437					continue;
438				}
439				while (c < c2) {
440					if (i >= bufsz) {
441						buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
442						if (buf == NULL)
443							FATAL("out of space for character class [%.10s...] 2", p);
444						bufsz *= 2;
445						bp = buf + i;
446					}
447					*bp++ = ++c;
448					i++;
449				}
450				continue;
451			}
452		}
453		if (i >= bufsz) {
454			buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
455			if (buf == NULL)
456				FATAL("out of space for character class [%.10s...] 2", p);
457			bufsz *= 2;
458			bp = buf + i;
459		}
460		*bp++ = c;
461		i++;
462	}
463	*bp = 0;
464	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
465	/* xfree(op);  BUG: what are we freeing here? */
466	retp = (int *) calloc(bp-buf+1, sizeof(int));
467	for (i = 0; i < bp-buf+1; i++)
468		retp[i] = buf[i];
469	return retp;
470}
471
472void overflo(const char *s)
473{
474	FATAL("regular expression too big: out of space in %.30s...", s);
475}
476
477void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
478{
479	int i;
480	int *p;
481
482	switch (type(v)) {
483	ELEAF
484	LEAF
485		f->re[info(v)].ltype = type(v);
486		f->re[info(v)].lval.np = right(v);
487		while (f->accept >= maxsetvec) {	/* guessing here! */
488			resizesetvec(__func__);
489		}
490		for (i = 0; i <= f->accept; i++)
491			setvec[i] = 0;
492		setcnt = 0;
493		follow(v);	/* computes setvec and setcnt */
494		p = intalloc(setcnt + 1, __func__);
495		f->re[info(v)].lfollow = p;
496		*p = setcnt;
497		for (i = f->accept; i >= 0; i--)
498			if (setvec[i] == 1)
499				*++p = i;
500		break;
501	UNARY
502		cfoll(f,left(v));
503		break;
504	case CAT:
505	case OR:
506		cfoll(f,left(v));
507		cfoll(f,right(v));
508		break;
509	case ZERO:
510		break;
511	default:	/* can't happen */
512		FATAL("can't happen: unknown type %d in cfoll", type(v));
513	}
514}
515
516int first(Node *p)	/* collects initially active leaves of p into setvec */
517			/* returns 0 if p matches empty string */
518{
519	int b, lp;
520
521	switch (type(p)) {
522	ELEAF
523	LEAF
524		lp = info(p);	/* look for high-water mark of subscripts */
525		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
526			resizesetvec(__func__);
527		}
528		if (type(p) == EMPTYRE) {
529			setvec[lp] = 0;
530			return(0);
531		}
532		if (setvec[lp] != 1) {
533			setvec[lp] = 1;
534			setcnt++;
535		}
536		if (type(p) == CCL && (*(int *) right(p)) == 0)
537			return(0);		/* empty CCL */
538		return(1);
539	case PLUS:
540		if (first(left(p)) == 0)
541			return(0);
542		return(1);
543	case STAR:
544	case QUEST:
545		first(left(p));
546		return(0);
547	case CAT:
548		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
549		return(1);
550	case OR:
551		b = first(right(p));
552		if (first(left(p)) == 0 || b == 0) return(0);
553		return(1);
554	case ZERO:
555		return 0;
556	}
557	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
558	return(-1);
559}
560
561void follow(Node *v)	/* collects leaves that can follow v into setvec */
562{
563	Node *p;
564
565	if (type(v) == FINAL)
566		return;
567	p = parent(v);
568	switch (type(p)) {
569	case STAR:
570	case PLUS:
571		first(v);
572		follow(p);
573		return;
574
575	case OR:
576	case QUEST:
577		follow(p);
578		return;
579
580	case CAT:
581		if (v == left(p)) {	/* v is left child of p */
582			if (first(right(p)) == 0) {
583				follow(p);
584				return;
585			}
586		} else		/* v is right child */
587			follow(p);
588		return;
589	}
590}
591
592int member(int c, int *sarg)	/* is c in s? */
593{
594	int *s = (int *) sarg;
595
596	while (*s)
597		if (c == *s++)
598			return(1);
599	return(0);
600}
601
602static void resize_gototab(fa *f, int state)
603{
604	size_t new_size = f->gototab[state].allocated * 2;
605	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
606	if (p == NULL)
607		overflo(__func__);
608
609	// need to initialized the new memory to zero
610	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
611	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
612
613	f->gototab[state].allocated = new_size;			// update gototab info
614	f->gototab[state].entries = p;
615}
616
617static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
618{
619	gtte key;
620	gtte *item;
621
622	key.ch = ch;
623	key.state = 0;	/* irrelevant */
624	item = (gtte *) bsearch(& key, f->gototab[state].entries,
625			f->gototab[state].inuse, sizeof(gtte),
626			entry_cmp);
627
628	if (item == NULL)
629		return 0;
630	else
631		return item->state;
632}
633
634static int entry_cmp(const void *l, const void *r)
635{
636	const gtte *left, *right;
637
638	left = (const gtte *) l;
639	right = (const gtte *) r;
640
641	return left->ch - right->ch;
642}
643
644static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
645{
646	if (f->gototab[state].inuse == 0) {
647		f->gototab[state].entries[0].ch = ch;
648		f->gototab[state].entries[0].state = val;
649		f->gototab[state].inuse++;
650		return val;
651	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
652		// not seen yet, insert and return
653		gtt *tab = & f->gototab[state];
654		if (tab->inuse + 1 >= tab->allocated)
655			resize_gototab(f, state);
656
657		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
658		f->gototab[state].entries[f->gototab[state].inuse].state = val;
659		f->gototab[state].inuse++;
660		return val;
661	} else {
662		// maybe we have it, maybe we don't
663		gtte key;
664		gtte *item;
665
666		key.ch = ch;
667		key.state = 0;	/* irrelevant */
668		item = (gtte *) bsearch(& key, f->gototab[state].entries,
669				f->gototab[state].inuse, sizeof(gtte),
670				entry_cmp);
671
672		if (item != NULL) {
673			// we have it, update state and return
674			item->state = val;
675			return item->state;
676		}
677		// otherwise, fall through to insert and reallocate.
678	}
679
680	gtt *tab = & f->gototab[state];
681	if (tab->inuse + 1 >= tab->allocated)
682		resize_gototab(f, state);
683	f->gototab[state].entries[tab->inuse].ch = ch;
684	f->gototab[state].entries[tab->inuse].state = val;
685	++tab->inuse;
686
687	qsort(f->gototab[state].entries,
688		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
689
690	return val; /* not used anywhere at the moment */
691}
692
693static void clear_gototab(fa *f, int state)
694{
695	memset(f->gototab[state].entries, 0,
696		f->gototab[state].allocated * sizeof(gtte));
697	f->gototab[state].inuse = 0;
698}
699
700int match(fa *f, const char *p0)	/* shortest match ? */
701{
702	int s, ns;
703	int n;
704	int rune;
705	const uschar *p = (const uschar *) p0;
706
707	/* return pmatch(f, p0); does it matter whether longest or shortest? */
708
709	s = f->initstat;
710	assert (s < f->state_count);
711
712	if (f->out[s])
713		return(1);
714	do {
715		/* assert(*p < NCHARS); */
716		n = u8_rune(&rune, (const char *) p);
717		if ((ns = get_gototab(f, s, rune)) != 0)
718			s = ns;
719		else
720			s = cgoto(f, s, rune);
721		if (f->out[s])
722			return(1);
723		if (*p == 0)
724			break;
725		p += n;
726	} while (1);  /* was *p++ != 0 */
727	return(0);
728}
729
730int pmatch(fa *f, const char *p0)	/* longest match, for sub */
731{
732	int s, ns;
733	int n;
734	int rune;
735	const uschar *p = (const uschar *) p0;
736	const uschar *q;
737
738	s = f->initstat;
739	assert(s < f->state_count);
740
741	patbeg = (const char *)p;
742	patlen = -1;
743	do {
744		q = p;
745		do {
746			if (f->out[s])		/* final state */
747				patlen = q-p;
748			/* assert(*q < NCHARS); */
749			n = u8_rune(&rune, (const char *) q);
750			if ((ns = get_gototab(f, s, rune)) != 0)
751				s = ns;
752			else
753				s = cgoto(f, s, rune);
754
755			assert(s < f->state_count);
756
757			if (s == 1) {	/* no transition */
758				if (patlen >= 0) {
759					patbeg = (const char *) p;
760					return(1);
761				}
762				else
763					goto nextin;	/* no match */
764			}
765			if (*q == 0)
766				break;
767			q += n;
768		} while (1);
769		q++;  /* was *q++ */
770		if (f->out[s])
771			patlen = q-p-1;	/* don't count $ */
772		if (patlen >= 0) {
773			patbeg = (const char *) p;
774			return(1);
775		}
776	nextin:
777		s = 2;
778		if (*p == 0)
779			break;
780		n = u8_rune(&rune, (const char *) p);
781		p += n;
782	} while (1); /* was *p++ */
783	return (0);
784}
785
786int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
787{
788	int s, ns;
789	int n;
790	int rune;
791	const uschar *p = (const uschar *) p0;
792	const uschar *q;
793
794	s = f->initstat;
795	assert(s < f->state_count);
796
797	patbeg = (const char *)p;
798	patlen = -1;
799	while (*p) {
800		q = p;
801		do {
802			if (f->out[s])		/* final state */
803				patlen = q-p;
804			/* assert(*q < NCHARS); */
805			n = u8_rune(&rune, (const char *) q);
806			if ((ns = get_gototab(f, s, rune)) != 0)
807				s = ns;
808			else
809				s = cgoto(f, s, rune);
810			if (s == 1) {	/* no transition */
811				if (patlen > 0) {
812					patbeg = (const char *) p;
813					return(1);
814				} else
815					goto nnextin;	/* no nonempty match */
816			}
817			if (*q == 0)
818				break;
819			q += n;
820		} while (1);
821		q++;
822		if (f->out[s])
823			patlen = q-p-1;	/* don't count $ */
824		if (patlen > 0 ) {
825			patbeg = (const char *) p;
826			return(1);
827		}
828	nnextin:
829		s = 2;
830		p++;
831	}
832	return (0);
833}
834
835
836/*
837 * NAME
838 *     fnematch
839 *
840 * DESCRIPTION
841 *     A stream-fed version of nematch which transfers characters to a
842 *     null-terminated buffer. All characters up to and including the last
843 *     character of the matching text or EOF are placed in the buffer. If
844 *     a match is found, patbeg and patlen are set appropriately.
845 *
846 * RETURN VALUES
847 *     false    No match found.
848 *     true     Match found.
849 */
850
851bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
852{
853	char *i, *j, *k, *buf = *pbuf;
854	int bufsize = *pbufsize;
855	int c, n, ns, s;
856
857	s = pfa->initstat;
858	patlen = 0;
859
860	/*
861	 * buf <= i <= j <= k <= buf+bufsize
862	 *
863	 * i: origin of active substring
864	 * j: current character
865	 * k: destination of the next getc
866	 */
867
868	i = j = k = buf;
869
870	do {
871		/*
872		 * Call u8_rune with at least awk_mb_cur_max ahead in
873		 * the buffer until EOF interferes.
874		 */
875		if (k - j < (int)awk_mb_cur_max) {
876			if (k + awk_mb_cur_max > buf + bufsize) {
877				char *obuf = buf;
878				adjbuf(&buf, &bufsize,
879				    bufsize + awk_mb_cur_max,
880				    quantum, 0, "fnematch");
881
882				/* buf resized, maybe moved. update pointers */
883				*pbufsize = bufsize;
884				if (obuf != buf) {
885					i = buf + (i - obuf);
886					j = buf + (j - obuf);
887					k = buf + (k - obuf);
888					*pbuf = buf;
889					if (patlen)
890						patbeg = buf + (patbeg - obuf);
891				}
892			}
893			for (n = awk_mb_cur_max ; n > 0; n--) {
894				*k++ = (c = getc(f)) != EOF ? c : 0;
895				if (c == EOF) {
896					if (ferror(f))
897						FATAL("fnematch: getc error");
898					break;
899				}
900			}
901		}
902
903		j += u8_rune(&c, j);
904
905		if ((ns = get_gototab(pfa, s, c)) != 0)
906			s = ns;
907		else
908			s = cgoto(pfa, s, c);
909
910		if (pfa->out[s]) {	/* final state */
911			patbeg = i;
912			patlen = j - i;
913			if (c == 0)	/* don't count $ */
914				patlen--;
915		}
916
917		if (c && s != 1)
918			continue;  /* origin i still viable, next j */
919		if (patlen)
920			break;     /* best match found */
921
922		/* no match at origin i, next i and start over */
923		i += u8_rune(&c, i);
924		if (c == 0)
925			break;    /* no match */
926		j = i;
927		s = 2;
928	} while (1);
929
930	if (patlen) {
931		/*
932		 * Under no circumstances is the last character fed to
933		 * the automaton part of the match. It is EOF's nullbyte,
934		 * or it sent the automaton into a state with no further
935		 * transitions available (s==1), or both. Room for a
936		 * terminating nullbyte is guaranteed.
937		 *
938		 * ungetc any chars after the end of matching text
939		 * (except for EOF's nullbyte, if present) and null
940		 * terminate the buffer.
941		 */
942		do
943			if (*--k && ungetc(*k, f) == EOF)
944				FATAL("unable to ungetc '%c'", *k);
945		while (k > patbeg + patlen);
946		*k = '\0';
947		return true;
948	}
949	else
950		return false;
951}
952
953Node *reparse(const char *p)	/* parses regular expression pointed to by p */
954{			/* uses relex() to scan regular expression */
955	Node *np;
956
957	DPRINTF("reparse <%s>\n", p);
958	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
959	rtok = relex();
960	/* GNU compatibility: an empty regexp matches anything */
961	if (rtok == '\0') {
962		/* FATAL("empty regular expression"); previous */
963		return(op2(EMPTYRE, NIL, NIL));
964	}
965	np = regexp();
966	if (rtok != '\0')
967		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
968	return(np);
969}
970
971Node *regexp(void)	/* top-level parse of reg expr */
972{
973	return (alt(concat(primary())));
974}
975
976Node *primary(void)
977{
978	Node *np;
979	int savelastatom;
980
981	switch (rtok) {
982	case CHAR:
983		lastatom = starttok;
984		np = op2(CHAR, NIL, itonp(rlxval));
985		rtok = relex();
986		return (unary(np));
987	case ALL:
988		rtok = relex();
989		return (unary(op2(ALL, NIL, NIL)));
990	case EMPTYRE:
991		rtok = relex();
992		return (unary(op2(EMPTYRE, NIL, NIL)));
993	case DOT:
994		lastatom = starttok;
995		rtok = relex();
996		return (unary(op2(DOT, NIL, NIL)));
997	case CCL:
998		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
999		lastatom = starttok;
1000		rtok = relex();
1001		return (unary(np));
1002	case NCCL:
1003		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1004		lastatom = starttok;
1005		rtok = relex();
1006		return (unary(np));
1007	case '^':
1008		rtok = relex();
1009		return (unary(op2(CHAR, NIL, itonp(HAT))));
1010	case '$':
1011		rtok = relex();
1012		return (unary(op2(CHAR, NIL, NIL)));
1013	case '(':
1014		lastatom = starttok;
1015		savelastatom = starttok - basestr; /* Retain over recursion */
1016		rtok = relex();
1017		if (rtok == ')') {	/* special pleading for () */
1018			rtok = relex();
1019			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1020		}
1021		np = regexp();
1022		if (rtok == ')') {
1023			lastatom = basestr + savelastatom; /* Restore */
1024			rtok = relex();
1025			return (unary(np));
1026		}
1027		else
1028			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1029	default:
1030		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1031	}
1032	return 0;	/*NOTREACHED*/
1033}
1034
1035Node *concat(Node *np)
1036{
1037	switch (rtok) {
1038	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1039		return (concat(op2(CAT, np, primary())));
1040	case EMPTYRE:
1041		rtok = relex();
1042		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1043				primary())));
1044	}
1045	return (np);
1046}
1047
1048Node *alt(Node *np)
1049{
1050	if (rtok == OR) {
1051		rtok = relex();
1052		return (alt(op2(OR, np, concat(primary()))));
1053	}
1054	return (np);
1055}
1056
1057Node *unary(Node *np)
1058{
1059	switch (rtok) {
1060	case STAR:
1061		rtok = relex();
1062		return (unary(op2(STAR, np, NIL)));
1063	case PLUS:
1064		rtok = relex();
1065		return (unary(op2(PLUS, np, NIL)));
1066	case QUEST:
1067		rtok = relex();
1068		return (unary(op2(QUEST, np, NIL)));
1069	case ZERO:
1070		rtok = relex();
1071		return (unary(op2(ZERO, np, NIL)));
1072	default:
1073		return (np);
1074	}
1075}
1076
1077/*
1078 * Character class definitions conformant to the POSIX locale as
1079 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1080 * and operating character sets are both ASCII (ISO646) or supersets
1081 * thereof.
1082 *
1083 * Note that to avoid overflowing the temporary buffer used in
1084 * relex(), the expanded character class (prior to range expansion)
1085 * must be less than twice the size of their full name.
1086 */
1087
1088/* Because isblank doesn't show up in any of the header files on any
1089 * system i use, it's defined here.  if some other locale has a richer
1090 * definition of "blank", define HAS_ISBLANK and provide your own
1091 * version.
1092 * the parentheses here are an attempt to find a path through the maze
1093 * of macro definition and/or function and/or version provided.  thanks
1094 * to nelson beebe for the suggestion; let's see if it works everywhere.
1095 */
1096
1097#ifndef HAS_ISBLANK
1098
1099int (xisblank)(int c)
1100{
1101	return c==' ' || c=='\t';
1102}
1103
1104#endif
1105
1106static const struct charclass {
1107	const char *cc_name;
1108	int cc_namelen;
1109	int (*cc_func)(int);
1110} charclasses[] = {
1111	{ "alnum",	5,	isalnum },
1112	{ "alpha",	5,	isalpha },
1113#ifndef HAS_ISBLANK
1114	{ "blank",	5,	xisblank },
1115#else
1116	{ "blank",	5,	isblank },
1117#endif
1118	{ "cntrl",	5,	iscntrl },
1119	{ "digit",	5,	isdigit },
1120	{ "graph",	5,	isgraph },
1121	{ "lower",	5,	islower },
1122	{ "print",	5,	isprint },
1123	{ "punct",	5,	ispunct },
1124	{ "space",	5,	isspace },
1125	{ "upper",	5,	isupper },
1126	{ "xdigit",	6,	isxdigit },
1127	{ NULL,		0,	NULL },
1128};
1129
1130#define REPEAT_SIMPLE		0
1131#define REPEAT_PLUS_APPENDED	1
1132#define REPEAT_WITH_Q		2
1133#define REPEAT_ZERO		3
1134
1135static int
1136replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1137	       int atomlen, int firstnum, int secondnum, int special_case)
1138{
1139	int i, j;
1140	uschar *buf = NULL;
1141	int ret = 1;
1142	int init_q = (firstnum == 0);		/* first added char will be ? */
1143	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1144	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1145	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1146	int size = prefix_length +  suffix_length;
1147
1148	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1149		size += atomlen*(firstnum-1);
1150	}
1151
1152	/* Adjust size of buffer for special cases */
1153	if (special_case == REPEAT_PLUS_APPENDED) {
1154		size++;		/* for the final + */
1155	} else if (special_case == REPEAT_WITH_Q) {
1156		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1157	} else if (special_case == REPEAT_ZERO) {
1158		size += 2;	/* just a null ERE: () */
1159	}
1160	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1161		FATAL("out of space in reg expr %.10s..", lastre);
1162	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1163	j = prefix_length;
1164	if (special_case == REPEAT_ZERO) {
1165		j -= atomlen;
1166		buf[j++] = '(';
1167		buf[j++] = ')';
1168	}
1169	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1170		memcpy(&buf[j], atom, atomlen);
1171		j += atomlen;
1172	}
1173	if (special_case == REPEAT_PLUS_APPENDED) {
1174		buf[j++] = '+';
1175	} else if (special_case == REPEAT_WITH_Q) {
1176		if (init_q)
1177			buf[j++] = '?';
1178		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1179			memcpy(&buf[j], atom, atomlen);
1180			j += atomlen;
1181			buf[j++] = '?';
1182		}
1183	}
1184	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1185	j += suffix_length;
1186	buf[j] = '\0';
1187	/* free old basestr */
1188	if (firstbasestr != basestr) {
1189		if (basestr)
1190			xfree(basestr);
1191	}
1192	basestr = buf;
1193	prestr  = buf + prefix_length;
1194	if (special_case == REPEAT_ZERO) {
1195		prestr  -= atomlen;
1196		ret++;
1197	}
1198	return ret;
1199}
1200
1201static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1202		  int atomlen, int firstnum, int secondnum)
1203{
1204	/*
1205	   In general, the repetition specifier or "bound" is replaced here
1206	   by an equivalent ERE string, repeating the immediately previous atom
1207	   and appending ? and + as needed. Note that the first copy of the
1208	   atom is left in place, except in the special_case of a zero-repeat
1209	   (i.e., {0}).
1210	 */
1211	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1212		if (firstnum < 2) {
1213			/* 0 or 1: should be handled before you get here */
1214			FATAL("internal error");
1215		} else {
1216			return replace_repeat(reptok, reptoklen, atom, atomlen,
1217				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1218		}
1219	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1220		if (firstnum == 0) {	/* {0} or {0,0} */
1221			/* This case is unusual because the resulting
1222			   replacement string might actually be SMALLER than
1223			   the original ERE */
1224			return replace_repeat(reptok, reptoklen, atom, atomlen,
1225					firstnum, secondnum, REPEAT_ZERO);
1226		} else {		/* (firstnum >= 1) */
1227			return replace_repeat(reptok, reptoklen, atom, atomlen,
1228					firstnum, secondnum, REPEAT_SIMPLE);
1229		}
1230	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1231		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1232		return replace_repeat(reptok, reptoklen, atom, atomlen,
1233					firstnum, secondnum, REPEAT_WITH_Q);
1234	} else {	/* Error - shouldn't be here (n>m) */
1235		FATAL("internal error");
1236	}
1237	return 0;
1238}
1239
1240int relex(void)		/* lexical analyzer for reparse */
1241{
1242	int c, n;
1243	int cflag;
1244	static uschar *buf = NULL;
1245	static int bufsz = 100;
1246	uschar *bp;
1247	const struct charclass *cc;
1248	int i;
1249	int num, m;
1250	bool commafound, digitfound;
1251	const uschar *startreptok;
1252	static int parens = 0;
1253
1254rescan:
1255	starttok = prestr;
1256
1257	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1258		prestr += n;
1259		starttok = prestr;
1260		return CHAR;
1261	}
1262
1263	switch (c = *prestr++) {
1264	case '|': return OR;
1265	case '*': return STAR;
1266	case '+': return PLUS;
1267	case '?': return QUEST;
1268	case '.': return DOT;
1269	case '\0': prestr--; return '\0';
1270	case '^':
1271	case '$':
1272		return c;
1273	case '(':
1274		parens++;
1275		return c;
1276	case ')':
1277		if (parens) {
1278			parens--;
1279			return c;
1280		}
1281		/* unmatched close parenthesis; per POSIX, treat as literal */
1282		rlxval = c;
1283		return CHAR;
1284	case '\\':
1285		rlxval = quoted(&prestr);
1286		return CHAR;
1287	default:
1288		rlxval = c;
1289		return CHAR;
1290	case '[':
1291		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1292			FATAL("out of space in reg expr %.10s..", lastre);
1293		bp = buf;
1294		if (*prestr == '^') {
1295			cflag = 1;
1296			prestr++;
1297		}
1298		else
1299			cflag = 0;
1300		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1301		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1302			FATAL("out of space for reg expr %.10s...", lastre);
1303		for (; ; ) {
1304			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1305				for (i = 0; i < n; i++)
1306					*bp++ = *prestr++;
1307				continue;
1308			}
1309			if ((c = *prestr++) == '\\') {
1310				*bp++ = '\\';
1311				if ((c = *prestr++) == '\0')
1312					FATAL("nonterminated character class %.20s...", lastre);
1313				*bp++ = c;
1314			/* } else if (c == '\n') { */
1315			/* 	FATAL("newline in character class %.20s...", lastre); */
1316			} else if (c == '[' && *prestr == ':') {
1317				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1318				for (cc = charclasses; cc->cc_name; cc++)
1319					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1320						break;
1321				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1322				    prestr[2 + cc->cc_namelen] == ']') {
1323					prestr += cc->cc_namelen + 3;
1324					/*
1325					 * BUG: We begin at 1, instead of 0, since we
1326					 * would otherwise prematurely terminate the
1327					 * string for classes like [[:cntrl:]]. This
1328					 * means that we can't match the NUL character,
1329					 * not without first adapting the entire
1330					 * program to track each string's length.
1331					 */
1332					for (i = 1; i <= UCHAR_MAX; i++) {
1333						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1334						    FATAL("out of space for reg expr %.10s...", lastre);
1335						if (cc->cc_func(i)) {
1336							/* escape backslash */
1337							if (i == '\\') {
1338								*bp++ = '\\';
1339								n++;
1340							}
1341
1342							*bp++ = i;
1343							n++;
1344						}
1345					}
1346				} else
1347					*bp++ = c;
1348			} else if (c == '[' && *prestr == '.') {
1349				char collate_char;
1350				prestr++;
1351				collate_char = *prestr++;
1352				if (*prestr == '.' && prestr[1] == ']') {
1353					prestr += 2;
1354					/* Found it: map via locale TBD: for
1355					   now, simply return this char.  This
1356					   is sufficient to pass conformance
1357					   test awk.ex 156
1358					 */
1359					if (*prestr == ']') {
1360						prestr++;
1361						rlxval = collate_char;
1362						return CHAR;
1363					}
1364				}
1365			} else if (c == '[' && *prestr == '=') {
1366				char equiv_char;
1367				prestr++;
1368				equiv_char = *prestr++;
1369				if (*prestr == '=' && prestr[1] == ']') {
1370					prestr += 2;
1371					/* Found it: map via locale TBD: for now
1372					   simply return this char. This is
1373					   sufficient to pass conformance test
1374					   awk.ex 156
1375					 */
1376					if (*prestr == ']') {
1377						prestr++;
1378						rlxval = equiv_char;
1379						return CHAR;
1380					}
1381				}
1382			} else if (c == '\0') {
1383				FATAL("nonterminated character class %.20s", lastre);
1384			} else if (bp == buf) {	/* 1st char is special */
1385				*bp++ = c;
1386			} else if (c == ']') {
1387				*bp++ = 0;
1388				rlxstr = (uschar *) tostring((char *) buf);
1389				if (cflag == 0)
1390					return CCL;
1391				else
1392					return NCCL;
1393			} else
1394				*bp++ = c;
1395		}
1396		break;
1397	case '{':
1398		if (isdigit(*(prestr))) {
1399			num = 0;	/* Process as a repetition */
1400			n = -1; m = -1;
1401			commafound = false;
1402			digitfound = false;
1403			startreptok = prestr-1;
1404			/* Remember start of previous atom here ? */
1405		} else {        	/* just a { char, not a repetition */
1406			rlxval = c;
1407			return CHAR;
1408                }
1409		for (; ; ) {
1410			if ((c = *prestr++) == '}') {
1411				if (commafound) {
1412					if (digitfound) { /* {n,m} */
1413						m = num;
1414						if (m < n)
1415							FATAL("illegal repetition expression: class %.20s",
1416								lastre);
1417						if (n == 0 && m == 1) {
1418							return QUEST;
1419						}
1420					} else {	/* {n,} */
1421						if (n == 0)
1422							return STAR;
1423						else if (n == 1)
1424							return PLUS;
1425					}
1426				} else {
1427					if (digitfound) { /* {n} same as {n,n} */
1428						n = num;
1429						m = num;
1430					} else {	/* {} */
1431						FATAL("illegal repetition expression: class %.20s",
1432							lastre);
1433					}
1434				}
1435				if (repeat(starttok, prestr-starttok, lastatom,
1436					   startreptok - lastatom, n, m) > 0) {
1437					if (n == 0 && m == 0) {
1438						return ZERO;
1439					}
1440					/* must rescan input for next token */
1441					goto rescan;
1442				}
1443				/* Failed to replace: eat up {...} characters
1444				   and treat like just PLUS */
1445				return PLUS;
1446			} else if (c == '\0') {
1447				FATAL("nonterminated character class %.20s",
1448					lastre);
1449			} else if (isdigit(c)) {
1450				num = 10 * num + c - '0';
1451				digitfound = true;
1452			} else if (c == ',') {
1453				if (commafound)
1454					FATAL("illegal repetition expression: class %.20s",
1455						lastre);
1456				/* looking for {n,} or {n,m} */
1457				commafound = true;
1458				n = num;
1459				digitfound = false; /* reset */
1460				num = 0;
1461			} else {
1462				FATAL("illegal repetition expression: class %.20s",
1463					lastre);
1464			}
1465		}
1466		break;
1467	}
1468}
1469
1470int cgoto(fa *f, int s, int c)
1471{
1472	int *p, *q;
1473	int i, j, k;
1474
1475	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1476	while (f->accept >= maxsetvec) {	/* guessing here! */
1477		resizesetvec(__func__);
1478	}
1479	for (i = 0; i <= f->accept; i++)
1480		setvec[i] = 0;
1481	setcnt = 0;
1482	resize_state(f, s);
1483	/* compute positions of gototab[s,c] into setvec */
1484	p = f->posns[s];
1485	for (i = 1; i <= *p; i++) {
1486		if ((k = f->re[p[i]].ltype) != FINAL) {
1487			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1488			 || (k == DOT && c != 0 && c != HAT)
1489			 || (k == ALL && c != 0)
1490			 || (k == EMPTYRE && c != 0)
1491			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1492			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1493				q = f->re[p[i]].lfollow;
1494				for (j = 1; j <= *q; j++) {
1495					if (q[j] >= maxsetvec) {
1496						resizesetvec(__func__);
1497					}
1498					if (setvec[q[j]] == 0) {
1499						setcnt++;
1500						setvec[q[j]] = 1;
1501					}
1502				}
1503			}
1504		}
1505	}
1506	/* determine if setvec is a previous state */
1507	tmpset[0] = setcnt;
1508	j = 1;
1509	for (i = f->accept; i >= 0; i--)
1510		if (setvec[i]) {
1511			tmpset[j++] = i;
1512		}
1513	resize_state(f, f->curstat > s ? f->curstat : s);
1514	/* tmpset == previous state? */
1515	for (i = 1; i <= f->curstat; i++) {
1516		p = f->posns[i];
1517		if ((k = tmpset[0]) != p[0])
1518			goto different;
1519		for (j = 1; j <= k; j++)
1520			if (tmpset[j] != p[j])
1521				goto different;
1522		/* setvec is state i */
1523		if (c != HAT)
1524			set_gototab(f, s, c, i);
1525		return i;
1526	  different:;
1527	}
1528
1529	/* add tmpset to current set of states */
1530	++(f->curstat);
1531	resize_state(f, f->curstat);
1532	clear_gototab(f, f->curstat);
1533	xfree(f->posns[f->curstat]);
1534	p = intalloc(setcnt + 1, __func__);
1535
1536	f->posns[f->curstat] = p;
1537	if (c != HAT)
1538		set_gototab(f, s, c, f->curstat);
1539	for (i = 0; i <= setcnt; i++)
1540		p[i] = tmpset[i];
1541	if (setvec[f->accept])
1542		f->out[f->curstat] = 1;
1543	else
1544		f->out[f->curstat] = 0;
1545	return f->curstat;
1546}
1547
1548
1549void freefa(fa *f)	/* free a finite automaton */
1550{
1551	int i;
1552
1553	if (f == NULL)
1554		return;
1555	for (i = 0; i < f->state_count; i++)
1556		xfree(f->gototab[i].entries);
1557	xfree(f->gototab);
1558	for (i = 0; i <= f->curstat; i++)
1559		xfree(f->posns[i]);
1560	for (i = 0; i <= f->accept; i++) {
1561		xfree(f->re[i].lfollow);
1562		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1563			xfree(f->re[i].lval.np);
1564	}
1565	xfree(f->restr);
1566	xfree(f->out);
1567	xfree(f->posns);
1568	xfree(f->gototab);
1569	xfree(f);
1570}
1571