1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#define	DEBUG
28
29#include <ctype.h>
30#include <stdio.h>
31#include <string.h>
32#include <stdlib.h>
33#include "awk.h"
34#include "ytab.h"
35#include <fcntl.h>
36
37#define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
38				/* NCHARS is 2**n */
39#define MAXLIN 22
40
41#define type(v)		(v)->nobj	/* badly overloaded here */
42#define info(v)		(v)->ntype	/* badly overloaded here */
43#define left(v)		(v)->narg[0]
44#define right(v)	(v)->narg[1]
45#define parent(v)	(v)->nnext
46
47#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
48#define ELEAF	case EMPTYRE:		/* empty string in regexp */
49#define UNARY	case STAR: case PLUS: case QUEST:
50
51/* encoding in tree Nodes:
52	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
53		left is index, right contains value or pointer to value
54	unary (STAR, PLUS, QUEST): left is child, right is null
55	binary (CAT, OR): left and right are children
56	parent contains pointer to parent
57*/
58
59
60int	*setvec;
61int	*tmpset;
62int	maxsetvec = 0;
63
64int	rtok;		/* next token in current re */
65int	rlxval;
66static uschar	*rlxstr;
67static uschar	*prestr;	/* current position in current re */
68static uschar	*lastre;	/* origin of last re */
69static uschar	*lastatom;	/* origin of last Atom */
70static uschar	*starttok;
71static char 	*basestr;	/* starts with original, replaced during
72				   repetition processing */
73static char 	*firstbasestr;
74
75static FILE * replogfile = 0;
76
77static	int setcnt;
78static	int poscnt;
79
80char	*patbeg;
81int	patlen;
82
83#define	NFA	20	/* cache this many dynamic fa's */
84fa	*fatab[NFA];
85int	nfatab	= 0;	/* entries in fatab */
86
87fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
88{
89	int i, use, nuse;
90	fa *pfa;
91	static int now = 1;
92
93	if (setvec == 0) {	/* first time through any RE */
94		maxsetvec = MAXLIN;
95		setvec = (int *) malloc(maxsetvec * sizeof(int));
96		tmpset = (int *) malloc(maxsetvec * sizeof(int));
97		if (setvec == 0 || tmpset == 0)
98			overflo("out of space initializing makedfa");
99	}
100
101	if (compile_time)	/* a constant for sure */
102		return mkdfa(s, anchor);
103	for (i = 0; i < nfatab; i++)	/* is it there already? */
104		if (fatab[i]->anchor == anchor
105		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
106			fatab[i]->use = now++;
107			return fatab[i];
108		}
109	pfa = mkdfa(s, anchor);
110	if (nfatab < NFA) {	/* room for another */
111		fatab[nfatab] = pfa;
112		fatab[nfatab]->use = now++;
113		nfatab++;
114		return pfa;
115	}
116	use = fatab[0]->use;	/* replace least-recently used */
117	nuse = 0;
118	for (i = 1; i < nfatab; i++)
119		if (fatab[i]->use < use) {
120			use = fatab[i]->use;
121			nuse = i;
122		}
123	freefa(fatab[nuse]);
124	fatab[nuse] = pfa;
125	pfa->use = now++;
126	return pfa;
127}
128
129fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
130				/* anchor = 1 for anchored matches, else 0 */
131{
132	Node *p, *p1;
133	fa *f;
134
135	firstbasestr = (char *)s;
136	basestr = firstbasestr;
137	if (replogfile==0) {
138		/*	disabled
139		replogfile = fopen("/tmp/repeatlog", "a");
140		*/
141	}
142	p = reparse(s);
143	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
144		/* put ALL STAR in front of reg.  exp. */
145	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
146		/* put FINAL after reg.  exp. */
147
148	poscnt = 0;
149	penter(p1);	/* enter parent pointers and leaf indices */
150	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
151		overflo("out of space for fa");
152	f->accept = poscnt-1;	/* penter has computed number of positions in re */
153	cfoll(f, p1);	/* set up follow sets */
154	freetr(p1);
155	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
156			overflo("out of space in makedfa");
157	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
158		overflo("out of space in makedfa");
159	*f->posns[1] = 0;
160	f->initstat = makeinit(f, anchor);
161	f->anchor = anchor;
162	f->restr = (uschar *) tostring(s);
163	if (replogfile) {
164		fflush(replogfile);
165		fclose(replogfile);
166		replogfile=0;
167	}
168	if (firstbasestr != basestr) {
169		if (basestr) free(basestr);
170	}
171	return f;
172}
173
174int makeinit(fa *f, int anchor)
175{
176	int i, k;
177
178	f->curstat = 2;
179	f->out[2] = 0;
180	f->reset = 0;
181	k = *(f->re[0].lfollow);
182	xfree(f->posns[2]);
183	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
184		overflo("out of space in makeinit");
185	for (i=0; i <= k; i++) {
186		(f->posns[2])[i] = (f->re[0].lfollow)[i];
187	}
188	if ((f->posns[2])[1] == f->accept)
189		f->out[2] = 1;
190	for (i=0; i < NCHARS; i++)
191		f->gototab[2][i] = 0;
192	f->curstat = cgoto(f, 2, HAT);
193	if (anchor) {
194		*f->posns[2] = k-1;	/* leave out position 0 */
195		for (i=0; i < k; i++) {
196			(f->posns[0])[i] = (f->posns[2])[i];
197		}
198
199		f->out[0] = f->out[2];
200		if (f->curstat != 2)
201			--(*f->posns[f->curstat]);
202	}
203	return f->curstat;
204}
205
206void penter(Node *p)	/* set up parent pointers and leaf indices */
207{
208	switch (type(p)) {
209	ELEAF
210	LEAF
211		info(p) = poscnt;
212		poscnt++;
213		break;
214	UNARY
215		penter(left(p));
216		parent(left(p)) = p;
217		break;
218	case CAT:
219	case OR:
220		penter(left(p));
221		penter(right(p));
222		parent(left(p)) = p;
223		parent(right(p)) = p;
224		break;
225	default:	/* can't happen */
226		FATAL("can't happen: unknown type %d in penter", type(p));
227		break;
228	}
229}
230
231void freetr(Node *p)	/* free parse tree */
232{
233	switch (type(p)) {
234	ELEAF
235	LEAF
236		xfree(p);
237		break;
238	UNARY
239		freetr(left(p));
240		xfree(p);
241		break;
242	case CAT:
243	case OR:
244		freetr(left(p));
245		freetr(right(p));
246		xfree(p);
247		break;
248	default:	/* can't happen */
249		FATAL("can't happen: unknown type %d in freetr", type(p));
250		break;
251	}
252}
253
254/* in the parsing of regular expressions, metacharacters like . have */
255/* to be seen literally;  \056 is not a metacharacter. */
256
257int hexstr(char **pp)	/* find and eval hex string at pp, return new p */
258{			/* only pick up one 8-bit byte (2 chars) */
259	uschar *p;
260	int n = 0;
261	int i;
262
263	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
264		if (isdigit(*p))
265			n = 16 * n + *p - '0';
266		else if (*p >= 'a' && *p <= 'f')
267			n = 16 * n + *p - 'a' + 10;
268		else if (*p >= 'A' && *p <= 'F')
269			n = 16 * n + *p - 'A' + 10;
270	}
271	*pp = (char *) p;
272	return n;
273}
274
275#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
276
277int quoted(char **pp)	/* pick up next thing after a \\ */
278			/* and increment *pp */
279{
280	char *p = *pp;
281	int c;
282
283	if ((c = *p++) == 't')
284		c = '\t';
285	else if (c == 'n')
286		c = '\n';
287	else if (c == 'f')
288		c = '\f';
289	else if (c == 'r')
290		c = '\r';
291	else if (c == 'b')
292		c = '\b';
293	else if (c == '\\')
294		c = '\\';
295	else if (c == 'x') {	/* hexadecimal goo follows */
296		c = hexstr(&p);	/* this adds a null if number is invalid */
297	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
298		int n = c - '0';
299		if (isoctdigit(*p)) {
300			n = 8 * n + *p++ - '0';
301			if (isoctdigit(*p))
302				n = 8 * n + *p++ - '0';
303		}
304		c = n;
305	} /* else */
306		/* c = c; */
307	*pp = p;
308	return c;
309}
310
311char *cclenter(const char *argp)	/* add a character class */
312{
313	int i, c, c2;
314	uschar *p = (uschar *) argp;
315	uschar *op, *bp;
316	static uschar *buf = 0;
317	static int bufsz = 100;
318
319	op = p;
320	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
321		FATAL("out of space for character class [%.10s...] 1", p);
322	bp = buf;
323	for (i = 0; (c = *p++) != 0; ) {
324		if (c == '\\') {
325			c = quoted((char **) &p);
326		} else if (c == '-' && i > 0 && bp[-1] != 0) {
327			if (*p != 0) {
328				c = bp[-1];
329				c2 = *p++;
330				if (c2 == '\\')
331					c2 = quoted((char **) &p);
332				if (c > c2) {	/* empty; ignore */
333					bp--;
334					i--;
335					continue;
336				}
337				while (c < c2) {
338					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
339						FATAL("out of space for character class [%.10s...] 2", p);
340					*bp++ = ++c;
341					i++;
342				}
343				continue;
344			}
345		}
346		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
347			FATAL("out of space for character class [%.10s...] 3", p);
348		*bp++ = c;
349		i++;
350	}
351	*bp = 0;
352	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
353	xfree(op);
354	return (char *) tostring((char *) buf);
355}
356
357void overflo(const char *s)
358{
359	FATAL("regular expression too big: %.30s...", s);
360}
361
362void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
363{
364	int i;
365	int *p;
366
367	switch (type(v)) {
368	ELEAF
369	LEAF
370		f->re[info(v)].ltype = type(v);
371		f->re[info(v)].lval.np = right(v);
372		while (f->accept >= maxsetvec) {	/* guessing here! */
373			maxsetvec *= 4;
374			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
375			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
376			if (setvec == 0 || tmpset == 0)
377				overflo("out of space in cfoll()");
378		}
379		for (i = 0; i <= f->accept; i++)
380			setvec[i] = 0;
381		setcnt = 0;
382		follow(v);	/* computes setvec and setcnt */
383		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
384			overflo("out of space building follow set");
385		f->re[info(v)].lfollow = p;
386		*p = setcnt;
387		for (i = f->accept; i >= 0; i--)
388			if (setvec[i] == 1)
389				*++p = i;
390		break;
391	UNARY
392		cfoll(f,left(v));
393		break;
394	case CAT:
395	case OR:
396		cfoll(f,left(v));
397		cfoll(f,right(v));
398		break;
399	default:	/* can't happen */
400		FATAL("can't happen: unknown type %d in cfoll", type(v));
401	}
402}
403
404int first(Node *p)	/* collects initially active leaves of p into setvec */
405			/* returns 0 if p matches empty string */
406{
407	int b, lp;
408
409	switch (type(p)) {
410	ELEAF
411	LEAF
412		lp = info(p);	/* look for high-water mark of subscripts */
413		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
414			maxsetvec *= 4;
415			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
416			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
417			if (setvec == 0 || tmpset == 0)
418				overflo("out of space in first()");
419		}
420		if (type(p) == EMPTYRE) {
421			setvec[lp] = 0;
422			return(0);
423		}
424		if (setvec[lp] != 1) {
425			setvec[lp] = 1;
426			setcnt++;
427		}
428		if (type(p) == CCL && (*(char *) right(p)) == '\0')
429			return(0);		/* empty CCL */
430		else return(1);
431	case PLUS:
432		if (first(left(p)) == 0) return(0);
433		return(1);
434	case STAR:
435	case QUEST:
436		first(left(p));
437		return(0);
438	case CAT:
439		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
440		return(1);
441	case OR:
442		b = first(right(p));
443		if (first(left(p)) == 0 || b == 0) return(0);
444		return(1);
445	}
446	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
447	return(-1);
448}
449
450void follow(Node *v)	/* collects leaves that can follow v into setvec */
451{
452	Node *p;
453
454	if (type(v) == FINAL)
455		return;
456	p = parent(v);
457	switch (type(p)) {
458	case STAR:
459	case PLUS:
460		first(v);
461		follow(p);
462		return;
463
464	case OR:
465	case QUEST:
466		follow(p);
467		return;
468
469	case CAT:
470		if (v == left(p)) {	/* v is left child of p */
471			if (first(right(p)) == 0) {
472				follow(p);
473				return;
474			}
475		} else		/* v is right child */
476			follow(p);
477		return;
478	}
479}
480
481int member(int c, const char *sarg)	/* is c in s? */
482{
483	uschar *s = (uschar *) sarg;
484
485	while (*s)
486		if (c == *s++)
487			return(1);
488	return(0);
489}
490
491int match(fa *f, const char *p0)	/* shortest match ? */
492{
493	int s, ns;
494	uschar *p = (uschar *) p0;
495
496	s = f->reset ? makeinit(f,0) : f->initstat;
497	if (f->out[s])
498		return(1);
499	do {
500		/* assert(*p < NCHARS); */
501		if ((ns = f->gototab[s][*p]) != 0)
502			s = ns;
503		else
504			s = cgoto(f, s, *p);
505		if (f->out[s])
506			return(1);
507	} while (*p++ != 0);
508	return(0);
509}
510
511int pmatch(fa *f, const char *p0)	/* longest match, for sub */
512{
513	int s, ns;
514	uschar *p = (uschar *) p0;
515	uschar *q;
516	int i, k;
517
518	/* s = f->reset ? makeinit(f,1) : f->initstat; */
519	if (f->reset) {
520		f->initstat = s = makeinit(f,1);
521	} else {
522		s = f->initstat;
523	}
524	patbeg = (char *) p;
525	patlen = -1;
526	do {
527		q = p;
528		do {
529			if (f->out[s])		/* final state */
530				patlen = q-p;
531			/* assert(*q < NCHARS); */
532			if ((ns = f->gototab[s][*q]) != 0)
533				s = ns;
534			else
535				s = cgoto(f, s, *q);
536			if (s == 1) {	/* no transition */
537				if (patlen >= 0) {
538					patbeg = (char *) p;
539					return(1);
540				}
541				else
542					goto nextin;	/* no match */
543			}
544		} while (*q++ != 0);
545		if (f->out[s])
546			patlen = q-p-1;	/* don't count $ */
547		if (patlen >= 0) {
548			patbeg = (char *) p;
549			return(1);
550		}
551	nextin:
552		s = 2;
553		if (f->reset) {
554			for (i = 2; i <= f->curstat; i++)
555				xfree(f->posns[i]);
556			k = *f->posns[0];
557			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
558				overflo("out of space in pmatch");
559			for (i = 0; i <= k; i++)
560				(f->posns[2])[i] = (f->posns[0])[i];
561			f->initstat = f->curstat = 2;
562			f->out[2] = f->out[0];
563			for (i = 0; i < NCHARS; i++)
564				f->gototab[2][i] = 0;
565		}
566	} while (*p++ != 0);
567	return (0);
568}
569
570int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
571{
572	int s, ns;
573	uschar *p = (uschar *) p0;
574	uschar *q;
575	int i, k;
576
577	/* s = f->reset ? makeinit(f,1) : f->initstat; */
578	if (f->reset) {
579		f->initstat = s = makeinit(f,1);
580	} else {
581		s = f->initstat;
582	}
583	patlen = -1;
584	while (*p) {
585		q = p;
586		do {
587			if (f->out[s])		/* final state */
588				patlen = q-p;
589			/* assert(*q < NCHARS); */
590			if ((ns = f->gototab[s][*q]) != 0)
591				s = ns;
592			else
593				s = cgoto(f, s, *q);
594			if (s == 1) {	/* no transition */
595				if (patlen > 0) {
596					patbeg = (char *) p;
597					return(1);
598				} else
599					goto nnextin;	/* no nonempty match */
600			}
601		} while (*q++ != 0);
602		if (f->out[s])
603			patlen = q-p-1;	/* don't count $ */
604		if (patlen > 0 ) {
605			patbeg = (char *) p;
606			return(1);
607		}
608	nnextin:
609		s = 2;
610		if (f->reset) {
611			for (i = 2; i <= f->curstat; i++)
612				xfree(f->posns[i]);
613			k = *f->posns[0];
614			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
615				overflo("out of state space");
616			for (i = 0; i <= k; i++)
617				(f->posns[2])[i] = (f->posns[0])[i];
618			f->initstat = f->curstat = 2;
619			f->out[2] = f->out[0];
620			for (i = 0; i < NCHARS; i++)
621				f->gototab[2][i] = 0;
622		}
623		p++;
624	}
625	return (0);
626}
627
628Node *reparse(const char *p)	/* parses regular expression pointed to by p */
629{			/* uses relex() to scan regular expression */
630	Node *np;
631
632	dprintf( ("reparse <%s>\n", p) );
633	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
634	rtok = relex();
635	/* GNU compatibility: an empty regexp matches anything */
636	if (rtok == '\0') {
637		/* FATAL("empty regular expression"); previous */
638		return(op2(EMPTYRE, NIL, NIL));
639	}
640	np = regexp();
641	if (rtok != '\0')
642		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
643	return(np);
644}
645
646Node *regexp(void)	/* top-level parse of reg expr */
647{
648	return (alt(concat(primary())));
649}
650
651Node *primary(void)
652{
653	Node *np;
654	int savelastatom;
655
656	switch (rtok) {
657	case CHAR:
658		lastatom = starttok;
659		np = op2(CHAR, NIL, itonp(rlxval));
660		rtok = relex();
661		return (unary(np));
662	case ALL:
663		rtok = relex();
664		return (unary(op2(ALL, NIL, NIL)));
665	case EMPTYRE:
666		if (replogfile) {
667			fprintf(replogfile,
668				"returned EMPTYRE from primary\n");
669			fflush(replogfile);
670		}
671		rtok = relex();
672
673		return (unary(op2(EMPTYRE, NIL, NIL)));
674	case DOT:
675		lastatom = starttok;
676		rtok = relex();
677		return (unary(op2(DOT, NIL, NIL)));
678	case CCL:
679		lastatom = starttok;
680		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
681		rtok = relex();
682		return (unary(np));
683	case NCCL:
684		lastatom = starttok;
685		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
686		rtok = relex();
687		return (unary(np));
688	case '^':
689		rtok = relex();
690		return (unary(op2(CHAR, NIL, itonp(HAT))));
691	case '$':
692		rtok = relex();
693		return (unary(op2(CHAR, NIL, NIL)));
694	case '(':
695		lastatom = starttok;
696		savelastatom = (char *)starttok-basestr; /* Retain over recursion */
697		rtok = relex();
698		if (rtok == ')') {	/* special pleading for () */
699			rtok = relex();
700			return unary(op2(CCL, NIL, (Node *) tostring("")));
701		}
702		np = regexp();
703		if (rtok == ')') {
704			lastatom = basestr+savelastatom; /* Restore */
705			rtok = relex();
706			return (unary(np));
707		}
708		else
709			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
710	default:
711		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
712	}
713	return 0;	/*NOTREACHED*/
714}
715
716Node *concat(Node *np)
717{
718	switch (rtok) {
719	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
720		return (concat(op2(CAT, np, primary())));
721	case EMPTYRE:
722		if (replogfile) {
723			fprintf(replogfile,
724				"returned EMPTYRE to concat\n");
725			fflush(replogfile);
726		}
727		rtok = relex();
728		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
729				primary())));
730	}
731	return (np);
732}
733
734Node *alt(Node *np)
735{
736	if (rtok == OR) {
737		rtok = relex();
738		return (alt(op2(OR, np, concat(primary()))));
739	}
740	return (np);
741}
742
743Node *unary(Node *np)
744{
745	switch (rtok) {
746	case STAR:
747		rtok = relex();
748		return (unary(op2(STAR, np, NIL)));
749	case PLUS:
750		rtok = relex();
751		return (unary(op2(PLUS, np, NIL)));
752	case QUEST:
753		rtok = relex();
754		return (unary(op2(QUEST, np, NIL)));
755	default:
756		return (np);
757	}
758}
759
760/*
761 * Character class definitions conformant to the POSIX locale as
762 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
763 * and operating character sets are both ASCII (ISO646) or supersets
764 * thereof.
765 *
766 * Note that to avoid overflowing the temporary buffer used in
767 * relex(), the expanded character class (prior to range expansion)
768 * must be less than twice the size of their full name.
769 */
770
771/* Because isblank doesn't show up in any of the header files on any
772 * system i use, it's defined here.  if some other locale has a richer
773 * definition of "blank", define HAS_ISBLANK and provide your own
774 * version.
775 * the parentheses here are an attempt to find a path through the maze
776 * of macro definition and/or function and/or version provided.  thanks
777 * to nelson beebe for the suggestion; let's see if it works everywhere.
778 */
779
780#if defined(__APPLE__)
781#define HAS_ISBLANK
782#endif
783#ifndef HAS_ISBLANK
784
785int (isblank)(int c)
786{
787	return c==' ' || c=='\t';
788}
789
790#endif
791
792struct charclass {
793	const char *cc_name;
794	int cc_namelen;
795	int (*cc_func)(int);
796} charclasses[] = {
797	{ "alnum",	5,	isalnum },
798	{ "alpha",	5,	isalpha },
799	{ "blank",	5,	isblank },
800	{ "cntrl",	5,	iscntrl },
801	{ "digit",	5,	isdigit },
802	{ "graph",	5,	isgraph },
803	{ "lower",	5,	islower },
804	{ "print",	5,	isprint },
805	{ "punct",	5,	ispunct },
806	{ "space",	5,	isspace },
807	{ "upper",	5,	isupper },
808	{ "xdigit",	6,	isxdigit },
809	{ NULL,		0,	NULL },
810};
811
812#define REPEAT_SIMPLE		0
813#define REPEAT_PLUS_APPENDED	1
814#define REPEAT_WITH_Q		2
815#define REPEAT_ZERO		3
816
817int replace_repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen,
818			int firstnum, int secondnum, int special_case)
819{
820	int i, j;
821	uschar *buf = 0;
822	int ret = 1;
823	int init_q = (firstnum==0);		/* first added char will be ? */
824	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
825	int prefix_length = (char *) reptok-basestr;	/* prefix includes first rep	*/
826	int suffix_length = strlen(reptok) - reptoklen;	/* string after rep specifier	*/
827	int size = prefix_length +  suffix_length;
828
829	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
830		size += atomlen*(firstnum-1);
831	}
832
833	/* Adjust size of buffer for special cases */
834	if (special_case == REPEAT_PLUS_APPENDED) {
835		size++;		/* for the final + */
836	} else if (special_case == REPEAT_WITH_Q) {
837		size += init_q + (atomlen+1)* n_q_reps;
838	} else if (special_case == REPEAT_ZERO) {
839		size += 2;	/* just a null ERE: () */
840	}
841	if ((buf = (uschar *) malloc(size+1)) == NULL)
842		FATAL("out of space in reg expr %.10s..", lastre);
843	if (replogfile) {
844		fprintf(replogfile, "re before: len=%d,%s\n"
845				    "         : init_q=%d,n_q_reps=%d\n",
846				strlen(basestr),basestr,
847				init_q,n_q_reps);
848		fprintf(replogfile, "re prefix_length=%d,atomlen=%d\n",
849				prefix_length,atomlen);
850/*
851		fprintf(replogfile, " new buf size: %d, atom=%s, atomlen=%d\n",
852				size, atom, atomlen);
853*/
854		fflush(replogfile);
855	}
856	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
857	j = prefix_length;
858	if (special_case == REPEAT_ZERO) {
859		j -= atomlen;
860		buf[j++] = '(';
861		buf[j++] = ')';
862	}
863	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
864		memcpy(&buf[j], atom, atomlen);
865		j += atomlen;
866	}
867	if (special_case == REPEAT_PLUS_APPENDED) {
868		buf[j++] = '+';
869	} else if (special_case == REPEAT_WITH_Q) {
870		if (init_q) buf[j++] = '?';
871		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
872			memcpy(&buf[j], atom, atomlen);
873			j += atomlen;
874			buf[j++] = '?';
875		}
876	}
877	memcpy(&buf[j], reptok+reptoklen, suffix_length);
878	if (special_case == REPEAT_ZERO) {
879		buf[j+suffix_length] = '\0';
880	} else {
881		buf[size] = '\0';
882	}
883	if (replogfile) {
884		fprintf(replogfile, "re after : len=%d,%s\n",strlen(buf),buf);
885		fflush(replogfile);
886	}
887	/* free old basestr */
888	if (firstbasestr != basestr) {
889		if (basestr) free(basestr);
890	}
891	basestr = (char *)buf;
892	prestr  = buf + prefix_length;
893	if (special_case == REPEAT_ZERO) {
894		prestr  -= atomlen;
895		ret++;
896	}
897	return ret;
898}
899
900int repeat(uschar * reptok, int reptoklen, uschar * atom, int atomlen,
901		int firstnum, int secondnum)
902{
903	/*
904	   In general, the repetition specifier or "bound" is replaced here
905	   by an equivalent ERE string, repeating the immediately previous atom
906	   and appending ? and + as needed. Note that the first copy of the
907	   atom is left in place, except in the special_case of a zero-repeat
908	   (i.e., {0}).
909	 */
910	int i, j;
911	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
912		if (firstnum < 2) {
913			/* 0 or 1: should be handled before you get here */
914			if (replogfile) {
915				fprintf(replogfile,
916					"{%d, %d}, shouldn't be here\n",
917					firstnum, secondnum);
918				fflush(replogfile);
919			}
920		} else {
921			return replace_repeat(reptok, reptoklen, atom, atomlen,
922				firstnum, secondnum, REPEAT_PLUS_APPENDED);
923		}
924	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
925		if (firstnum == 0) {	/* {0} or {0,0} */
926			/* This case is unusual because the resulting
927			   replacement string might actually be SMALLER than
928			   the original ERE */
929			return replace_repeat(reptok, reptoklen, atom, atomlen,
930					firstnum, secondnum, REPEAT_ZERO);
931		} else {		/* (firstnum >= 1) */
932			return replace_repeat(reptok, reptoklen, atom, atomlen,
933					firstnum, secondnum, REPEAT_SIMPLE);
934		}
935	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
936		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
937		return replace_repeat(reptok, reptoklen, atom, atomlen,
938					firstnum, secondnum, REPEAT_WITH_Q);
939	} else {	/* Error - shouldn't be here (n>m) */
940		if (replogfile) {
941			fprintf(replogfile,
942				"illegal ERE {%d,%d} shouldn't be here!\n",
943				firstnum,secondnum);
944			fflush(replogfile);
945		}
946	}
947	return 0;
948}
949
950int relex(void)		/* lexical analyzer for reparse */
951{
952	int c, n;
953	int cflag;
954	static uschar *buf = 0;
955	static int bufsz = 100;
956	uschar *bp;
957	struct charclass *cc;
958	int i;
959	int num, m, commafound, digitfound;
960	uschar *startreptok;
961
962rescan:
963	starttok = prestr;
964
965	switch (c = *prestr++) {
966	case '|': return OR;
967	case '*': return STAR;
968	case '+': return PLUS;
969	case '?': return QUEST;
970	case '.': return DOT;
971	case '\0': prestr--; return '\0';
972	case '^':
973	case '$':
974	case '(':
975	case ')':
976		return c;
977	case '\\':
978		rlxval = quoted((char **) &prestr);
979		return CHAR;
980	default:
981		rlxval = c;
982		return CHAR;
983	case '[':
984		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
985			FATAL("out of space in reg expr %.10s..", lastre);
986		bp = buf;
987		if (*prestr == '^') {
988			cflag = 1;
989			prestr++;
990		}
991		else
992			cflag = 0;
993		n = 2 * strlen((const char *) prestr)+1;
994		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
995			FATAL("out of space for reg expr %.10s...", lastre);
996		for (; ; ) {
997			if ((c = *prestr++) == '\\') {
998				*bp++ = '\\';
999				if ((c = *prestr++) == '\0')
1000					FATAL("nonterminated character class %.20s...", lastre);
1001				*bp++ = c;
1002			/* } else if (c == '\n') { */
1003			/* 	FATAL("newline in character class %.20s...", lastre); */
1004			} else if (c == '[' && *prestr == ':') {
1005				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1006				for (cc = charclasses; cc->cc_name; cc++)
1007					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1008						break;
1009				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1010				    prestr[2 + cc->cc_namelen] == ']') {
1011					prestr += cc->cc_namelen + 3;
1012					for (i = 0; i < NCHARS; i++) {
1013						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1014						    FATAL("out of space for reg expr %.10s...", lastre);
1015						if (cc->cc_func(i)) {
1016							*bp++ = i;
1017							n++;
1018						}
1019					}
1020				} else
1021					*bp++ = c;
1022			} else if (Unix2003_compat && c == '['
1023						   && *prestr == '.') {
1024				char collate_char;
1025				prestr++;
1026				collate_char = *prestr++;
1027				if (*prestr == '.' && prestr[1] == ']') {
1028					prestr += 2;
1029					/* Found it: map via locale TBD: for
1030					   now, simply return this char.  This
1031					   is sufficient to pass conformance
1032					   test awk.ex 156
1033					 */
1034					if (*prestr == ']') {
1035						prestr++;
1036						rlxval = collate_char;
1037						if (replogfile) {
1038							fprintf(replogfile,
1039								"[..] collate char=%c\n",
1040								collate_char);
1041							fflush(replogfile);
1042						}
1043						return CHAR;
1044					}
1045				}
1046			} else if (Unix2003_compat && c == '['
1047						   && *prestr == '=') {
1048				char equiv_char;
1049				prestr++;
1050				equiv_char = *prestr++;
1051				if (*prestr == '=' && prestr[1] == ']') {
1052					prestr += 2;
1053					/* Found it: map via locale TBD: for now
1054					   simply return this char. This is
1055					   sufficient to pass conformance test
1056					   awk.ex 156
1057					 */
1058					if (*prestr == ']') {
1059						prestr++;
1060						rlxval = equiv_char;
1061						if (replogfile) {
1062							fprintf(replogfile,
1063								"[==] equiv char=%c\n",
1064								equiv_char);
1065							fflush(replogfile);
1066						}
1067						return CHAR;
1068					}
1069				}
1070			} else if (c == '\0') {
1071				FATAL("nonterminated character class %.20s", lastre);
1072			} else if (bp == buf) {	/* 1st char is special */
1073				*bp++ = c;
1074			} else if (c == ']') {
1075				*bp++ = 0;
1076				rlxstr = (uschar *) tostring((char *) buf);
1077				if (replogfile) {
1078					fprintf(replogfile,
1079					"detecting []: cflag=%d, len=%d,%s\n",
1080						cflag,strlen(rlxstr),rlxstr);
1081					fflush(replogfile);
1082				}
1083				if (cflag == 0)
1084					return CCL;
1085				else
1086					return NCCL;
1087			} else
1088				*bp++ = c;
1089		}
1090		break;
1091	case '{':
1092		if (Unix2003_compat && isdigit(*(prestr))) {
1093			num = 0;	/* Process as a repetition */
1094			n = -1; m = -1;
1095			commafound = 0;
1096			digitfound = 0;
1097			startreptok = prestr-1;
1098			/* Remember start of previous atom here ? */
1099	 	} else {        	/* just a { char, not a repetition */
1100			rlxval = c;
1101			return CHAR;
1102                }
1103		for (; ; ) {
1104			if ((c = *prestr++) == '}') {
1105				if (commafound) {
1106					if (digitfound) { /* {n,m} */
1107						m = num;
1108						if (m<n)
1109							FATAL("illegal repetition expression: class %.20s",
1110								lastre);
1111						if ((n==0) && (m==1)) {
1112							return QUEST;
1113						}
1114					} else {	/* {n,} */
1115						if (n==0) return STAR;
1116						if (n==1) return PLUS;
1117					}
1118				} else {
1119					if (digitfound) { /* {n} same as {n,n} */
1120						n = num;
1121						m = num;
1122					} else {	/* {} */
1123						FATAL("illegal repetition expression: class %.20s",
1124							lastre);
1125					}
1126				}
1127				if (repeat(starttok, prestr-starttok, lastatom,
1128					   startreptok - lastatom, n, m) > 0) {
1129					if ((n==0) && (m==0)) {
1130						return EMPTYRE;
1131					}
1132					/* must rescan input for next token */
1133					goto rescan;
1134				}
1135				/* Failed to replace: eat up {...} characters
1136				   and treat like just PLUS */
1137				return PLUS;
1138			} else if (c == '\0') {
1139				FATAL("nonterminated character class %.20s",
1140					lastre);
1141			} else if (isdigit(c)) {
1142				num = 10 * num + c - '0';
1143				digitfound = 1;
1144			} else if (c == ',') {
1145				if (commafound)
1146					FATAL("illegal repetition expression: class %.20s",
1147						lastre);
1148				/* looking for {n,} or {n,m} */
1149				commafound = 1;
1150				n = num;
1151				digitfound = 0; /* reset */
1152				num = 0;
1153			} else {
1154				FATAL("illegal repetition expression: class %.20s",
1155					lastre);
1156			}
1157		}
1158		break;
1159	}
1160}
1161
1162int cgoto(fa *f, int s, int c)
1163{
1164	int i, j, k;
1165	int *p, *q;
1166
1167	assert(c == HAT || c < NCHARS);
1168	while (f->accept >= maxsetvec) {	/* guessing here! */
1169		maxsetvec *= 4;
1170		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1171		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1172		if (setvec == 0 || tmpset == 0)
1173			overflo("out of space in cgoto()");
1174	}
1175	for (i = 0; i <= f->accept; i++)
1176		setvec[i] = 0;
1177	setcnt = 0;
1178	/* compute positions of gototab[s,c] into setvec */
1179	p = f->posns[s];
1180	for (i = 1; i <= *p; i++) {
1181		if ((k = f->re[p[i]].ltype) != FINAL) {
1182			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1183			 || (k == DOT && c != 0 && c != HAT)
1184			 || (k == ALL && c != 0)
1185			 || (k == EMPTYRE && c != 0)
1186			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1187			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1188				q = f->re[p[i]].lfollow;
1189				for (j = 1; j <= *q; j++) {
1190					if (q[j] >= maxsetvec) {
1191						maxsetvec *= 4;
1192						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1193						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
1194						if (setvec == 0 || tmpset == 0)
1195							overflo("cgoto overflow");
1196					}
1197					if (setvec[q[j]] == 0) {
1198						setcnt++;
1199						setvec[q[j]] = 1;
1200					}
1201				}
1202			}
1203		}
1204	}
1205	/* determine if setvec is a previous state */
1206	tmpset[0] = setcnt;
1207	j = 1;
1208	for (i = f->accept; i >= 0; i--)
1209		if (setvec[i]) {
1210			tmpset[j++] = i;
1211		}
1212	/* tmpset == previous state? */
1213	for (i = 1; i <= f->curstat; i++) {
1214		p = f->posns[i];
1215		if ((k = tmpset[0]) != p[0])
1216			goto different;
1217		for (j = 1; j <= k; j++)
1218			if (tmpset[j] != p[j])
1219				goto different;
1220		/* setvec is state i */
1221		f->gototab[s][c] = i;
1222		return i;
1223	  different:;
1224	}
1225
1226	/* add tmpset to current set of states */
1227	if (f->curstat >= NSTATES-1) {
1228		f->curstat = 2;
1229		f->reset = 1;
1230		for (i = 2; i < NSTATES; i++)
1231			xfree(f->posns[i]);
1232	} else
1233		++(f->curstat);
1234	for (i = 0; i < NCHARS; i++)
1235		f->gototab[f->curstat][i] = 0;
1236	xfree(f->posns[f->curstat]);
1237	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1238		overflo("out of space in cgoto");
1239
1240	f->posns[f->curstat] = p;
1241	f->gototab[s][c] = f->curstat;
1242	for (i = 0; i <= setcnt; i++)
1243		p[i] = tmpset[i];
1244	if (setvec[f->accept])
1245		f->out[f->curstat] = 1;
1246	else
1247		f->out[f->curstat] = 0;
1248	return f->curstat;
1249}
1250
1251
1252void freefa(fa *f)	/* free a finite automaton */
1253{
1254	int i;
1255
1256	if (f == NULL)
1257		return;
1258	for (i = 0; i <= f->curstat; i++)
1259		xfree(f->posns[i]);
1260	for (i = 0; i <= f->accept; i++) {
1261		xfree(f->re[i].lfollow);
1262		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1263			xfree((f->re[i].lval.np));
1264	}
1265	xfree(f->restr);
1266	xfree(f);
1267}
1268