1/*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results.  The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#include "regguts.h"
34
35/*
36 * forward declarations, up here so forward datatypes etc. are defined early
37 */
38/* =====^!^===== begin forwards =====^!^===== */
39/* automatically gathered by fwd; do not hand-edit */
40/* === regcomp.c === */
41int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
42static VOID moresubs _ANSI_ARGS_((struct vars *, int));
43static int freev _ANSI_ARGS_((struct vars *, int));
44static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
45static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
46static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
47static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
48static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
49static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
50static int scannum _ANSI_ARGS_((struct vars *));
51static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
52static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
53static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
54static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
55static chr *scanplain _ANSI_ARGS_((struct vars *));
56static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
57static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
58static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
59static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
60static VOID wordchrs _ANSI_ARGS_((struct vars *));
61static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
62static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
63static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
64static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
65static int numst _ANSI_ARGS_((struct subre *, int));
66static VOID markst _ANSI_ARGS_((struct subre *));
67static VOID cleanst _ANSI_ARGS_((struct vars *));
68static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
69static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
70static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
71static VOID freelacons _ANSI_ARGS_((struct subre *, int));
72static VOID rfree _ANSI_ARGS_((regex_t *));
73static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
74static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
75static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
76static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
77/* === regc_lex.c === */
78static VOID lexstart _ANSI_ARGS_((struct vars *));
79static VOID prefixes _ANSI_ARGS_((struct vars *));
80static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
81static VOID lexword _ANSI_ARGS_((struct vars *));
82static int next _ANSI_ARGS_((struct vars *));
83static int lexescape _ANSI_ARGS_((struct vars *));
84static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
85static int brenext _ANSI_ARGS_((struct vars *, pchr));
86static VOID skip _ANSI_ARGS_((struct vars *));
87static chr newline _ANSI_ARGS_((NOPARMS));
88#ifdef REG_DEBUG
89static chr *ch _ANSI_ARGS_((NOPARMS));
90#endif
91static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
92/* === regc_color.c === */
93static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
94static VOID freecm _ANSI_ARGS_((struct colormap *));
95static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
96static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
97static color maxcolor _ANSI_ARGS_((struct colormap *));
98static color newcolor _ANSI_ARGS_((struct colormap *));
99static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
100static color pseudocolor _ANSI_ARGS_((struct colormap *));
101static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
102static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
103static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
104static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
105static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
106static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
107static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
108static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
109static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
110static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
111#ifdef REG_DEBUG
112static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
113static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
114static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
115#endif
116/* === regc_nfa.c === */
117static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
118static VOID freenfa _ANSI_ARGS_((struct nfa *));
119static struct state *newstate _ANSI_ARGS_((struct nfa *));
120static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
121static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
122static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
123static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
124static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
125static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
126static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
127static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
128static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
129static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
130static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
131static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
132static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
133static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
134static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
135static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
136static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
137static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
138static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
139static VOID specialcolors _ANSI_ARGS_((struct nfa *));
140static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
141static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
142static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
143static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
144static int push _ANSI_ARGS_((struct nfa *, struct arc *));
145#define	INCOMPATIBLE	1	/* destroys arc */
146#define	SATISFIED	2	/* constraint satisfied */
147#define	COMPATIBLE	3	/* compatible but not satisfied yet */
148static int combine _ANSI_ARGS_((struct arc *, struct arc *));
149static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
150static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
151static VOID cleanup _ANSI_ARGS_((struct nfa *));
152static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
153static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
154static long analyze _ANSI_ARGS_((struct nfa *));
155static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
156static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
157static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
158static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
159#ifdef REG_DEBUG
160static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
161static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
162static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
163static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
164#endif
165static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
166#ifdef REG_DEBUG
167static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
168#endif
169/* === regc_cvec.c === */
170static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
171static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
172static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
173static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
174static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
175static int haschr _ANSI_ARGS_((struct cvec *, pchr));
176static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
177static VOID freecvec _ANSI_ARGS_((struct cvec *));
178/* === regc_locale.c === */
179static int nmcces _ANSI_ARGS_((struct vars *));
180static int nleaders _ANSI_ARGS_((struct vars *));
181static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
182static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
183static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
184static int before _ANSI_ARGS_((celt, celt));
185static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
186static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
187static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
188static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
189static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
190/* automatically gathered by fwd; do not hand-edit */
191/* =====^!^===== end forwards =====^!^===== */
192
193
194
195/* internal variables, bundled for easy passing around */
196struct vars {
197	regex_t *re;
198	chr *now;		/* scan pointer into string */
199	chr *stop;		/* end of string */
200	chr *savenow;		/* saved now and stop for "subroutine call" */
201	chr *savestop;
202	int err;		/* error code (0 if none) */
203	int cflags;		/* copy of compile flags */
204	int lasttype;		/* type of previous token */
205	int nexttype;		/* type of next token */
206	chr nextvalue;		/* value (if any) of next token */
207	int lexcon;		/* lexical context type (see lex.c) */
208	int nsubexp;		/* subexpression count */
209	struct subre **subs;	/* subRE pointer vector */
210	size_t nsubs;		/* length of vector */
211	struct subre *sub10[10];	/* initial vector, enough for most */
212	struct nfa *nfa;	/* the NFA */
213	struct colormap *cm;	/* character color map */
214	color nlcolor;		/* color of newline */
215	struct state *wordchrs;	/* state in nfa holding word-char outarcs */
216	struct subre *tree;	/* subexpression tree */
217	struct subre *treechain;	/* all tree nodes allocated */
218	struct subre *treefree;		/* any free tree nodes */
219	int ntree;		/* number of tree nodes */
220	struct cvec *cv;	/* interface cvec */
221	struct cvec *cv2;	/* utility cvec */
222	struct cvec *mcces;	/* collating-element information */
223#		define	ISCELEADER(v,c)	(v->mcces != NULL && haschr(v->mcces, (c)))
224	struct state *mccepbegin;	/* in nfa, start of MCCE prototypes */
225	struct state *mccepend;	/* in nfa, end of MCCE prototypes */
226	struct subre *lacons;	/* lookahead-constraint vector */
227	int nlacons;		/* size of lacons */
228};
229
230/* parsing macros; most know that `v' is the struct vars pointer */
231#define	NEXT()	(next(v))		/* advance by one token */
232#define	SEE(t)	(v->nexttype == (t))	/* is next token this? */
233#define	EAT(t)	(SEE(t) && next(v))	/* if next is this, swallow it */
234#define	VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
235#define	ISERR()	VISERR(v)
236#define	VERR(vv,e)	((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
237							((vv)->err = (e)))
238#define	ERR(e)	VERR(v, e)		/* record an error */
239#define	NOERR()	{if (ISERR()) return;}	/* if error seen, return */
240#define	NOERRN()	{if (ISERR()) return NULL;}	/* NOERR with retval */
241#define	NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
242#define	INSIST(c, e)	((c) ? 0 : ERR(e))	/* if condition false, error */
243#define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
244#define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
245
246/* token type codes, some also used as NFA arc types */
247#define	EMPTY	'n'		/* no token present */
248#define	EOS	'e'		/* end of string */
249#define	PLAIN	'p'		/* ordinary character */
250#define	DIGIT	'd'		/* digit (in bound) */
251#define	BACKREF	'b'		/* back reference */
252#define	COLLEL	'I'		/* start of [. */
253#define	ECLASS	'E'		/* start of [= */
254#define	CCLASS	'C'		/* start of [: */
255#define	END	'X'		/* end of [. [= [: */
256#define	RANGE	'R'		/* - within [] which might be range delim. */
257#define	LACON	'L'		/* lookahead constraint subRE */
258#define	AHEAD	'a'		/* color-lookahead arc */
259#define	BEHIND	'r'		/* color-lookbehind arc */
260#define	WBDRY	'w'		/* word boundary constraint */
261#define	NWBDRY	'W'		/* non-word-boundary constraint */
262#define	SBEGIN	'A'		/* beginning of string (even if not BOL) */
263#define	SEND	'Z'		/* end of string (even if not EOL) */
264#define	PREFER	'P'		/* length preference */
265
266/* is an arc colored, and hence on a color chain? */
267#define	COLORED(a)	((a)->type == PLAIN || (a)->type == AHEAD || \
268							(a)->type == BEHIND)
269
270
271
272/* static function list */
273static struct fns functions = {
274	rfree,			/* regfree insides */
275};
276
277
278
279/*
280 - compile - compile regular expression
281 ^ int compile(regex_t *, CONST chr *, size_t, int);
282 */
283int
284compile(re, string, len, flags)
285regex_t *re;
286CONST chr *string;
287size_t len;
288int flags;
289{
290	struct vars var;
291	struct vars *v = &var;
292	struct guts *g;
293	int i;
294	size_t j;
295	FILE *debug = (flags&REG_PROGRESS) ? stdout : (FILE *)NULL;
296#	define	CNOERR()	{ if (ISERR()) return freev(v, v->err); }
297
298	/* sanity checks */
299
300	if (re == NULL || string == NULL)
301		return REG_INVARG;
302	if ((flags&REG_QUOTE) &&
303			(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
304		return REG_INVARG;
305	if (!(flags&REG_EXTENDED) && (flags&REG_ADVF))
306		return REG_INVARG;
307
308	/* initial setup (after which freev() is callable) */
309	v->re = re;
310	v->now = (chr *)string;
311	v->stop = v->now + len;
312	v->savenow = v->savestop = NULL;
313	v->err = 0;
314	v->cflags = flags;
315	v->nsubexp = 0;
316	v->subs = v->sub10;
317	v->nsubs = 10;
318	for (j = 0; j < v->nsubs; j++)
319		v->subs[j] = NULL;
320	v->nfa = NULL;
321	v->cm = NULL;
322	v->nlcolor = COLORLESS;
323	v->wordchrs = NULL;
324	v->tree = NULL;
325	v->treechain = NULL;
326	v->treefree = NULL;
327	v->cv = NULL;
328	v->cv2 = NULL;
329	v->mcces = NULL;
330	v->lacons = NULL;
331	v->nlacons = 0;
332	re->re_magic = REMAGIC;
333	re->re_info = 0;		/* bits get set during parse */
334	re->re_csize = sizeof(chr);
335	re->re_guts = NULL;
336	re->re_fns = VS(&functions);
337
338	/* more complex setup, malloced things */
339	re->re_guts = VS(MALLOC(sizeof(struct guts)));
340	if (re->re_guts == NULL)
341		return freev(v, REG_ESPACE);
342	g = (struct guts *)re->re_guts;
343	g->tree = NULL;
344	initcm(v, &g->cmap);
345	v->cm = &g->cmap;
346	g->lacons = NULL;
347	g->nlacons = 0;
348	ZAPCNFA(g->search);
349	v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
350	CNOERR();
351	v->cv = newcvec(100, 20, 10);
352	if (v->cv == NULL)
353		return freev(v, REG_ESPACE);
354	i = nmcces(v);
355	if (i > 0) {
356		v->mcces = newcvec(nleaders(v), 0, i);
357		CNOERR();
358		v->mcces = allmcces(v, v->mcces);
359		leaders(v, v->mcces);
360		addmcce(v->mcces, (chr *)NULL, (chr *)NULL);	/* dummy */
361	}
362	CNOERR();
363
364	/* parsing */
365	lexstart(v);			/* also handles prefixes */
366	if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
367		/* assign newline a unique color */
368		v->nlcolor = subcolor(v->cm, newline());
369		okcolors(v->nfa, v->cm);
370	}
371	CNOERR();
372	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
373	assert(SEE(EOS));		/* even if error; ISERR() => SEE(EOS) */
374	CNOERR();
375	assert(v->tree != NULL);
376
377	/* finish setup of nfa and its subre tree */
378	specialcolors(v->nfa);
379	CNOERR();
380	if (debug != NULL) {
381		fprintf(debug, "\n\n\n========= RAW ==========\n");
382		dumpnfa(v->nfa, debug);
383		dumpst(v->tree, debug, 1);
384	}
385	optst(v, v->tree);
386	v->ntree = numst(v->tree, 1);
387	markst(v->tree);
388	cleanst(v);
389	if (debug != NULL) {
390		fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
391		dumpst(v->tree, debug, 1);
392	}
393
394	/* build compacted NFAs for tree and lacons */
395	re->re_info |= nfatree(v, v->tree, debug);
396	CNOERR();
397	assert(v->nlacons == 0 || v->lacons != NULL);
398	for (i = 1; i < v->nlacons; i++) {
399		if (debug != NULL)
400			fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
401		nfanode(v, &v->lacons[i], debug);
402	}
403	CNOERR();
404	if (v->tree->flags&SHORTER)
405		NOTE(REG_USHORTEST);
406
407	/* build compacted NFAs for tree, lacons, fast search */
408	if (debug != NULL)
409		fprintf(debug, "\n\n\n========= SEARCH ==========\n");
410	/* can sacrifice main NFA now, so use it as work area */
411	(DISCARD)optimize(v->nfa, debug);
412	CNOERR();
413	makesearch(v, v->nfa);
414	CNOERR();
415	compact(v->nfa, &g->search);
416	CNOERR();
417
418	/* looks okay, package it up */
419	re->re_nsub = v->nsubexp;
420	v->re = NULL;			/* freev no longer frees re */
421	g->magic = GUTSMAGIC;
422	g->cflags = v->cflags;
423	g->info = re->re_info;
424	g->nsub = re->re_nsub;
425	g->tree = v->tree;
426	v->tree = NULL;
427	g->ntree = v->ntree;
428	g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
429	g->lacons = v->lacons;
430	v->lacons = NULL;
431	g->nlacons = v->nlacons;
432
433	if (flags&REG_DUMP)
434		dump(re, stdout);
435
436	assert(v->err == 0);
437	return freev(v, 0);
438}
439
440/*
441 - moresubs - enlarge subRE vector
442 ^ static VOID moresubs(struct vars *, int);
443 */
444static VOID
445moresubs(v, wanted)
446struct vars *v;
447int wanted;			/* want enough room for this one */
448{
449	struct subre **p;
450	size_t n;
451
452	assert(wanted > 0 && (size_t)wanted >= v->nsubs);
453	n = (size_t)wanted * 3 / 2 + 1;
454	if (v->subs == v->sub10) {
455		p = (struct subre **)MALLOC(n * sizeof(struct subre *));
456		if (p != NULL)
457			memcpy(VS(p), VS(v->subs),
458					v->nsubs * sizeof(struct subre *));
459	} else
460		p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
461	if (p == NULL) {
462		ERR(REG_ESPACE);
463		return;
464	}
465	v->subs = p;
466	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
467		*p = NULL;
468	assert(v->nsubs == n);
469	assert((size_t)wanted < v->nsubs);
470}
471
472/*
473 - freev - free vars struct's substructures where necessary
474 * Optionally does error-number setting, and always returns error code
475 * (if any), to make error-handling code terser.
476 ^ static int freev(struct vars *, int);
477 */
478static int
479freev(v, err)
480struct vars *v;
481int err;
482{
483	if (v->re != NULL)
484		rfree(v->re);
485	if (v->subs != v->sub10)
486		FREE(v->subs);
487	if (v->nfa != NULL)
488		freenfa(v->nfa);
489	if (v->tree != NULL)
490		freesubre(v, v->tree);
491	if (v->treechain != NULL)
492		cleanst(v);
493	if (v->cv != NULL)
494		freecvec(v->cv);
495	if (v->cv2 != NULL)
496		freecvec(v->cv2);
497	if (v->mcces != NULL)
498		freecvec(v->mcces);
499	if (v->lacons != NULL)
500		freelacons(v->lacons, v->nlacons);
501	ERR(err);			/* nop if err==0 */
502
503	return v->err;
504}
505
506/*
507 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
508 * NFA must have been optimize()d already.
509 ^ static VOID makesearch(struct vars *, struct nfa *);
510 */
511static VOID
512makesearch(v, nfa)
513struct vars *v;
514struct nfa *nfa;
515{
516	struct arc *a;
517	struct arc *b;
518	struct state *pre = nfa->pre;
519	struct state *s;
520	struct state *s2;
521	struct state *slist;
522
523	/* no loops are needed if it's anchored */
524	for (a = pre->outs; a != NULL; a = a->outchain) {
525		assert(a->type == PLAIN);
526		if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
527			break;
528	}
529	if (a != NULL) {
530		/* add implicit .* in front */
531		rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
532
533		/* and ^* and \A* too -- not always necessary, but harmless */
534		newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
535		newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
536	}
537
538	/*
539	 * Now here's the subtle part.  Because many REs have no lookback
540	 * constraints, often knowing when you were in the pre state tells
541	 * you little; it's the next state(s) that are informative.  But
542	 * some of them may have other inarcs, i.e. it may be possible to
543	 * make actual progress and then return to one of them.  We must
544	 * de-optimize such cases, splitting each such state into progress
545	 * and no-progress states.
546	 */
547
548	/* first, make a list of the states */
549	slist = NULL;
550	for (a = pre->outs; a != NULL; a = a->outchain) {
551		s = a->to;
552		for (b = s->ins; b != NULL; b = b->inchain)
553			if (b->from != pre)
554				break;
555		if (b != NULL) {		/* must be split */
556			if (s->tmp == NULL) {  /* if not already in the list */
557			                       /* (fixes bugs 505048, 230589, */
558			                       /* 840258, 504785) */
559				s->tmp = slist;
560				slist = s;
561			}
562		}
563	}
564
565	/* do the splits */
566	for (s = slist; s != NULL; s = s2) {
567		s2 = newstate(nfa);
568		copyouts(nfa, s, s2);
569		for (a = s->ins; a != NULL; a = b) {
570			b = a->inchain;
571			if (a->from != pre) {
572				cparc(nfa, a, a->from, s2);
573				freearc(nfa, a);
574			}
575		}
576		s2 = s->tmp;
577		s->tmp = NULL;		/* clean up while we're at it */
578	}
579}
580
581/*
582 - parse - parse an RE
583 * This is actually just the top level, which parses a bunch of branches
584 * tied together with '|'.  They appear in the tree as the left children
585 * of a chain of '|' subres.
586 ^ static struct subre *parse(struct vars *, int, int, struct state *,
587 ^ 	struct state *);
588 */
589static struct subre *
590parse(v, stopper, type, init, final)
591struct vars *v;
592int stopper;			/* EOS or ')' */
593int type;			/* LACON (lookahead subRE) or PLAIN */
594struct state *init;		/* initial state */
595struct state *final;		/* final state */
596{
597	struct state *left;	/* scaffolding for branch */
598	struct state *right;
599	struct subre *branches;	/* top level */
600	struct subre *branch;	/* current branch */
601	struct subre *t;	/* temporary */
602	int firstbranch;	/* is this the first branch? */
603
604	assert(stopper == ')' || stopper == EOS);
605
606	branches = subre(v, '|', LONGER, init, final);
607	NOERRN();
608	branch = branches;
609	firstbranch = 1;
610	do {	/* a branch */
611		if (!firstbranch) {
612			/* need a place to hang it */
613			branch->right = subre(v, '|', LONGER, init, final);
614			NOERRN();
615			branch = branch->right;
616		}
617		firstbranch = 0;
618		left = newstate(v->nfa);
619		right = newstate(v->nfa);
620		NOERRN();
621		EMPTYARC(init, left);
622		EMPTYARC(right, final);
623		NOERRN();
624		branch->left = parsebranch(v, stopper, type, left, right, 0);
625		NOERRN();
626		branch->flags |= UP(branch->flags | branch->left->flags);
627		if ((branch->flags &~ branches->flags) != 0)	/* new flags */
628			for (t = branches; t != branch; t = t->right)
629				t->flags |= branch->flags;
630	} while (EAT('|'));
631	assert(SEE(stopper) || SEE(EOS));
632
633	if (!SEE(stopper)) {
634		assert(stopper == ')' && SEE(EOS));
635		ERR(REG_EPAREN);
636	}
637
638	/* optimize out simple cases */
639	if (branch == branches) {	/* only one branch */
640		assert(branch->right == NULL);
641		t = branch->left;
642		branch->left = NULL;
643		freesubre(v, branches);
644		branches = t;
645	} else if (!MESSY(branches->flags)) {	/* no interesting innards */
646		freesubre(v, branches->left);
647		branches->left = NULL;
648		freesubre(v, branches->right);
649		branches->right = NULL;
650		branches->op = '=';
651	}
652
653	return branches;
654}
655
656/*
657 - parsebranch - parse one branch of an RE
658 * This mostly manages concatenation, working closely with parseqatom().
659 * Concatenated things are bundled up as much as possible, with separate
660 * ',' nodes introduced only when necessary due to substructure.
661 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
662 ^ 	struct state *, int);
663 */
664static struct subre *
665parsebranch(v, stopper, type, left, right, partial)
666struct vars *v;
667int stopper;			/* EOS or ')' */
668int type;			/* LACON (lookahead subRE) or PLAIN */
669struct state *left;		/* leftmost state */
670struct state *right;		/* rightmost state */
671int partial;			/* is this only part of a branch? */
672{
673	struct state *lp;	/* left end of current construct */
674	int seencontent;	/* is there anything in this branch yet? */
675	struct subre *t;
676
677	lp = left;
678	seencontent = 0;
679	t = subre(v, '=', 0, left, right);	/* op '=' is tentative */
680	NOERRN();
681	while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
682		if (seencontent) {	/* implicit concat operator */
683			lp = newstate(v->nfa);
684			NOERRN();
685			moveins(v->nfa, right, lp);
686		}
687		seencontent = 1;
688
689		/* NB, recursion in parseqatom() may swallow rest of branch */
690		parseqatom(v, stopper, type, lp, right, t);
691	}
692
693	if (!seencontent) {		/* empty branch */
694		if (!partial)
695			NOTE(REG_UUNSPEC);
696		assert(lp == left);
697		EMPTYARC(left, right);
698	}
699
700	return t;
701}
702
703/*
704 - parseqatom - parse one quantified atom or constraint of an RE
705 * The bookkeeping near the end cooperates very closely with parsebranch();
706 * in particular, it contains a recursion that can involve parsing the rest
707 * of the branch, making this function's name somewhat inaccurate.
708 ^ static VOID parseqatom(struct vars *, int, int, struct state *,
709 ^ 	struct state *, struct subre *);
710 */
711static VOID
712parseqatom(v, stopper, type, lp, rp, top)
713struct vars *v;
714int stopper;			/* EOS or ')' */
715int type;			/* LACON (lookahead subRE) or PLAIN */
716struct state *lp;		/* left state to hang it on */
717struct state *rp;		/* right state to hang it on */
718struct subre *top;		/* subtree top */
719{
720	struct state *s;	/* temporaries for new states */
721	struct state *s2;
722#	define	ARCV(t, val)	newarc(v->nfa, t, val, lp, rp)
723	int m, n;
724	struct subre *atom;	/* atom's subtree */
725	struct subre *t;
726	int cap;		/* capturing parens? */
727	int pos;		/* positive lookahead? */
728	int subno;		/* capturing-parens or backref number */
729	int atomtype;
730	int qprefer;		/* quantifier short/long preference */
731	int f;
732	struct subre **atomp;	/* where the pointer to atom is */
733
734	/* initial bookkeeping */
735	atom = NULL;
736	assert(lp->nouts == 0);	/* must string new code */
737	assert(rp->nins == 0);	/*  between lp and rp */
738	subno = 0;		/* just to shut lint up */
739
740	/* an atom or constraint... */
741	atomtype = v->nexttype;
742	switch (atomtype) {
743	/* first, constraints, which end by returning */
744	case '^':
745		ARCV('^', 1);
746		if (v->cflags&REG_NLANCH)
747			ARCV(BEHIND, v->nlcolor);
748		NEXT();
749		return;
750		break;
751	case '$':
752		ARCV('$', 1);
753		if (v->cflags&REG_NLANCH)
754			ARCV(AHEAD, v->nlcolor);
755		NEXT();
756		return;
757		break;
758	case SBEGIN:
759		ARCV('^', 1);	/* BOL */
760		ARCV('^', 0);	/* or BOS */
761		NEXT();
762		return;
763		break;
764	case SEND:
765		ARCV('$', 1);	/* EOL */
766		ARCV('$', 0);	/* or EOS */
767		NEXT();
768		return;
769		break;
770	case '<':
771		wordchrs(v);	/* does NEXT() */
772		s = newstate(v->nfa);
773		NOERR();
774		nonword(v, BEHIND, lp, s);
775		word(v, AHEAD, s, rp);
776		return;
777		break;
778	case '>':
779		wordchrs(v);	/* does NEXT() */
780		s = newstate(v->nfa);
781		NOERR();
782		word(v, BEHIND, lp, s);
783		nonword(v, AHEAD, s, rp);
784		return;
785		break;
786	case WBDRY:
787		wordchrs(v);	/* does NEXT() */
788		s = newstate(v->nfa);
789		NOERR();
790		nonword(v, BEHIND, lp, s);
791		word(v, AHEAD, s, rp);
792		s = newstate(v->nfa);
793		NOERR();
794		word(v, BEHIND, lp, s);
795		nonword(v, AHEAD, s, rp);
796		return;
797		break;
798	case NWBDRY:
799		wordchrs(v);	/* does NEXT() */
800		s = newstate(v->nfa);
801		NOERR();
802		word(v, BEHIND, lp, s);
803		word(v, AHEAD, s, rp);
804		s = newstate(v->nfa);
805		NOERR();
806		nonword(v, BEHIND, lp, s);
807		nonword(v, AHEAD, s, rp);
808		return;
809		break;
810	case LACON:	/* lookahead constraint */
811		pos = v->nextvalue;
812		NEXT();
813		s = newstate(v->nfa);
814		s2 = newstate(v->nfa);
815		NOERR();
816		t = parse(v, ')', LACON, s, s2);
817		freesubre(v, t);	/* internal structure irrelevant */
818		assert(SEE(')') || ISERR());
819		NEXT();
820		n = newlacon(v, s, s2, pos);
821		NOERR();
822		ARCV(LACON, n);
823		return;
824		break;
825	/* then errors, to get them out of the way */
826	case '*':
827	case '+':
828	case '?':
829	case '{':
830		ERR(REG_BADRPT);
831		return;
832		break;
833	default:
834		ERR(REG_ASSERT);
835		return;
836		break;
837	/* then plain characters, and minor variants on that theme */
838	case ')':		/* unbalanced paren */
839		if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
840			ERR(REG_EPAREN);
841			return;
842		}
843		/* legal in EREs due to specification botch */
844		NOTE(REG_UPBOTCH);
845		/* fallthrough into case PLAIN */
846	case PLAIN:
847		onechr(v, v->nextvalue, lp, rp);
848		okcolors(v->nfa, v->cm);
849		NOERR();
850		NEXT();
851		break;
852	case '[':
853		if (v->nextvalue == 1)
854			bracket(v, lp, rp);
855		else
856			cbracket(v, lp, rp);
857		assert(SEE(']') || ISERR());
858		NEXT();
859		break;
860	case '.':
861		rainbow(v->nfa, v->cm, PLAIN,
862				(v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,
863				lp, rp);
864		NEXT();
865		break;
866	/* and finally the ugly stuff */
867	case '(':	/* value flags as capturing or non */
868		cap = (type == LACON) ? 0 : v->nextvalue;
869		if (cap) {
870			v->nsubexp++;
871			subno = v->nsubexp;
872			if ((size_t)subno >= v->nsubs)
873				moresubs(v, subno);
874			assert((size_t)subno < v->nsubs);
875		} else
876			atomtype = PLAIN;	/* something that's not '(' */
877		NEXT();
878		/* need new endpoints because tree will contain pointers */
879		s = newstate(v->nfa);
880		s2 = newstate(v->nfa);
881		NOERR();
882		EMPTYARC(lp, s);
883		EMPTYARC(s2, rp);
884		NOERR();
885		atom = parse(v, ')', PLAIN, s, s2);
886		assert(SEE(')') || ISERR());
887		NEXT();
888		NOERR();
889		if (cap) {
890			v->subs[subno] = atom;
891			t = subre(v, '(', atom->flags|CAP, lp, rp);
892			NOERR();
893			t->subno = subno;
894			t->left = atom;
895			atom = t;
896		}
897		/* postpone everything else pending possible {0} */
898		break;
899	case BACKREF:	/* the Feature From The Black Lagoon */
900		INSIST(type != LACON, REG_ESUBREG);
901		INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
902		INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
903		NOERR();
904		assert(v->nextvalue > 0);
905		atom = subre(v, 'b', BACKR, lp, rp);
906		subno = v->nextvalue;
907		atom->subno = subno;
908		EMPTYARC(lp, rp);	/* temporarily, so there's something */
909		NEXT();
910		break;
911	}
912
913	/* ...and an atom may be followed by a quantifier */
914	switch (v->nexttype) {
915	case '*':
916		m = 0;
917		n = INFINITY;
918		qprefer = (v->nextvalue) ? LONGER : SHORTER;
919		NEXT();
920		break;
921	case '+':
922		m = 1;
923		n = INFINITY;
924		qprefer = (v->nextvalue) ? LONGER : SHORTER;
925		NEXT();
926		break;
927	case '?':
928		m = 0;
929		n = 1;
930		qprefer = (v->nextvalue) ? LONGER : SHORTER;
931		NEXT();
932		break;
933	case '{':
934		NEXT();
935		m = scannum(v);
936		if (EAT(',')) {
937			if (SEE(DIGIT))
938				n = scannum(v);
939			else
940				n = INFINITY;
941			if (m > n) {
942				ERR(REG_BADBR);
943				return;
944			}
945			/* {m,n} exercises preference, even if it's {m,m} */
946			qprefer = (v->nextvalue) ? LONGER : SHORTER;
947		} else {
948			n = m;
949			/* {m} passes operand's preference through */
950			qprefer = 0;
951		}
952		if (!SEE('}')) {	/* catches errors too */
953			ERR(REG_BADBR);
954			return;
955		}
956		NEXT();
957		break;
958	default:		/* no quantifier */
959		m = n = 1;
960		qprefer = 0;
961		break;
962	}
963
964	/* annoying special case:  {0} or {0,0} cancels everything */
965	if (m == 0 && n == 0) {
966		if (atom != NULL)
967			freesubre(v, atom);
968		if (atomtype == '(')
969			v->subs[subno] = NULL;
970		delsub(v->nfa, lp, rp);
971		EMPTYARC(lp, rp);
972		return;
973	}
974
975	/* if not a messy case, avoid hard part */
976	assert(!MESSY(top->flags));
977	f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
978	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
979		if (!(m == 1 && n == 1))
980			repeat(v, lp, rp, m, n);
981		if (atom != NULL)
982			freesubre(v, atom);
983		top->flags = f;
984		return;
985	}
986
987	/*
988	 * hard part:  something messy
989	 * That is, capturing parens, back reference, short/long clash, or
990	 * an atom with substructure containing one of those.
991	 */
992
993	/* now we'll need a subre for the contents even if they're boring */
994	if (atom == NULL) {
995		atom = subre(v, '=', 0, lp, rp);
996		NOERR();
997	}
998
999	/*
1000	 * prepare a general-purpose state skeleton
1001	 *
1002	 *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1003	 *   /                                            /
1004	 * [lp] ----> [s2] ----bypass---------------------
1005	 *
1006	 * where bypass is an empty, and prefix is some repetitions of atom
1007	 */
1008	s = newstate(v->nfa);		/* first, new endpoints for the atom */
1009	s2 = newstate(v->nfa);
1010	NOERR();
1011	moveouts(v->nfa, lp, s);
1012	moveins(v->nfa, rp, s2);
1013	NOERR();
1014	atom->begin = s;
1015	atom->end = s2;
1016	s = newstate(v->nfa);		/* and spots for prefix and bypass */
1017	s2 = newstate(v->nfa);
1018	NOERR();
1019	EMPTYARC(lp, s);
1020	EMPTYARC(lp, s2);
1021	NOERR();
1022
1023	/* break remaining subRE into x{...} and what follows */
1024	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1025	t->left = atom;
1026	atomp = &t->left;
1027	/* here we should recurse... but we must postpone that to the end */
1028
1029	/* split top into prefix and remaining */
1030	assert(top->op == '=' && top->left == NULL && top->right == NULL);
1031	top->left = subre(v, '=', top->flags, top->begin, lp);
1032	top->op = '.';
1033	top->right = t;
1034
1035	/* if it's a backref, now is the time to replicate the subNFA */
1036	if (atomtype == BACKREF) {
1037		assert(atom->begin->nouts == 1);	/* just the EMPTY */
1038		delsub(v->nfa, atom->begin, atom->end);
1039		assert(v->subs[subno] != NULL);
1040		/* and here's why the recursion got postponed:  it must */
1041		/* wait until the skeleton is filled in, because it may */
1042		/* hit a backref that wants to copy the filled-in skeleton */
1043		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1044						atom->begin, atom->end);
1045		NOERR();
1046	}
1047
1048	/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
1049	if (m == 0) {
1050		EMPTYARC(s2, atom->end);		/* the bypass */
1051		assert(PREF(qprefer) != 0);
1052		f = COMBINE(qprefer, atom->flags);
1053		t = subre(v, '|', f, lp, atom->end);
1054		NOERR();
1055		t->left = atom;
1056		t->right = subre(v, '|', PREF(f), s2, atom->end);
1057		NOERR();
1058		t->right->left = subre(v, '=', 0, s2, atom->end);
1059		NOERR();
1060		*atomp = t;
1061		atomp = &t->left;
1062		m = 1;
1063	}
1064
1065	/* deal with the rest of the quantifier */
1066	if (atomtype == BACKREF) {
1067		/* special case:  backrefs have internal quantifiers */
1068		EMPTYARC(s, atom->begin);	/* empty prefix */
1069		/* just stuff everything into atom */
1070		repeat(v, atom->begin, atom->end, m, n);
1071		atom->min = (short)m;
1072		atom->max = (short)n;
1073		atom->flags |= COMBINE(qprefer, atom->flags);
1074	} else if (m == 1 && n == 1) {
1075		/* no/vacuous quantifier:  done */
1076		EMPTYARC(s, atom->begin);	/* empty prefix */
1077	} else {
1078		/* turn x{m,n} into x{m-1,n-1}x, with capturing */
1079		/*  parens in only second x */
1080		dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1081		assert(m >= 1 && m != INFINITY && n >= 1);
1082		repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
1083		f = COMBINE(qprefer, atom->flags);
1084		t = subre(v, '.', f, s, atom->end);	/* prefix and atom */
1085		NOERR();
1086		t->left = subre(v, '=', PREF(f), s, atom->begin);
1087		NOERR();
1088		t->right = atom;
1089		*atomp = t;
1090	}
1091
1092	/* and finally, look after that postponed recursion */
1093	t = top->right;
1094	if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1095		t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1096	else {
1097		EMPTYARC(atom->end, rp);
1098		t->right = subre(v, '=', 0, atom->end, rp);
1099	}
1100	assert(SEE('|') || SEE(stopper) || SEE(EOS));
1101	t->flags |= COMBINE(t->flags, t->right->flags);
1102	top->flags |= COMBINE(top->flags, t->flags);
1103}
1104
1105/*
1106 - nonword - generate arcs for non-word-character ahead or behind
1107 ^ static VOID nonword(struct vars *, int, struct state *, struct state *);
1108 */
1109static VOID
1110nonword(v, dir, lp, rp)
1111struct vars *v;
1112int dir;			/* AHEAD or BEHIND */
1113struct state *lp;
1114struct state *rp;
1115{
1116	int anchor = (dir == AHEAD) ? '$' : '^';
1117
1118	assert(dir == AHEAD || dir == BEHIND);
1119	newarc(v->nfa, anchor, 1, lp, rp);
1120	newarc(v->nfa, anchor, 0, lp, rp);
1121	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1122	/* (no need for special attention to \n) */
1123}
1124
1125/*
1126 - word - generate arcs for word character ahead or behind
1127 ^ static VOID word(struct vars *, int, struct state *, struct state *);
1128 */
1129static VOID
1130word(v, dir, lp, rp)
1131struct vars *v;
1132int dir;			/* AHEAD or BEHIND */
1133struct state *lp;
1134struct state *rp;
1135{
1136	assert(dir == AHEAD || dir == BEHIND);
1137	cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1138	/* (no need for special attention to \n) */
1139}
1140
1141/*
1142 - scannum - scan a number
1143 ^ static int scannum(struct vars *);
1144 */
1145static int			/* value, <= DUPMAX */
1146scannum(v)
1147struct vars *v;
1148{
1149	int n = 0;
1150
1151	while (SEE(DIGIT) && n < DUPMAX) {
1152		n = n*10 + v->nextvalue;
1153		NEXT();
1154	}
1155	if (SEE(DIGIT) || n > DUPMAX) {
1156		ERR(REG_BADBR);
1157		return 0;
1158	}
1159	return n;
1160}
1161
1162/*
1163 - repeat - replicate subNFA for quantifiers
1164 * The duplication sequences used here are chosen carefully so that any
1165 * pointers starting out pointing into the subexpression end up pointing into
1166 * the last occurrence.  (Note that it may not be strung between the same
1167 * left and right end states, however!)  This used to be important for the
1168 * subRE tree, although the important bits are now handled by the in-line
1169 * code in parse(), and when this is called, it doesn't matter any more.
1170 ^ static VOID repeat(struct vars *, struct state *, struct state *, int, int);
1171 */
1172static VOID
1173repeat(v, lp, rp, m, n)
1174struct vars *v;
1175struct state *lp;
1176struct state *rp;
1177int m;
1178int n;
1179{
1180#	define	SOME	2
1181#	define	INF	3
1182#	define	PAIR(x, y)	((x)*4 + (y))
1183#	define	REDUCE(x)	( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1184	CONST int rm = REDUCE(m);
1185	CONST int rn = REDUCE(n);
1186	struct state *s;
1187	struct state *s2;
1188
1189	switch (PAIR(rm, rn)) {
1190	case PAIR(0, 0):		/* empty string */
1191		delsub(v->nfa, lp, rp);
1192		EMPTYARC(lp, rp);
1193		break;
1194	case PAIR(0, 1):		/* do as x| */
1195		EMPTYARC(lp, rp);
1196		break;
1197	case PAIR(0, SOME):		/* do as x{1,n}| */
1198		repeat(v, lp, rp, 1, n);
1199		NOERR();
1200		EMPTYARC(lp, rp);
1201		break;
1202	case PAIR(0, INF):		/* loop x around */
1203		s = newstate(v->nfa);
1204		NOERR();
1205		moveouts(v->nfa, lp, s);
1206		moveins(v->nfa, rp, s);
1207		EMPTYARC(lp, s);
1208		EMPTYARC(s, rp);
1209		break;
1210	case PAIR(1, 1):		/* no action required */
1211		break;
1212	case PAIR(1, SOME):		/* do as x{0,n-1}x = (x{1,n-1}|)x */
1213		s = newstate(v->nfa);
1214		NOERR();
1215		moveouts(v->nfa, lp, s);
1216		dupnfa(v->nfa, s, rp, lp, s);
1217		NOERR();
1218		repeat(v, lp, s, 1, n-1);
1219		NOERR();
1220		EMPTYARC(lp, s);
1221		break;
1222	case PAIR(1, INF):		/* add loopback arc */
1223		s = newstate(v->nfa);
1224		s2 = newstate(v->nfa);
1225		NOERR();
1226		moveouts(v->nfa, lp, s);
1227		moveins(v->nfa, rp, s2);
1228		EMPTYARC(lp, s);
1229		EMPTYARC(s2, rp);
1230		EMPTYARC(s2, s);
1231		break;
1232	case PAIR(SOME, SOME):		/* do as x{m-1,n-1}x */
1233		s = newstate(v->nfa);
1234		NOERR();
1235		moveouts(v->nfa, lp, s);
1236		dupnfa(v->nfa, s, rp, lp, s);
1237		NOERR();
1238		repeat(v, lp, s, m-1, n-1);
1239		break;
1240	case PAIR(SOME, INF):		/* do as x{m-1,}x */
1241		s = newstate(v->nfa);
1242		NOERR();
1243		moveouts(v->nfa, lp, s);
1244		dupnfa(v->nfa, s, rp, lp, s);
1245		NOERR();
1246		repeat(v, lp, s, m-1, n);
1247		break;
1248	default:
1249		ERR(REG_ASSERT);
1250		break;
1251	}
1252}
1253
1254/*
1255 - bracket - handle non-complemented bracket expression
1256 * Also called from cbracket for complemented bracket expressions.
1257 ^ static VOID bracket(struct vars *, struct state *, struct state *);
1258 */
1259static VOID
1260bracket(v, lp, rp)
1261struct vars *v;
1262struct state *lp;
1263struct state *rp;
1264{
1265	assert(SEE('['));
1266	NEXT();
1267	while (!SEE(']') && !SEE(EOS))
1268		brackpart(v, lp, rp);
1269	assert(SEE(']') || ISERR());
1270	okcolors(v->nfa, v->cm);
1271}
1272
1273/*
1274 - cbracket - handle complemented bracket expression
1275 * We do it by calling bracket() with dummy endpoints, and then complementing
1276 * the result.  The alternative would be to invoke rainbow(), and then delete
1277 * arcs as the b.e. is seen... but that gets messy.
1278 ^ static VOID cbracket(struct vars *, struct state *, struct state *);
1279 */
1280static VOID
1281cbracket(v, lp, rp)
1282struct vars *v;
1283struct state *lp;
1284struct state *rp;
1285{
1286	struct state *left = newstate(v->nfa);
1287	struct state *right = newstate(v->nfa);
1288	struct state *s;
1289	struct arc *a;			/* arc from lp */
1290	struct arc *ba;			/* arc from left, from bracket() */
1291	struct arc *pa;			/* MCCE-prototype arc */
1292	color co;
1293	chr *p;
1294	int i;
1295
1296	NOERR();
1297	bracket(v, left, right);
1298	if (v->cflags&REG_NLSTOP)
1299		newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1300	NOERR();
1301
1302	assert(lp->nouts == 0);		/* all outarcs will be ours */
1303
1304	/* easy part of complementing */
1305	colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1306	NOERR();
1307	if (v->mcces == NULL) {		/* no MCCEs -- we're done */
1308		dropstate(v->nfa, left);
1309		assert(right->nins == 0);
1310		freestate(v->nfa, right);
1311		return;
1312	}
1313
1314	/* but complementing gets messy in the presence of MCCEs... */
1315	NOTE(REG_ULOCALE);
1316	for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
1317		co = GETCOLOR(v->cm, *p);
1318		a = findarc(lp, PLAIN, co);
1319		ba = findarc(left, PLAIN, co);
1320		if (ba == NULL) {
1321			assert(a != NULL);
1322			freearc(v->nfa, a);
1323		} else {
1324			assert(a == NULL);
1325		}
1326		s = newstate(v->nfa);
1327		NOERR();
1328		newarc(v->nfa, PLAIN, co, lp, s);
1329		NOERR();
1330		pa = findarc(v->mccepbegin, PLAIN, co);
1331		assert(pa != NULL);
1332		if (ba == NULL) {	/* easy case, need all of them */
1333			cloneouts(v->nfa, pa->to, s, rp, PLAIN);
1334			newarc(v->nfa, '$', 1, s, rp);
1335			newarc(v->nfa, '$', 0, s, rp);
1336			colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
1337		} else {		/* must be selective */
1338			if (findarc(ba->to, '$', 1) == NULL) {
1339				newarc(v->nfa, '$', 1, s, rp);
1340				newarc(v->nfa, '$', 0, s, rp);
1341				colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
1342									 s, rp);
1343			}
1344			for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
1345				if (findarc(ba->to, PLAIN, pa->co) == NULL)
1346					newarc(v->nfa, PLAIN, pa->co, s, rp);
1347			if (s->nouts == 0)	/* limit of selectivity: none */
1348				dropstate(v->nfa, s);	/* frees arc too */
1349		}
1350		NOERR();
1351	}
1352
1353	delsub(v->nfa, left, right);
1354	assert(left->nouts == 0);
1355	freestate(v->nfa, left);
1356	assert(right->nins == 0);
1357	freestate(v->nfa, right);
1358}
1359
1360/*
1361 - brackpart - handle one item (or range) within a bracket expression
1362 ^ static VOID brackpart(struct vars *, struct state *, struct state *);
1363 */
1364static VOID
1365brackpart(v, lp, rp)
1366struct vars *v;
1367struct state *lp;
1368struct state *rp;
1369{
1370	celt startc;
1371	celt endc;
1372	struct cvec *cv;
1373	chr *startp;
1374	chr *endp;
1375	chr c[1];
1376
1377	/* parse something, get rid of special cases, take shortcuts */
1378	switch (v->nexttype) {
1379	case RANGE:			/* a-b-c or other botch */
1380		ERR(REG_ERANGE);
1381		return;
1382		break;
1383	case PLAIN:
1384		c[0] = v->nextvalue;
1385		NEXT();
1386		/* shortcut for ordinary chr (not range, not MCCE leader) */
1387		if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
1388			onechr(v, c[0], lp, rp);
1389			return;
1390		}
1391		startc = element(v, c, c+1);
1392		NOERR();
1393		break;
1394	case COLLEL:
1395		startp = v->now;
1396		endp = scanplain(v);
1397		INSIST(startp < endp, REG_ECOLLATE);
1398		NOERR();
1399		startc = element(v, startp, endp);
1400		NOERR();
1401		break;
1402	case ECLASS:
1403		startp = v->now;
1404		endp = scanplain(v);
1405		INSIST(startp < endp, REG_ECOLLATE);
1406		NOERR();
1407		startc = element(v, startp, endp);
1408		NOERR();
1409		cv = eclass(v, startc, (v->cflags&REG_ICASE));
1410		NOERR();
1411		dovec(v, cv, lp, rp);
1412		return;
1413		break;
1414	case CCLASS:
1415		startp = v->now;
1416		endp = scanplain(v);
1417		INSIST(startp < endp, REG_ECTYPE);
1418		NOERR();
1419		cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
1420		NOERR();
1421		dovec(v, cv, lp, rp);
1422		return;
1423		break;
1424	default:
1425		ERR(REG_ASSERT);
1426		return;
1427		break;
1428	}
1429
1430	if (SEE(RANGE)) {
1431		NEXT();
1432		switch (v->nexttype) {
1433		case PLAIN:
1434		case RANGE:
1435			c[0] = v->nextvalue;
1436			NEXT();
1437			endc = element(v, c, c+1);
1438			NOERR();
1439			break;
1440		case COLLEL:
1441			startp = v->now;
1442			endp = scanplain(v);
1443			INSIST(startp < endp, REG_ECOLLATE);
1444			NOERR();
1445			endc = element(v, startp, endp);
1446			NOERR();
1447			break;
1448		default:
1449			ERR(REG_ERANGE);
1450			return;
1451			break;
1452		}
1453	} else
1454		endc = startc;
1455
1456	/*
1457	 * Ranges are unportable.  Actually, standard C does
1458	 * guarantee that digits are contiguous, but making
1459	 * that an exception is just too complicated.
1460	 */
1461	if (startc != endc)
1462		NOTE(REG_UUNPORT);
1463	cv = range(v, startc, endc, (v->cflags&REG_ICASE));
1464	NOERR();
1465	dovec(v, cv, lp, rp);
1466}
1467
1468/*
1469 - scanplain - scan PLAIN contents of [. etc.
1470 * Certain bits of trickery in lex.c know that this code does not try
1471 * to look past the final bracket of the [. etc.
1472 ^ static chr *scanplain(struct vars *);
1473 */
1474static chr *			/* just after end of sequence */
1475scanplain(v)
1476struct vars *v;
1477{
1478	chr *endp;
1479
1480	assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1481	NEXT();
1482
1483	endp = v->now;
1484	while (SEE(PLAIN)) {
1485		endp = v->now;
1486		NEXT();
1487	}
1488
1489	assert(SEE(END) || ISERR());
1490	NEXT();
1491
1492	return endp;
1493}
1494
1495/*
1496 - leaders - process a cvec of collating elements to also include leaders
1497 * Also gives all characters involved their own colors, which is almost
1498 * certainly necessary, and sets up little disconnected subNFA.
1499 ^ static VOID leaders(struct vars *, struct cvec *);
1500 */
1501static VOID
1502leaders(v, cv)
1503struct vars *v;
1504struct cvec *cv;
1505{
1506	int mcce;
1507	chr *p;
1508	chr leader;
1509	struct state *s;
1510	struct arc *a;
1511
1512	v->mccepbegin = newstate(v->nfa);
1513	v->mccepend = newstate(v->nfa);
1514	NOERR();
1515
1516	for (mcce = 0; mcce < cv->nmcces; mcce++) {
1517		p = cv->mcces[mcce];
1518		leader = *p;
1519		if (!haschr(cv, leader)) {
1520			addchr(cv, leader);
1521			s = newstate(v->nfa);
1522			newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
1523							v->mccepbegin, s);
1524			okcolors(v->nfa, v->cm);
1525		} else {
1526			a = findarc(v->mccepbegin, PLAIN,
1527						GETCOLOR(v->cm, leader));
1528			assert(a != NULL);
1529			s = a->to;
1530			assert(s != v->mccepend);
1531		}
1532		p++;
1533		assert(*p != 0 && *(p+1) == 0);	/* only 2-char MCCEs for now */
1534		newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
1535		okcolors(v->nfa, v->cm);
1536	}
1537}
1538
1539/*
1540 - onechr - fill in arcs for a plain character, and possible case complements
1541 * This is mostly a shortcut for efficient handling of the common case.
1542 ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
1543 */
1544static VOID
1545onechr(v, c, lp, rp)
1546struct vars *v;
1547pchr c;
1548struct state *lp;
1549struct state *rp;
1550{
1551	if (!(v->cflags&REG_ICASE)) {
1552		newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1553		return;
1554	}
1555
1556	/* rats, need general case anyway... */
1557	dovec(v, allcases(v, c), lp, rp);
1558}
1559
1560/*
1561 - dovec - fill in arcs for each element of a cvec
1562 * This one has to handle the messy cases, like MCCEs and MCCE leaders.
1563 ^ static VOID dovec(struct vars *, struct cvec *, struct state *,
1564 ^ 	struct state *);
1565 */
1566static VOID
1567dovec(v, cv, lp, rp)
1568struct vars *v;
1569struct cvec *cv;
1570struct state *lp;
1571struct state *rp;
1572{
1573	chr ch, from, to;
1574	celt ce;
1575	chr *p;
1576	int i;
1577	color co;
1578	struct cvec *leads;
1579	struct arc *a;
1580	struct arc *pa;		/* arc in prototype */
1581	struct state *s;
1582	struct state *ps;	/* state in prototype */
1583
1584	/* need a place to store leaders, if any */
1585	if (nmcces(v) > 0) {
1586		assert(v->mcces != NULL);
1587		if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
1588			if (v->cv2 != NULL)
1589				free(v->cv2);
1590			v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
1591			NOERR();
1592			leads = v->cv2;
1593		} else
1594			leads = clearcvec(v->cv2);
1595	} else
1596		leads = NULL;
1597
1598	/* first, get the ordinary characters out of the way */
1599	for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
1600		ch = *p;
1601		if (!ISCELEADER(v, ch))
1602			newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1603		else {
1604			assert(singleton(v->cm, ch));
1605			assert(leads != NULL);
1606			if (!haschr(leads, ch))
1607				addchr(leads, ch);
1608		}
1609	}
1610
1611	/* and the ranges */
1612	for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
1613		from = *p;
1614		to = *(p+1);
1615		while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
1616			if (from < ce)
1617				subrange(v, from, ce - 1, lp, rp);
1618			assert(singleton(v->cm, ce));
1619			assert(leads != NULL);
1620			if (!haschr(leads, ce))
1621				addchr(leads, ce);
1622			from = ce + 1;
1623		}
1624		if (from <= to)
1625			subrange(v, from, to, lp, rp);
1626	}
1627
1628	if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
1629		return;
1630
1631	/* deal with the MCCE leaders */
1632	NOTE(REG_ULOCALE);
1633	for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
1634		co = GETCOLOR(v->cm, *p);
1635		a = findarc(lp, PLAIN, co);
1636		if (a != NULL)
1637			s = a->to;
1638		else {
1639			s = newstate(v->nfa);
1640			NOERR();
1641			newarc(v->nfa, PLAIN, co, lp, s);
1642			NOERR();
1643		}
1644		pa = findarc(v->mccepbegin, PLAIN, co);
1645		assert(pa != NULL);
1646		ps = pa->to;
1647		newarc(v->nfa, '$', 1, s, rp);
1648		newarc(v->nfa, '$', 0, s, rp);
1649		colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
1650		NOERR();
1651	}
1652
1653	/* and the MCCEs */
1654	for (i = 0; i < cv->nmcces; i++) {
1655		p = cv->mcces[i];
1656		assert(singleton(v->cm, *p));
1657		if (!singleton(v->cm, *p)) {
1658			ERR(REG_ASSERT);
1659			return;
1660		}
1661		ch = *p++;
1662		co = GETCOLOR(v->cm, ch);
1663		a = findarc(lp, PLAIN, co);
1664		if (a != NULL)
1665			s = a->to;
1666		else {
1667			s = newstate(v->nfa);
1668			NOERR();
1669			newarc(v->nfa, PLAIN, co, lp, s);
1670			NOERR();
1671		}
1672		assert(*p != 0);	/* at least two chars */
1673		assert(singleton(v->cm, *p));
1674		ch = *p++;
1675		co = GETCOLOR(v->cm, ch);
1676		assert(*p == 0);	/* and only two, for now */
1677		newarc(v->nfa, PLAIN, co, s, rp);
1678		NOERR();
1679	}
1680}
1681
1682/*
1683 - nextleader - find next MCCE leader within range
1684 ^ static celt nextleader(struct vars *, pchr, pchr);
1685 */
1686static celt			/* NOCELT means none */
1687nextleader(v, from, to)
1688struct vars *v;
1689pchr from;
1690pchr to;
1691{
1692	int i;
1693	chr *p;
1694	chr ch;
1695	celt it = NOCELT;
1696
1697	if (v->mcces == NULL)
1698		return it;
1699
1700	for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
1701		ch = *p;
1702		if (from <= ch && ch <= to)
1703			if (it == NOCELT || ch < it)
1704				it = ch;
1705	}
1706	return it;
1707}
1708
1709/*
1710 - wordchrs - set up word-chr list for word-boundary stuff, if needed
1711 * The list is kept as a bunch of arcs between two dummy states; it's
1712 * disposed of by the unreachable-states sweep in NFA optimization.
1713 * Does NEXT().  Must not be called from any unusual lexical context.
1714 * This should be reconciled with the \w etc. handling in lex.c, and
1715 * should be cleaned up to reduce dependencies on input scanning.
1716 ^ static VOID wordchrs(struct vars *);
1717 */
1718static VOID
1719wordchrs(v)
1720struct vars *v;
1721{
1722	struct state *left;
1723	struct state *right;
1724
1725	if (v->wordchrs != NULL) {
1726		NEXT();		/* for consistency */
1727		return;
1728	}
1729
1730	left = newstate(v->nfa);
1731	right = newstate(v->nfa);
1732	NOERR();
1733	/* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
1734	lexword(v);
1735	NEXT();
1736	assert(v->savenow != NULL && SEE('['));
1737	bracket(v, left, right);
1738	assert((v->savenow != NULL && SEE(']')) || ISERR());
1739	NEXT();
1740	NOERR();
1741	v->wordchrs = left;
1742}
1743
1744/*
1745 - subre - allocate a subre
1746 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1747 ^	struct state *);
1748 */
1749static struct subre *
1750subre(v, op, flags, begin, end)
1751struct vars *v;
1752int op;
1753int flags;
1754struct state *begin;
1755struct state *end;
1756{
1757	struct subre *ret;
1758
1759	ret = v->treefree;
1760	if (ret != NULL)
1761		v->treefree = ret->left;
1762	else {
1763		ret = (struct subre *)MALLOC(sizeof(struct subre));
1764		if (ret == NULL) {
1765			ERR(REG_ESPACE);
1766			return NULL;
1767		}
1768		ret->chain = v->treechain;
1769		v->treechain = ret;
1770	}
1771
1772	assert(strchr("|.b(=", op) != NULL);
1773
1774	ret->op = op;
1775	ret->flags = flags;
1776	ret->retry = 0;
1777	ret->subno = 0;
1778	ret->min = ret->max = 1;
1779	ret->left = NULL;
1780	ret->right = NULL;
1781	ret->begin = begin;
1782	ret->end = end;
1783	ZAPCNFA(ret->cnfa);
1784
1785	return ret;
1786}
1787
1788/*
1789 - freesubre - free a subRE subtree
1790 ^ static VOID freesubre(struct vars *, struct subre *);
1791 */
1792static VOID
1793freesubre(v, sr)
1794struct vars *v;			/* might be NULL */
1795struct subre *sr;
1796{
1797	if (sr == NULL)
1798		return;
1799
1800	if (sr->left != NULL)
1801		freesubre(v, sr->left);
1802	if (sr->right != NULL)
1803		freesubre(v, sr->right);
1804
1805	freesrnode(v, sr);
1806}
1807
1808/*
1809 - freesrnode - free one node in a subRE subtree
1810 ^ static VOID freesrnode(struct vars *, struct subre *);
1811 */
1812static VOID
1813freesrnode(v, sr)
1814struct vars *v;			/* might be NULL */
1815struct subre *sr;
1816{
1817	if (sr == NULL)
1818		return;
1819
1820	if (!NULLCNFA(sr->cnfa))
1821		freecnfa(&sr->cnfa);
1822	sr->flags = 0;
1823
1824	if (v != NULL) {
1825		sr->left = v->treefree;
1826		v->treefree = sr;
1827	} else
1828		FREE(sr);
1829}
1830
1831/*
1832 - optst - optimize a subRE subtree
1833 ^ static VOID optst(struct vars *, struct subre *);
1834 */
1835static VOID
1836optst(v, t)
1837struct vars *v;
1838struct subre *t;
1839{
1840    /*
1841     * DGP (2007-11-13): I assume it was the programmer's intent to eventually
1842     * come back and add code to optimize subRE trees, but the routine coded
1843     * just spent effort traversing the tree and doing nothing. We can do
1844     * nothing with less effort.
1845     */
1846
1847    return;
1848}
1849
1850/*
1851 - numst - number tree nodes (assigning retry indexes)
1852 ^ static int numst(struct subre *, int);
1853 */
1854static int			/* next number */
1855numst(t, start)
1856struct subre *t;
1857int start;			/* starting point for subtree numbers */
1858{
1859	int i;
1860
1861	assert(t != NULL);
1862
1863	i = start;
1864	t->retry = (short)i++;
1865	if (t->left != NULL)
1866		i = numst(t->left, i);
1867	if (t->right != NULL)
1868		i = numst(t->right, i);
1869	return i;
1870}
1871
1872/*
1873 - markst - mark tree nodes as INUSE
1874 ^ static VOID markst(struct subre *);
1875 */
1876static VOID
1877markst(t)
1878struct subre *t;
1879{
1880	assert(t != NULL);
1881
1882	t->flags |= INUSE;
1883	if (t->left != NULL)
1884		markst(t->left);
1885	if (t->right != NULL)
1886		markst(t->right);
1887}
1888
1889/*
1890 - cleanst - free any tree nodes not marked INUSE
1891 ^ static VOID cleanst(struct vars *);
1892 */
1893static VOID
1894cleanst(v)
1895struct vars *v;
1896{
1897	struct subre *t;
1898	struct subre *next;
1899
1900	for (t = v->treechain; t != NULL; t = next) {
1901		next = t->chain;
1902		if (!(t->flags&INUSE))
1903			FREE(t);
1904	}
1905	v->treechain = NULL;
1906	v->treefree = NULL;		/* just on general principles */
1907}
1908
1909/*
1910 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1911 ^ static long nfatree(struct vars *, struct subre *, FILE *);
1912 */
1913static long			/* optimize results from top node */
1914nfatree(v, t, f)
1915struct vars *v;
1916struct subre *t;
1917FILE *f;			/* for debug output */
1918{
1919	assert(t != NULL && t->begin != NULL);
1920
1921	if (t->left != NULL)
1922		(DISCARD)nfatree(v, t->left, f);
1923	if (t->right != NULL)
1924		(DISCARD)nfatree(v, t->right, f);
1925
1926	return nfanode(v, t, f);
1927}
1928
1929/*
1930 - nfanode - do one NFA for nfatree
1931 ^ static long nfanode(struct vars *, struct subre *, FILE *);
1932 */
1933static long			/* optimize results */
1934nfanode(v, t, f)
1935struct vars *v;
1936struct subre *t;
1937FILE *f;			/* for debug output */
1938{
1939	struct nfa *nfa;
1940	long ret = 0;
1941	char idbuf[50];
1942
1943	assert(t->begin != NULL);
1944
1945	if (f != NULL)
1946		fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1947						stid(t, idbuf, sizeof(idbuf)));
1948	nfa = newnfa(v, v->cm, v->nfa);
1949	NOERRZ();
1950	dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1951	if (!ISERR()) {
1952		specialcolors(nfa);
1953		ret = optimize(nfa, f);
1954	}
1955	if (!ISERR())
1956		compact(nfa, &t->cnfa);
1957
1958	freenfa(nfa);
1959	return ret;
1960}
1961
1962/*
1963 - newlacon - allocate a lookahead-constraint subRE
1964 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
1965 */
1966static int			/* lacon number */
1967newlacon(v, begin, end, pos)
1968struct vars *v;
1969struct state *begin;
1970struct state *end;
1971int pos;
1972{
1973	int n;
1974	struct subre *sub;
1975
1976	if (v->nlacons == 0) {
1977		v->lacons = (struct subre *)MALLOC(2 * sizeof(struct subre));
1978		n = 1;		/* skip 0th */
1979		v->nlacons = 2;
1980	} else {
1981		v->lacons = (struct subre *)REALLOC(v->lacons,
1982					(v->nlacons+1)*sizeof(struct subre));
1983		n = v->nlacons++;
1984	}
1985	if (v->lacons == NULL) {
1986		ERR(REG_ESPACE);
1987		return 0;
1988	}
1989	sub = &v->lacons[n];
1990	sub->begin = begin;
1991	sub->end = end;
1992	sub->subno = pos;
1993	ZAPCNFA(sub->cnfa);
1994	return n;
1995}
1996
1997/*
1998 - freelacons - free lookahead-constraint subRE vector
1999 ^ static VOID freelacons(struct subre *, int);
2000 */
2001static VOID
2002freelacons(subs, n)
2003struct subre *subs;
2004int n;
2005{
2006	struct subre *sub;
2007	int i;
2008
2009	assert(n > 0);
2010	for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)	/* no 0th */
2011		if (!NULLCNFA(sub->cnfa))
2012			freecnfa(&sub->cnfa);
2013	FREE(subs);
2014}
2015
2016/*
2017 - rfree - free a whole RE (insides of regfree)
2018 ^ static VOID rfree(regex_t *);
2019 */
2020static VOID
2021rfree(re)
2022regex_t *re;
2023{
2024	struct guts *g;
2025
2026	if (re == NULL || re->re_magic != REMAGIC)
2027		return;
2028
2029	re->re_magic = 0;	/* invalidate RE */
2030	g = (struct guts *)re->re_guts;
2031	re->re_guts = NULL;
2032	re->re_fns = NULL;
2033	g->magic = 0;
2034	freecm(&g->cmap);
2035	if (g->tree != NULL)
2036		freesubre((struct vars *)NULL, g->tree);
2037	if (g->lacons != NULL)
2038		freelacons(g->lacons, g->nlacons);
2039	if (!NULLCNFA(g->search))
2040		freecnfa(&g->search);
2041	FREE(g);
2042}
2043
2044/*
2045 - dump - dump an RE in human-readable form
2046 ^ static VOID dump(regex_t *, FILE *);
2047 */
2048static VOID
2049dump(re, f)
2050regex_t *re;
2051FILE *f;
2052{
2053#ifdef REG_DEBUG
2054	struct guts *g;
2055	int i;
2056
2057	if (re->re_magic != REMAGIC)
2058		fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2059								REMAGIC);
2060	if (re->re_guts == NULL) {
2061		fprintf(f, "NULL guts!!!\n");
2062		return;
2063	}
2064	g = (struct guts *)re->re_guts;
2065	if (g->magic != GUTSMAGIC)
2066		fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2067								GUTSMAGIC);
2068
2069	fprintf(f, "\n\n\n========= DUMP ==========\n");
2070	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2071		re->re_nsub, re->re_info, re->re_csize, g->ntree);
2072
2073	dumpcolors(&g->cmap, f);
2074	if (!NULLCNFA(g->search)) {
2075		printf("\nsearch:\n");
2076		dumpcnfa(&g->search, f);
2077	}
2078	for (i = 1; i < g->nlacons; i++) {
2079		fprintf(f, "\nla%d (%s):\n", i,
2080				(g->lacons[i].subno) ? "positive" : "negative");
2081		dumpcnfa(&g->lacons[i].cnfa, f);
2082	}
2083	fprintf(f, "\n");
2084	dumpst(g->tree, f, 0);
2085#endif
2086}
2087
2088/*
2089 - dumpst - dump a subRE tree
2090 ^ static VOID dumpst(struct subre *, FILE *, int);
2091 */
2092static VOID
2093dumpst(t, f, nfapresent)
2094struct subre *t;
2095FILE *f;
2096int nfapresent;			/* is the original NFA still around? */
2097{
2098	if (t == NULL)
2099		fprintf(f, "null tree\n");
2100	else
2101		stdump(t, f, nfapresent);
2102	fflush(f);
2103}
2104
2105/*
2106 - stdump - recursive guts of dumpst
2107 ^ static VOID stdump(struct subre *, FILE *, int);
2108 */
2109static VOID
2110stdump(t, f, nfapresent)
2111struct subre *t;
2112FILE *f;
2113int nfapresent;			/* is the original NFA still around? */
2114{
2115	char idbuf[50];
2116
2117	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2118	if (t->flags&LONGER)
2119		fprintf(f, " longest");
2120	if (t->flags&SHORTER)
2121		fprintf(f, " shortest");
2122	if (t->flags&MIXED)
2123		fprintf(f, " hasmixed");
2124	if (t->flags&CAP)
2125		fprintf(f, " hascapture");
2126	if (t->flags&BACKR)
2127		fprintf(f, " hasbackref");
2128	if (!(t->flags&INUSE))
2129		fprintf(f, " UNUSED");
2130	if (t->subno != 0)
2131		fprintf(f, " (#%d)", t->subno);
2132	if (t->min != 1 || t->max != 1) {
2133		fprintf(f, " {%d,", t->min);
2134		if (t->max != INFINITY)
2135			fprintf(f, "%d", t->max);
2136		fprintf(f, "}");
2137	}
2138	if (nfapresent)
2139		fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
2140	if (t->left != NULL)
2141		fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2142	if (t->right != NULL)
2143		fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2144	if (!NULLCNFA(t->cnfa)) {
2145		fprintf(f, "\n");
2146		dumpcnfa(&t->cnfa, f);
2147	}
2148	fprintf(f, "\n");
2149	if (t->left != NULL)
2150		stdump(t->left, f, nfapresent);
2151	if (t->right != NULL)
2152		stdump(t->right, f, nfapresent);
2153}
2154
2155/*
2156 - stid - identify a subtree node for dumping
2157 ^ static char *stid(struct subre *, char *, size_t);
2158 */
2159static char *			/* points to buf or constant string */
2160stid(t, buf, bufsize)
2161struct subre *t;
2162char *buf;
2163size_t bufsize;
2164{
2165	/* big enough for hex int or decimal t->retry? */
2166	if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
2167		return "unable";
2168	if (t->retry != 0)
2169		sprintf(buf, "%d", t->retry);
2170	else
2171		sprintf(buf, "%p", t);
2172	return buf;
2173}
2174
2175#include "regc_lex.c"
2176#include "regc_color.c"
2177#include "regc_nfa.c"
2178#include "regc_cvec.c"
2179#include "regc_locale.c"
2180