1/*
2 * re_*comp and friends - compile REs
3 * This file #includes several others (see the bottom).
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation of
18 * software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#include "regguts.h"
34
35/*
36 * forward declarations, up here so forward datatypes etc. are defined early
37 */
38/* =====^!^===== begin forwards =====^!^===== */
39/* automatically gathered by fwd; do not hand-edit */
40/* === regcomp.c === */
41int compile(regex_t *, const chr *, size_t, int);
42static void moresubs(struct vars *, int);
43static int freev(struct vars *, int);
44static void makesearch(struct vars *, struct nfa *);
45static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
46static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
47static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
48static void nonword(struct vars *, int, struct state *, struct state *);
49static void word(struct vars *, int, struct state *, struct state *);
50static int scannum(struct vars *);
51static void repeat(struct vars *, struct state *, struct state *, int, int);
52static void bracket(struct vars *, struct state *, struct state *);
53static void cbracket(struct vars *, struct state *, struct state *);
54static void brackpart(struct vars *, struct state *, struct state *);
55static const chr *scanplain(struct vars *);
56static void onechr(struct vars *, pchr, struct state *, struct state *);
57static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
58static void wordchrs(struct vars *);
59static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
60static void freesubre(struct vars *, struct subre *);
61static void freesrnode(struct vars *, struct subre *);
62static void optst(struct vars *, struct subre *);
63static int numst(struct subre *, int);
64static void markst(struct subre *);
65static void cleanst(struct vars *);
66static long nfatree(struct vars *, struct subre *, FILE *);
67static long nfanode(struct vars *, struct subre *, FILE *);
68static int newlacon(struct vars *, struct state *, struct state *, int);
69static void freelacons(struct subre *, int);
70static void rfree(regex_t *);
71static void dump(regex_t *, FILE *);
72static void dumpst(struct subre *, FILE *, int);
73static void stdump(struct subre *, FILE *, int);
74static const char *stid(struct subre *, char *, size_t);
75/* === regc_lex.c === */
76static void lexstart(struct vars *);
77static void prefixes(struct vars *);
78static void lexnest(struct vars *, const chr *, const chr *);
79static void lexword(struct vars *);
80static int next(struct vars *);
81static int lexescape(struct vars *);
82static chr lexdigits(struct vars *, int, int, int);
83static int brenext(struct vars *, pchr);
84static void skip(struct vars *);
85static chr newline(NOPARMS);
86#ifdef REG_DEBUG
87static const chr *ch(NOPARMS);
88#endif
89static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
90/* === regc_color.c === */
91static void initcm(struct vars *, struct colormap *);
92static void freecm(struct colormap *);
93static void cmtreefree(struct colormap *, union tree *, int);
94static color setcolor(struct colormap *, pchr, pcolor);
95static color maxcolor(struct colormap *);
96static color newcolor(struct colormap *);
97static void freecolor(struct colormap *, pcolor);
98static color pseudocolor(struct colormap *);
99static color subcolor(struct colormap *, pchr c);
100static color newsub(struct colormap *, pcolor);
101static void subrange(struct vars *, pchr, pchr, struct state *, struct state *);
102static void subblock(struct vars *, pchr, struct state *, struct state *);
103static void okcolors(struct nfa *, struct colormap *);
104static void colorchain(struct colormap *, struct arc *);
105static void uncolorchain(struct colormap *, struct arc *);
106static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
107static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
108#ifdef REG_DEBUG
109static void dumpcolors(struct colormap *, FILE *);
110static void fillcheck(struct colormap *, union tree *, int, FILE *);
111static void dumpchr(pchr, FILE *);
112#endif
113/* === regc_nfa.c === */
114static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
115static void freenfa(struct nfa *);
116static struct state *newstate(struct nfa *);
117static struct state *newfstate(struct nfa *, int flag);
118static void dropstate(struct nfa *, struct state *);
119static void freestate(struct nfa *, struct state *);
120static void destroystate(struct nfa *, struct state *);
121static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
122static struct arc *allocarc(struct nfa *, struct state *);
123static void freearc(struct nfa *, struct arc *);
124static struct arc *findarc(struct state *, int, pcolor);
125static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
126static void moveins(struct nfa *, struct state *, struct state *);
127static void copyins(struct nfa *, struct state *, struct state *);
128static void moveouts(struct nfa *, struct state *, struct state *);
129static void copyouts(struct nfa *, struct state *, struct state *);
130static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
131static void delsub(struct nfa *, struct state *, struct state *);
132static void deltraverse(struct nfa *, struct state *, struct state *);
133static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
134static void duptraverse(struct nfa *, struct state *, struct state *);
135static void cleartraverse(struct nfa *, struct state *);
136static void specialcolors(struct nfa *);
137static long optimize(struct nfa *, FILE *);
138static void pullback(struct nfa *, FILE *);
139static int pull(struct nfa *, struct arc *);
140static void pushfwd(struct nfa *, FILE *);
141static int push(struct nfa *, struct arc *);
142#define	INCOMPATIBLE	1	/* destroys arc */
143#define	SATISFIED	2	/* constraint satisfied */
144#define	COMPATIBLE	3	/* compatible but not satisfied yet */
145static int combine(struct arc *, struct arc *);
146static void fixempties(struct nfa *, FILE *);
147static int unempty(struct nfa *, struct arc *);
148static void cleanup(struct nfa *);
149static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
150static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
151static long analyze(struct nfa *);
152static void compact(struct nfa *, struct cnfa *);
153static void carcsort(struct carc *, struct carc *);
154static void freecnfa(struct cnfa *);
155static void dumpnfa(struct nfa *, FILE *);
156#ifdef REG_DEBUG
157static void dumpstate(struct state *, FILE *);
158static void dumparcs(struct state *, FILE *);
159static int dumprarcs(struct arc *, struct state *, FILE *, int);
160static void dumparc(struct arc *, struct state *, FILE *);
161#endif
162static void dumpcnfa(struct cnfa *, FILE *);
163#ifdef REG_DEBUG
164static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
165#endif
166/* === regc_cvec.c === */
167static struct cvec *clearcvec(struct cvec *);
168static void addchr(struct cvec *, pchr);
169static void addrange(struct cvec *, pchr, pchr);
170static struct cvec *newcvec(int, int);
171static struct cvec *getcvec(struct vars *, int, int);
172static void freecvec(struct cvec *);
173/* === regc_locale.c === */
174static celt element(struct vars *, const chr *, const chr *);
175static struct cvec *range(struct vars *, celt, celt, int);
176static int before(celt, celt);
177static struct cvec *eclass(struct vars *, celt, int);
178static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
179static struct cvec *allcases(struct vars *, pchr);
180static int cmp(const chr *, const chr *, size_t);
181static int casecmp(const chr *, const chr *, size_t);
182/* automatically gathered by fwd; do not hand-edit */
183/* =====^!^===== end forwards =====^!^===== */
184
185/* internal variables, bundled for easy passing around */
186struct vars {
187    regex_t *re;
188    const chr *now;		/* scan pointer into string */
189    const chr *stop;		/* end of string */
190    const chr *savenow;		/* saved now and stop for "subroutine call" */
191    const chr *savestop;
192    int err;			/* error code (0 if none) */
193    int cflags;			/* copy of compile flags */
194    int lasttype;		/* type of previous token */
195    int nexttype;		/* type of next token */
196    chr nextvalue;		/* value (if any) of next token */
197    int lexcon;			/* lexical context type (see lex.c) */
198    int nsubexp;		/* subexpression count */
199    struct subre **subs;	/* subRE pointer vector */
200    size_t nsubs;		/* length of vector */
201    struct subre *sub10[10];	/* initial vector, enough for most */
202    struct nfa *nfa;		/* the NFA */
203    struct colormap *cm;	/* character color map */
204    color nlcolor;		/* color of newline */
205    struct state *wordchrs;	/* state in nfa holding word-char outarcs */
206    struct subre *tree;		/* subexpression tree */
207    struct subre *treechain;	/* all tree nodes allocated */
208    struct subre *treefree;	/* any free tree nodes */
209    int ntree;			/* number of tree nodes */
210    struct cvec *cv;		/* interface cvec */
211    struct cvec *cv2;		/* utility cvec */
212    struct subre *lacons;	/* lookahead-constraint vector */
213    int nlacons;		/* size of lacons */
214};
215
216/* parsing macros; most know that `v' is the struct vars pointer */
217#define	NEXT()	(next(v))		/* advance by one token */
218#define	SEE(t)	(v->nexttype == (t))	/* is next token this? */
219#define	EAT(t)	(SEE(t) && next(v))	/* if next is this, swallow it */
220#define	VISERR(vv)	((vv)->err != 0)/* have we seen an error yet? */
221#define	ISERR()	VISERR(v)
222#define	VERR(vv,e) \
223	((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e)))
224#define	ERR(e)	VERR(v, e)		/* record an error */
225#define	NOERR()	{if (ISERR()) return;}	/* if error seen, return */
226#define	NOERRN()	{if (ISERR()) return NULL;}	/* NOERR with retval */
227#define	NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
228#define	INSIST(c, e)	((c) ? 0 : ERR(e))	/* if condition false, error */
229#define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
230#define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
231
232/* token type codes, some also used as NFA arc types */
233#define	EMPTY	'n'		/* no token present */
234#define	EOS	'e'		/* end of string */
235#define	PLAIN	'p'		/* ordinary character */
236#define	DIGIT	'd'		/* digit (in bound) */
237#define	BACKREF	'b'		/* back reference */
238#define	COLLEL	'I'		/* start of [. */
239#define	ECLASS	'E'		/* start of [= */
240#define	CCLASS	'C'		/* start of [: */
241#define	END	'X'		/* end of [. [= [: */
242#define	RANGE	'R'		/* - within [] which might be range delim. */
243#define	LACON	'L'		/* lookahead constraint subRE */
244#define	AHEAD	'a'		/* color-lookahead arc */
245#define	BEHIND	'r'		/* color-lookbehind arc */
246#define	WBDRY	'w'		/* word boundary constraint */
247#define	NWBDRY	'W'		/* non-word-boundary constraint */
248#define	SBEGIN	'A'		/* beginning of string (even if not BOL) */
249#define	SEND	'Z'		/* end of string (even if not EOL) */
250#define	PREFER	'P'		/* length preference */
251
252/* is an arc colored, and hence on a color chain? */
253#define	COLORED(a) \
254	((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
255
256/* static function list */
257static struct fns functions = {
258    rfree,			/* regfree insides */
259};
260
261/*
262 - compile - compile regular expression
263 ^ int compile(regex_t *, const chr *, size_t, int);
264 */
265int
266compile(
267    regex_t *re,
268    const chr *string,
269    size_t len,
270    int flags)
271{
272    AllocVars(v);
273    struct guts *g;
274    int i;
275    size_t j;
276    FILE *debug = (flags&REG_PROGRESS) ? stdout : NULL;
277#define	CNOERR()	{ if (ISERR()) return freev(v, v->err); }
278
279    /*
280     * Sanity checks.
281     */
282
283    if (re == NULL || string == NULL) {
284	FreeVars(v);
285	return REG_INVARG;
286    }
287    if ((flags&REG_QUOTE) && (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))) {
288	FreeVars(v);
289	return REG_INVARG;
290    }
291    if (!(flags&REG_EXTENDED) && (flags&REG_ADVF)) {
292	FreeVars(v);
293	return REG_INVARG;
294    }
295
296    /*
297     * Initial setup (after which freev() is callable).
298     */
299
300    v->re = re;
301    v->now = string;
302    v->stop = v->now + len;
303    v->savenow = v->savestop = NULL;
304    v->err = 0;
305    v->cflags = flags;
306    v->nsubexp = 0;
307    v->subs = v->sub10;
308    v->nsubs = 10;
309    for (j = 0; j < v->nsubs; j++) {
310	v->subs[j] = NULL;
311    }
312    v->nfa = NULL;
313    v->cm = NULL;
314    v->nlcolor = COLORLESS;
315    v->wordchrs = NULL;
316    v->tree = NULL;
317    v->treechain = NULL;
318    v->treefree = NULL;
319    v->cv = NULL;
320    v->cv2 = NULL;
321    v->lacons = NULL;
322    v->nlacons = 0;
323    re->re_magic = REMAGIC;
324    re->re_info = 0;		/* bits get set during parse */
325    re->re_csize = sizeof(chr);
326    re->re_guts = NULL;
327    re->re_fns = VS(&functions);
328
329    /*
330     * More complex setup, malloced things.
331     */
332
333    re->re_guts = VS(MALLOC(sizeof(struct guts)));
334    if (re->re_guts == NULL) {
335	return freev(v, REG_ESPACE);
336    }
337    g = (struct guts *) re->re_guts;
338    g->tree = NULL;
339    initcm(v, &g->cmap);
340    v->cm = &g->cmap;
341    g->lacons = NULL;
342    g->nlacons = 0;
343    ZAPCNFA(g->search);
344    v->nfa = newnfa(v, v->cm, NULL);
345    CNOERR();
346    v->cv = newcvec(100, 20);
347    if (v->cv == NULL) {
348	return freev(v, REG_ESPACE);
349    }
350
351    /*
352     * Parsing.
353     */
354
355    lexstart(v);		/* also handles prefixes */
356    if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
357	/*
358	 * Assign newline a unique color.
359	 */
360
361	v->nlcolor = subcolor(v->cm, newline());
362	okcolors(v->nfa, v->cm);
363    }
364    CNOERR();
365    v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
366    assert(SEE(EOS));		/* even if error; ISERR() => SEE(EOS) */
367    CNOERR();
368    assert(v->tree != NULL);
369
370    /*
371     * Finish setup of nfa and its subre tree.
372     */
373
374    specialcolors(v->nfa);
375    CNOERR();
376    if (debug != NULL) {
377	fprintf(debug, "\n\n\n========= RAW ==========\n");
378	dumpnfa(v->nfa, debug);
379	dumpst(v->tree, debug, 1);
380    }
381    optst(v, v->tree);
382    v->ntree = numst(v->tree, 1);
383    markst(v->tree);
384    cleanst(v);
385    if (debug != NULL) {
386	fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
387	dumpst(v->tree, debug, 1);
388    }
389
390    /*
391     * Build compacted NFAs for tree and lacons.
392     */
393
394    re->re_info |= nfatree(v, v->tree, debug);
395    CNOERR();
396    assert(v->nlacons == 0 || v->lacons != NULL);
397    for (i = 1; i < v->nlacons; i++) {
398	if (debug != NULL) {
399	    fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
400	}
401	nfanode(v, &v->lacons[i], debug);
402    }
403    CNOERR();
404    if (v->tree->flags&SHORTER) {
405	NOTE(REG_USHORTEST);
406    }
407
408    /*
409     * Build compacted NFAs for tree, lacons, fast search.
410     */
411
412    if (debug != NULL) {
413	fprintf(debug, "\n\n\n========= SEARCH ==========\n");
414    }
415
416    /*
417     * Can sacrifice main NFA now, so use it as work area.
418     */
419
420    (DISCARD) optimize(v->nfa, debug);
421    CNOERR();
422    makesearch(v, v->nfa);
423    CNOERR();
424    compact(v->nfa, &g->search);
425    CNOERR();
426
427    /*
428     * Looks okay, package it up.
429     */
430
431    re->re_nsub = v->nsubexp;
432    v->re = NULL;		/* freev no longer frees re */
433    g->magic = GUTSMAGIC;
434    g->cflags = v->cflags;
435    g->info = re->re_info;
436    g->nsub = re->re_nsub;
437    g->tree = v->tree;
438    v->tree = NULL;
439    g->ntree = v->ntree;
440    g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
441    g->lacons = v->lacons;
442    v->lacons = NULL;
443    g->nlacons = v->nlacons;
444
445    if (flags&REG_DUMP) {
446	dump(re, stdout);
447    }
448
449    assert(v->err == 0);
450    return freev(v, 0);
451}
452
453/*
454 - moresubs - enlarge subRE vector
455 ^ static void moresubs(struct vars *, int);
456 */
457static void
458moresubs(
459    struct vars *v,
460    int wanted)			/* want enough room for this one */
461{
462    struct subre **p;
463    size_t n;
464
465    assert(wanted > 0 && (size_t)wanted >= v->nsubs);
466    n = (size_t)wanted * 3 / 2 + 1;
467    if (v->subs == v->sub10) {
468	p = (struct subre **) MALLOC(n * sizeof(struct subre *));
469	if (p != NULL) {
470	    memcpy(p, v->subs, v->nsubs * sizeof(struct subre *));
471	}
472    } else {
473	p = (struct subre **) REALLOC(v->subs, n*sizeof(struct subre *));
474    }
475    if (p == NULL) {
476	ERR(REG_ESPACE);
477	return;
478    }
479
480    v->subs = p;
481    for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) {
482	*p = NULL;
483    }
484    assert(v->nsubs == n);
485    assert((size_t)wanted < v->nsubs);
486}
487
488/*
489 - freev - free vars struct's substructures where necessary
490 * Optionally does error-number setting, and always returns error code (if
491 * any), to make error-handling code terser.
492 ^ static int freev(struct vars *, int);
493 */
494static int
495freev(
496    struct vars *v,
497    int err)
498{
499    register int ret;
500
501    if (v->re != NULL) {
502	rfree(v->re);
503    }
504    if (v->subs != v->sub10) {
505	FREE(v->subs);
506    }
507    if (v->nfa != NULL) {
508	freenfa(v->nfa);
509    }
510    if (v->tree != NULL) {
511	freesubre(v, v->tree);
512    }
513    if (v->treechain != NULL) {
514	cleanst(v);
515    }
516    if (v->cv != NULL) {
517	freecvec(v->cv);
518    }
519    if (v->cv2 != NULL) {
520	freecvec(v->cv2);
521    }
522    if (v->lacons != NULL) {
523	freelacons(v->lacons, v->nlacons);
524    }
525    ERR(err);			/* nop if err==0 */
526
527    ret = v->err;
528    FreeVars(v);
529    return ret;
530}
531
532/*
533 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
534 * NFA must have been optimize()d already.
535 ^ static void makesearch(struct vars *, struct nfa *);
536 */
537static void
538makesearch(
539    struct vars *v,
540    struct nfa *nfa)
541{
542    struct arc *a, *b;
543    struct state *pre = nfa->pre;
544    struct state *s, *s2, *slist;
545
546    /*
547     * No loops are needed if it's anchored.
548     */
549
550    for (a = pre->outs; a != NULL; a = a->outchain) {
551	assert(a->type == PLAIN);
552	if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) {
553	    break;
554	}
555    }
556    if (a != NULL) {
557	/*
558	 * Add implicit .* in front.
559	 */
560
561	rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
562
563	/*
564	 * And ^* and \A* too -- not always necessary, but harmless.
565	 */
566
567	newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
568	newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
569    }
570
571    /*
572     * Now here's the subtle part. Because many REs have no lookback
573     * constraints, often knowing when you were in the pre state tells you
574     * little; it's the next state(s) that are informative. But some of them
575     * may have other inarcs, i.e. it may be possible to make actual progress
576     * and then return to one of them. We must de-optimize such cases,
577     * splitting each such state into progress and no-progress states.
578     */
579
580    /*
581     * First, make a list of the states.
582     */
583
584    slist = NULL;
585    for (a=pre->outs ; a!=NULL ; a=a->outchain) {
586	s = a->to;
587	for (b=s->ins ; b!=NULL ; b=b->inchain) {
588	    if (b->from != pre) {
589		break;
590	    }
591	}
592	if (b != NULL && s->tmp == NULL) {
593	    /*
594	     * Must be split if not already in the list (fixes bugs 505048,
595	     * 230589, 840258, 504785).
596	     */
597
598	    s->tmp = slist;
599	    slist = s;
600	}
601    }
602
603    /*
604     * Do the splits.
605     */
606
607    for (s=slist ; s!=NULL ; s=s2) {
608	s2 = newstate(nfa);
609
610	copyouts(nfa, s, s2);
611	for (a=s->ins ; a!=NULL ; a=b) {
612	    b = a->inchain;
613
614	    if (a->from != pre) {
615		cparc(nfa, a, a->from, s2);
616		freearc(nfa, a);
617	    }
618	}
619	s2 = s->tmp;
620	s->tmp = NULL;		/* clean up while we're at it */
621    }
622}
623
624/*
625 - parse - parse an RE
626 * This is actually just the top level, which parses a bunch of branches tied
627 * together with '|'. They appear in the tree as the left children of a chain
628 * of '|' subres.
629 ^ static struct subre *parse(struct vars *, int, int, struct state *,
630 ^ 	struct state *);
631 */
632static struct subre *
633parse(
634    struct vars *v,
635    int stopper,		/* EOS or ')' */
636    int type,			/* LACON (lookahead subRE) or PLAIN */
637    struct state *init,		/* initial state */
638    struct state *final)	/* final state */
639{
640    struct state *left, *right;	/* scaffolding for branch */
641    struct subre *branches;	/* top level */
642    struct subre *branch;	/* current branch */
643    struct subre *t;		/* temporary */
644    int firstbranch;		/* is this the first branch? */
645
646    assert(stopper == ')' || stopper == EOS);
647
648    branches = subre(v, '|', LONGER, init, final);
649    NOERRN();
650    branch = branches;
651    firstbranch = 1;
652    do {	/* a branch */
653	if (!firstbranch) {
654	    /*
655	     * Need a place to hang the branch.
656	     */
657
658	    branch->right = subre(v, '|', LONGER, init, final);
659	    NOERRN();
660	    branch = branch->right;
661	}
662	firstbranch = 0;
663	left = newstate(v->nfa);
664	right = newstate(v->nfa);
665	NOERRN();
666	EMPTYARC(init, left);
667	EMPTYARC(right, final);
668	NOERRN();
669	branch->left = parsebranch(v, stopper, type, left, right, 0);
670	NOERRN();
671	branch->flags |= UP(branch->flags | branch->left->flags);
672	if ((branch->flags &~ branches->flags) != 0) {	/* new flags */
673	    for (t = branches; t != branch; t = t->right) {
674		t->flags |= branch->flags;
675	    }
676	}
677    } while (EAT('|'));
678    assert(SEE(stopper) || SEE(EOS));
679
680    if (!SEE(stopper)) {
681	assert(stopper == ')' && SEE(EOS));
682	ERR(REG_EPAREN);
683    }
684
685    /*
686     * Optimize out simple cases.
687     */
688
689    if (branch == branches) {	/* only one branch */
690	assert(branch->right == NULL);
691	t = branch->left;
692	branch->left = NULL;
693	freesubre(v, branches);
694	branches = t;
695    } else if (!MESSY(branches->flags)) {	/* no interesting innards */
696	freesubre(v, branches->left);
697	branches->left = NULL;
698	freesubre(v, branches->right);
699	branches->right = NULL;
700	branches->op = '=';
701    }
702
703    return branches;
704}
705
706/*
707 - parsebranch - parse one branch of an RE
708 * This mostly manages concatenation, working closely with parseqatom().
709 * Concatenated things are bundled up as much as possible, with separate
710 * ',' nodes introduced only when necessary due to substructure.
711 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
712 ^ 	struct state *, int);
713 */
714static struct subre *
715parsebranch(
716    struct vars *v,
717    int stopper,		/* EOS or ')' */
718    int type,			/* LACON (lookahead subRE) or PLAIN */
719    struct state *left,		/* leftmost state */
720    struct state *right,	/* rightmost state */
721    int partial)		/* is this only part of a branch? */
722{
723    struct state *lp;		/* left end of current construct */
724    int seencontent;		/* is there anything in this branch yet? */
725    struct subre *t;
726
727    lp = left;
728    seencontent = 0;
729    t = subre(v, '=', 0, left, right);	/* op '=' is tentative */
730    NOERRN();
731    while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
732	if (seencontent) {	/* implicit concat operator */
733	    lp = newstate(v->nfa);
734	    NOERRN();
735	    moveins(v->nfa, right, lp);
736	}
737	seencontent = 1;
738
739	/* NB, recursion in parseqatom() may swallow rest of branch */
740	parseqatom(v, stopper, type, lp, right, t);
741    }
742
743    if (!seencontent) {		/* empty branch */
744	if (!partial) {
745	    NOTE(REG_UUNSPEC);
746	}
747	assert(lp == left);
748	EMPTYARC(left, right);
749    }
750
751    return t;
752}
753
754/*
755 - parseqatom - parse one quantified atom or constraint of an RE
756 * The bookkeeping near the end cooperates very closely with parsebranch(); in
757 * particular, it contains a recursion that can involve parsing the rest of
758 * the branch, making this function's name somewhat inaccurate.
759 ^ static void parseqatom(struct vars *, int, int, struct state *,
760 ^ 	struct state *, struct subre *);
761 */
762static void
763parseqatom(
764    struct vars *v,
765    int stopper,		/* EOS or ')' */
766    int type,			/* LACON (lookahead subRE) or PLAIN */
767    struct state *lp,		/* left state to hang it on */
768    struct state *rp,		/* right state to hang it on */
769    struct subre *top)		/* subtree top */
770{
771    struct state *s;		/* temporaries for new states */
772    struct state *s2;
773#define	ARCV(t, val)	newarc(v->nfa, t, val, lp, rp)
774    int m, n;
775    struct subre *atom;		/* atom's subtree */
776    struct subre *t;
777    int cap;			/* capturing parens? */
778    int pos;			/* positive lookahead? */
779    int subno;			/* capturing-parens or backref number */
780    int atomtype;
781    int qprefer;		/* quantifier short/long preference */
782    int f;
783    struct subre **atomp;	/* where the pointer to atom is */
784
785    /*
786     * Initial bookkeeping.
787     */
788
789    atom = NULL;
790    assert(lp->nouts == 0);	/* must string new code */
791    assert(rp->nins == 0);	/* between lp and rp */
792    subno = 0;			/* just to shut lint up */
793
794    /*
795     * An atom or constraint...
796     */
797
798    atomtype = v->nexttype;
799    switch (atomtype) {
800	/* first, constraints, which end by returning */
801    case '^':
802	ARCV('^', 1);
803	if (v->cflags&REG_NLANCH) {
804	    ARCV(BEHIND, v->nlcolor);
805	}
806	NEXT();
807	return;
808    case '$':
809	ARCV('$', 1);
810	if (v->cflags&REG_NLANCH) {
811	    ARCV(AHEAD, v->nlcolor);
812	}
813	NEXT();
814	return;
815    case SBEGIN:
816	ARCV('^', 1);		/* BOL */
817	ARCV('^', 0);		/* or BOS */
818	NEXT();
819	return;
820    case SEND:
821	ARCV('$', 1);		/* EOL */
822	ARCV('$', 0);		/* or EOS */
823	NEXT();
824	return;
825    case '<':
826	wordchrs(v);		/* does NEXT() */
827	s = newstate(v->nfa);
828	NOERR();
829	nonword(v, BEHIND, lp, s);
830	word(v, AHEAD, s, rp);
831	return;
832    case '>':
833	wordchrs(v);		/* does NEXT() */
834	s = newstate(v->nfa);
835	NOERR();
836	word(v, BEHIND, lp, s);
837	nonword(v, AHEAD, s, rp);
838	return;
839    case WBDRY:
840	wordchrs(v);		/* does NEXT() */
841	s = newstate(v->nfa);
842	NOERR();
843	nonword(v, BEHIND, lp, s);
844	word(v, AHEAD, s, rp);
845	s = newstate(v->nfa);
846	NOERR();
847	word(v, BEHIND, lp, s);
848	nonword(v, AHEAD, s, rp);
849	return;
850    case NWBDRY:
851	wordchrs(v);		/* does NEXT() */
852	s = newstate(v->nfa);
853	NOERR();
854	word(v, BEHIND, lp, s);
855	word(v, AHEAD, s, rp);
856	s = newstate(v->nfa);
857	NOERR();
858	nonword(v, BEHIND, lp, s);
859	nonword(v, AHEAD, s, rp);
860	return;
861    case LACON:			/* lookahead constraint */
862	pos = v->nextvalue;
863	NEXT();
864	s = newstate(v->nfa);
865	s2 = newstate(v->nfa);
866	NOERR();
867	t = parse(v, ')', LACON, s, s2);
868	freesubre(v, t);	/* internal structure irrelevant */
869	assert(SEE(')') || ISERR());
870	NEXT();
871	n = newlacon(v, s, s2, pos);
872	NOERR();
873	ARCV(LACON, n);
874	return;
875
876	/*
877	 * Then errors, to get them out of the way.
878	 */
879
880    case '*':
881    case '+':
882    case '?':
883    case '{':
884	ERR(REG_BADRPT);
885	return;
886    default:
887	ERR(REG_ASSERT);
888	return;
889
890	/*
891	 * Then plain characters, and minor variants on that theme.
892	 */
893
894    case ')':			/* unbalanced paren */
895	if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
896	    ERR(REG_EPAREN);
897	    return;
898	}
899
900	/*
901	 * Legal in EREs due to specification botch.
902	 */
903
904	NOTE(REG_UPBOTCH);
905	/* fallthrough into case PLAIN */
906    case PLAIN:
907	onechr(v, v->nextvalue, lp, rp);
908	okcolors(v->nfa, v->cm);
909	NOERR();
910	NEXT();
911	break;
912    case '[':
913	if (v->nextvalue == 1) {
914	    bracket(v, lp, rp);
915	} else {
916	    cbracket(v, lp, rp);
917	}
918	assert(SEE(']') || ISERR());
919	NEXT();
920	break;
921    case '.':
922	rainbow(v->nfa, v->cm, PLAIN,
923		(v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp);
924	NEXT();
925	break;
926
927	/*
928	 * And finally the ugly stuff.
929	 */
930
931    case '(':			/* value flags as capturing or non */
932	cap = (type == LACON) ? 0 : v->nextvalue;
933	if (cap) {
934	    v->nsubexp++;
935	    subno = v->nsubexp;
936	    if ((size_t)subno >= v->nsubs) {
937		moresubs(v, subno);
938	    }
939	    assert((size_t)subno < v->nsubs);
940	} else {
941	    atomtype = PLAIN;	/* something that's not '(' */
942	}
943	NEXT();
944
945	/*
946	 * Need new endpoints because tree will contain pointers.
947	 */
948
949	s = newstate(v->nfa);
950	s2 = newstate(v->nfa);
951	NOERR();
952	EMPTYARC(lp, s);
953	EMPTYARC(s2, rp);
954	NOERR();
955	atom = parse(v, ')', PLAIN, s, s2);
956	assert(SEE(')') || ISERR());
957	NEXT();
958	NOERR();
959	if (cap) {
960	    v->subs[subno] = atom;
961	    t = subre(v, '(', atom->flags|CAP, lp, rp);
962	    NOERR();
963	    t->subno = subno;
964	    t->left = atom;
965	    atom = t;
966	}
967
968	/*
969	 * Postpone everything else pending possible {0}.
970	 */
971
972	break;
973    case BACKREF:		/* the Feature From The Black Lagoon */
974	INSIST(type != LACON, REG_ESUBREG);
975	INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
976	INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
977	NOERR();
978	assert(v->nextvalue > 0);
979	atom = subre(v, 'b', BACKR, lp, rp);
980	subno = v->nextvalue;
981	atom->subno = subno;
982	EMPTYARC(lp, rp);	/* temporarily, so there's something */
983	NEXT();
984	break;
985    }
986
987    /*
988     * ...and an atom may be followed by a quantifier.
989     */
990
991    switch (v->nexttype) {
992    case '*':
993	m = 0;
994	n = INFINITY;
995	qprefer = (v->nextvalue) ? LONGER : SHORTER;
996	NEXT();
997	break;
998    case '+':
999	m = 1;
1000	n = INFINITY;
1001	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1002	NEXT();
1003	break;
1004    case '?':
1005	m = 0;
1006	n = 1;
1007	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1008	NEXT();
1009	break;
1010    case '{':
1011	NEXT();
1012	m = scannum(v);
1013	if (EAT(',')) {
1014	    if (SEE(DIGIT)) {
1015		n = scannum(v);
1016	    } else {
1017		n = INFINITY;
1018	    }
1019	    if (m > n) {
1020		ERR(REG_BADBR);
1021		return;
1022	    }
1023
1024	    /*
1025	     * {m,n} exercises preference, even if it's {m,m}
1026	     */
1027
1028	    qprefer = (v->nextvalue) ? LONGER : SHORTER;
1029	} else {
1030	    n = m;
1031	    /*
1032	     * {m} passes operand's preference through.
1033	     */
1034
1035	    qprefer = 0;
1036	}
1037	if (!SEE('}')) {	/* catches errors too */
1038	    ERR(REG_BADBR);
1039	    return;
1040	}
1041	NEXT();
1042	break;
1043    default:			/* no quantifier */
1044	m = n = 1;
1045	qprefer = 0;
1046	break;
1047    }
1048
1049    /*
1050     * Annoying special case: {0} or {0,0} cancels everything.
1051     */
1052
1053    if (m == 0 && n == 0) {
1054	if (atom != NULL) {
1055	    freesubre(v, atom);
1056	}
1057	if (atomtype == '(') {
1058	    v->subs[subno] = NULL;
1059	}
1060	delsub(v->nfa, lp, rp);
1061	EMPTYARC(lp, rp);
1062	return;
1063    }
1064
1065    /*
1066     * If not a messy case, avoid hard part.
1067     */
1068
1069    assert(!MESSY(top->flags));
1070    f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1071    if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
1072	if (!(m == 1 && n == 1)) {
1073	    repeat(v, lp, rp, m, n);
1074	}
1075	if (atom != NULL) {
1076	    freesubre(v, atom);
1077	}
1078	top->flags = f;
1079	return;
1080    }
1081
1082    /*
1083     * hard part: something messy
1084     * That is, capturing parens, back reference, short/long clash, or an atom
1085     * with substructure containing one of those.
1086     */
1087
1088    /*
1089     * Now we'll need a subre for the contents even if they're boring.
1090     */
1091
1092    if (atom == NULL) {
1093	atom = subre(v, '=', 0, lp, rp);
1094	NOERR();
1095    }
1096
1097    /*
1098     * Prepare a general-purpose state skeleton.
1099     *
1100     *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1101     *   /                                            /
1102     * [lp] ----> [s2] ----bypass---------------------
1103     *
1104     * where bypass is an empty, and prefix is some repetitions of atom
1105     */
1106
1107    s = newstate(v->nfa);	/* first, new endpoints for the atom */
1108    s2 = newstate(v->nfa);
1109    NOERR();
1110    moveouts(v->nfa, lp, s);
1111    moveins(v->nfa, rp, s2);
1112    NOERR();
1113    atom->begin = s;
1114    atom->end = s2;
1115    s = newstate(v->nfa);	/* and spots for prefix and bypass */
1116    s2 = newstate(v->nfa);
1117    NOERR();
1118    EMPTYARC(lp, s);
1119    EMPTYARC(lp, s2);
1120    NOERR();
1121
1122    /*
1123     * Break remaining subRE into x{...} and what follows.
1124     */
1125
1126    t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1127    t->left = atom;
1128    atomp = &t->left;
1129
1130    /*
1131     * Here we should recurse... but we must postpone that to the end.
1132     */
1133
1134    /*
1135     * Split top into prefix and remaining.
1136     */
1137
1138    assert(top->op == '=' && top->left == NULL && top->right == NULL);
1139    top->left = subre(v, '=', top->flags, top->begin, lp);
1140    top->op = '.';
1141    top->right = t;
1142
1143    /*
1144     * If it's a backref, now is the time to replicate the subNFA.
1145     */
1146
1147    if (atomtype == BACKREF) {
1148	assert(atom->begin->nouts == 1);	/* just the EMPTY */
1149	delsub(v->nfa, atom->begin, atom->end);
1150	assert(v->subs[subno] != NULL);
1151
1152	/*
1153	 * And here's why the recursion got postponed: it must wait until the
1154	 * skeleton is filled in, because it may hit a backref that wants to
1155	 * copy the filled-in skeleton.
1156	 */
1157
1158	dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1159		atom->begin, atom->end);
1160	NOERR();
1161    }
1162
1163    /*
1164     * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
1165     */
1166
1167    if (m == 0) {
1168	EMPTYARC(s2, atom->end);/* the bypass */
1169	assert(PREF(qprefer) != 0);
1170	f = COMBINE(qprefer, atom->flags);
1171	t = subre(v, '|', f, lp, atom->end);
1172	NOERR();
1173	t->left = atom;
1174	t->right = subre(v, '|', PREF(f), s2, atom->end);
1175	NOERR();
1176	t->right->left = subre(v, '=', 0, s2, atom->end);
1177	NOERR();
1178	*atomp = t;
1179	atomp = &t->left;
1180	m = 1;
1181    }
1182
1183    /*
1184     * Deal with the rest of the quantifier.
1185     */
1186
1187    if (atomtype == BACKREF) {
1188	/*
1189	 * Special case: backrefs have internal quantifiers.
1190	 */
1191
1192	EMPTYARC(s, atom->begin);	/* empty prefix */
1193
1194	/*
1195	 * Just stuff everything into atom.
1196	 */
1197
1198	repeat(v, atom->begin, atom->end, m, n);
1199	atom->min = (short) m;
1200	atom->max = (short) n;
1201	atom->flags |= COMBINE(qprefer, atom->flags);
1202    } else if (m == 1 && n == 1) {
1203	/*
1204	 * No/vacuous quantifier: done.
1205	 */
1206
1207	EMPTYARC(s, atom->begin);	/* empty prefix */
1208    } else {
1209	/*
1210	 * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
1211	 * x
1212	 */
1213
1214	dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1215	assert(m >= 1 && m != INFINITY && n >= 1);
1216	repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
1217	f = COMBINE(qprefer, atom->flags);
1218	t = subre(v, '.', f, s, atom->end);	/* prefix and atom */
1219	NOERR();
1220	t->left = subre(v, '=', PREF(f), s, atom->begin);
1221	NOERR();
1222	t->right = atom;
1223	*atomp = t;
1224    }
1225
1226    /*
1227     * And finally, look after that postponed recursion.
1228     */
1229
1230    t = top->right;
1231    if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
1232	t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1233    } else {
1234	EMPTYARC(atom->end, rp);
1235	t->right = subre(v, '=', 0, atom->end, rp);
1236    }
1237    assert(SEE('|') || SEE(stopper) || SEE(EOS));
1238    t->flags |= COMBINE(t->flags, t->right->flags);
1239    top->flags |= COMBINE(top->flags, t->flags);
1240}
1241
1242/*
1243 - nonword - generate arcs for non-word-character ahead or behind
1244 ^ static void nonword(struct vars *, int, struct state *, struct state *);
1245 */
1246static void
1247nonword(
1248    struct vars *v,
1249    int dir,			/* AHEAD or BEHIND */
1250    struct state *lp,
1251    struct state *rp)
1252{
1253    int anchor = (dir == AHEAD) ? '$' : '^';
1254
1255    assert(dir == AHEAD || dir == BEHIND);
1256    newarc(v->nfa, anchor, 1, lp, rp);
1257    newarc(v->nfa, anchor, 0, lp, rp);
1258    colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1259    /* (no need for special attention to \n) */
1260}
1261
1262/*
1263 - word - generate arcs for word character ahead or behind
1264 ^ static void word(struct vars *, int, struct state *, struct state *);
1265 */
1266static void
1267word(
1268    struct vars *v,
1269    int dir,			/* AHEAD or BEHIND */
1270    struct state *lp,
1271    struct state *rp)
1272{
1273    assert(dir == AHEAD || dir == BEHIND);
1274    cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1275    /* (no need for special attention to \n) */
1276}
1277
1278/*
1279 - scannum - scan a number
1280 ^ static int scannum(struct vars *);
1281 */
1282static int			/* value, <= DUPMAX */
1283scannum(
1284    struct vars *v)
1285{
1286    int n = 0;
1287
1288    while (SEE(DIGIT) && n < DUPMAX) {
1289	n = n*10 + v->nextvalue;
1290	NEXT();
1291    }
1292    if (SEE(DIGIT) || n > DUPMAX) {
1293	ERR(REG_BADBR);
1294	return 0;
1295    }
1296    return n;
1297}
1298
1299/*
1300 - repeat - replicate subNFA for quantifiers
1301 * The duplication sequences used here are chosen carefully so that any
1302 * pointers starting out pointing into the subexpression end up pointing into
1303 * the last occurrence. (Note that it may not be strung between the same left
1304 * and right end states, however!) This used to be important for the subRE
1305 * tree, although the important bits are now handled by the in-line code in
1306 * parse(), and when this is called, it doesn't matter any more.
1307 ^ static void repeat(struct vars *, struct state *, struct state *, int, int);
1308 */
1309static void
1310repeat(
1311    struct vars *v,
1312    struct state *lp,
1313    struct state *rp,
1314    int m,
1315    int n)
1316{
1317#define	SOME		2
1318#define	INF		3
1319#define	PAIR(x, y)	((x)*4 + (y))
1320#define	REDUCE(x)	( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1321    const int rm = REDUCE(m);
1322    const int rn = REDUCE(n);
1323    struct state *s, *s2;
1324
1325    switch (PAIR(rm, rn)) {
1326    case PAIR(0, 0):		/* empty string */
1327	delsub(v->nfa, lp, rp);
1328	EMPTYARC(lp, rp);
1329	break;
1330    case PAIR(0, 1):		/* do as x| */
1331	EMPTYARC(lp, rp);
1332	break;
1333    case PAIR(0, SOME):		/* do as x{1,n}| */
1334	repeat(v, lp, rp, 1, n);
1335	NOERR();
1336	EMPTYARC(lp, rp);
1337	break;
1338    case PAIR(0, INF):		/* loop x around */
1339	s = newstate(v->nfa);
1340	NOERR();
1341	moveouts(v->nfa, lp, s);
1342	moveins(v->nfa, rp, s);
1343	EMPTYARC(lp, s);
1344	EMPTYARC(s, rp);
1345	break;
1346    case PAIR(1, 1):		/* no action required */
1347	break;
1348    case PAIR(1, SOME):		/* do as x{0,n-1}x = (x{1,n-1}|)x */
1349	s = newstate(v->nfa);
1350	NOERR();
1351	moveouts(v->nfa, lp, s);
1352	dupnfa(v->nfa, s, rp, lp, s);
1353	NOERR();
1354	repeat(v, lp, s, 1, n-1);
1355	NOERR();
1356	EMPTYARC(lp, s);
1357	break;
1358    case PAIR(1, INF):		/* add loopback arc */
1359	s = newstate(v->nfa);
1360	s2 = newstate(v->nfa);
1361	NOERR();
1362	moveouts(v->nfa, lp, s);
1363	moveins(v->nfa, rp, s2);
1364	EMPTYARC(lp, s);
1365	EMPTYARC(s2, rp);
1366	EMPTYARC(s2, s);
1367	break;
1368    case PAIR(SOME, SOME):		/* do as x{m-1,n-1}x */
1369	s = newstate(v->nfa);
1370	NOERR();
1371	moveouts(v->nfa, lp, s);
1372	dupnfa(v->nfa, s, rp, lp, s);
1373	NOERR();
1374	repeat(v, lp, s, m-1, n-1);
1375	break;
1376    case PAIR(SOME, INF):		/* do as x{m-1,}x */
1377	s = newstate(v->nfa);
1378	NOERR();
1379	moveouts(v->nfa, lp, s);
1380	dupnfa(v->nfa, s, rp, lp, s);
1381	NOERR();
1382	repeat(v, lp, s, m-1, n);
1383	break;
1384    default:
1385	ERR(REG_ASSERT);
1386	break;
1387    }
1388}
1389
1390/*
1391 - bracket - handle non-complemented bracket expression
1392 * Also called from cbracket for complemented bracket expressions.
1393 ^ static void bracket(struct vars *, struct state *, struct state *);
1394 */
1395static void
1396bracket(
1397    struct vars *v,
1398    struct state *lp,
1399    struct state *rp)
1400{
1401    assert(SEE('['));
1402    NEXT();
1403    while (!SEE(']') && !SEE(EOS)) {
1404	brackpart(v, lp, rp);
1405    }
1406    assert(SEE(']') || ISERR());
1407    okcolors(v->nfa, v->cm);
1408}
1409
1410/*
1411 - cbracket - handle complemented bracket expression
1412 * We do it by calling bracket() with dummy endpoints, and then complementing
1413 * the result. The alternative would be to invoke rainbow(), and then delete
1414 * arcs as the b.e. is seen... but that gets messy.
1415 ^ static void cbracket(struct vars *, struct state *, struct state *);
1416 */
1417static void
1418cbracket(
1419    struct vars *v,
1420    struct state *lp,
1421    struct state *rp)
1422{
1423    struct state *left = newstate(v->nfa);
1424    struct state *right = newstate(v->nfa);
1425
1426    NOERR();
1427    bracket(v, left, right);
1428    if (v->cflags&REG_NLSTOP) {
1429	newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1430    }
1431    NOERR();
1432
1433    assert(lp->nouts == 0);	/* all outarcs will be ours */
1434
1435    /*
1436     * Easy part of complementing, and all there is to do since the MCCE code
1437     * was removed.
1438     */
1439
1440    colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1441    NOERR();
1442    dropstate(v->nfa, left);
1443    assert(right->nins == 0);
1444    freestate(v->nfa, right);
1445    return;
1446}
1447
1448/*
1449 - brackpart - handle one item (or range) within a bracket expression
1450 ^ static void brackpart(struct vars *, struct state *, struct state *);
1451 */
1452static void
1453brackpart(
1454    struct vars *v,
1455    struct state *lp,
1456    struct state *rp)
1457{
1458    celt startc, endc;
1459    struct cvec *cv;
1460    const chr *startp, *endp;
1461    chr c[1];
1462
1463    /*
1464     * Parse something, get rid of special cases, take shortcuts.
1465     */
1466
1467    switch (v->nexttype) {
1468    case RANGE:			/* a-b-c or other botch */
1469	ERR(REG_ERANGE);
1470	return;
1471	break;
1472    case PLAIN:
1473	c[0] = v->nextvalue;
1474	NEXT();
1475
1476	/*
1477	 * Shortcut for ordinary chr (not range).
1478	 */
1479
1480	if (!SEE(RANGE)) {
1481	    onechr(v, c[0], lp, rp);
1482	    return;
1483	}
1484	startc = element(v, c, c+1);
1485	NOERR();
1486	break;
1487    case COLLEL:
1488	startp = v->now;
1489	endp = scanplain(v);
1490	INSIST(startp < endp, REG_ECOLLATE);
1491	NOERR();
1492	startc = element(v, startp, endp);
1493	NOERR();
1494	break;
1495    case ECLASS:
1496	startp = v->now;
1497	endp = scanplain(v);
1498	INSIST(startp < endp, REG_ECOLLATE);
1499	NOERR();
1500	startc = element(v, startp, endp);
1501	NOERR();
1502	cv = eclass(v, startc, (v->cflags&REG_ICASE));
1503	NOERR();
1504	dovec(v, cv, lp, rp);
1505	return;
1506	break;
1507    case CCLASS:
1508	startp = v->now;
1509	endp = scanplain(v);
1510	INSIST(startp < endp, REG_ECTYPE);
1511	NOERR();
1512	cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
1513	NOERR();
1514	dovec(v, cv, lp, rp);
1515	return;
1516	break;
1517    default:
1518	ERR(REG_ASSERT);
1519	return;
1520	break;
1521    }
1522
1523    if (SEE(RANGE)) {
1524	NEXT();
1525	switch (v->nexttype) {
1526	case PLAIN:
1527	case RANGE:
1528	    c[0] = v->nextvalue;
1529	    NEXT();
1530	    endc = element(v, c, c+1);
1531	    NOERR();
1532	    break;
1533	case COLLEL:
1534	    startp = v->now;
1535	    endp = scanplain(v);
1536	    INSIST(startp < endp, REG_ECOLLATE);
1537	    NOERR();
1538	    endc = element(v, startp, endp);
1539	    NOERR();
1540	    break;
1541	default:
1542	    ERR(REG_ERANGE);
1543	    return;
1544	    break;
1545	}
1546    } else {
1547	endc = startc;
1548    }
1549
1550    /*
1551     * Ranges are unportable. Actually, standard C does guarantee that digits
1552     * are contiguous, but making that an exception is just too complicated.
1553     */
1554
1555    if (startc != endc) {
1556	NOTE(REG_UUNPORT);
1557    }
1558    cv = range(v, startc, endc, (v->cflags&REG_ICASE));
1559    NOERR();
1560    dovec(v, cv, lp, rp);
1561}
1562
1563/*
1564 - scanplain - scan PLAIN contents of [. etc.
1565 * Certain bits of trickery in lex.c know that this code does not try to look
1566 * past the final bracket of the [. etc.
1567 ^ static const chr *scanplain(struct vars *);
1568 */
1569static const chr *		/* just after end of sequence */
1570scanplain(
1571    struct vars *v)
1572{
1573    const chr *endp;
1574
1575    assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1576    NEXT();
1577
1578    endp = v->now;
1579    while (SEE(PLAIN)) {
1580	endp = v->now;
1581	NEXT();
1582    }
1583
1584    assert(SEE(END) || ISERR());
1585    NEXT();
1586
1587    return endp;
1588}
1589
1590/*
1591 - onechr - fill in arcs for a plain character, and possible case complements
1592 * This is mostly a shortcut for efficient handling of the common case.
1593 ^ static void onechr(struct vars *, pchr, struct state *, struct state *);
1594 */
1595static void
1596onechr(
1597    struct vars *v,
1598    pchr c,
1599    struct state *lp,
1600    struct state *rp)
1601{
1602    if (!(v->cflags&REG_ICASE)) {
1603	newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1604	return;
1605    }
1606
1607    /*
1608     * Rats, need general case anyway...
1609     */
1610
1611    dovec(v, allcases(v, c), lp, rp);
1612}
1613
1614/*
1615 - dovec - fill in arcs for each element of a cvec
1616 ^ static void dovec(struct vars *, struct cvec *, struct state *,
1617 ^ 	struct state *);
1618 */
1619static void
1620dovec(
1621    struct vars *v,
1622    struct cvec *cv,
1623    struct state *lp,
1624    struct state *rp)
1625{
1626    chr ch, from, to;
1627    const chr *p;
1628    int i;
1629
1630    for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
1631	ch = *p;
1632	newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1633    }
1634
1635    for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
1636	from = *p;
1637	to = *(p+1);
1638	if (from <= to) {
1639	    subrange(v, from, to, lp, rp);
1640	}
1641    }
1642
1643}
1644
1645/*
1646 - wordchrs - set up word-chr list for word-boundary stuff, if needed
1647 * The list is kept as a bunch of arcs between two dummy states; it's disposed
1648 * of by the unreachable-states sweep in NFA optimization. Does NEXT(). Must
1649 * not be called from any unusual lexical context. This should be reconciled
1650 * with the \w etc. handling in lex.c, and should be cleaned up to reduce
1651 * dependencies on input scanning.
1652 ^ static void wordchrs(struct vars *);
1653 */
1654static void
1655wordchrs(
1656    struct vars *v)
1657{
1658    struct state *left, *right;
1659
1660    if (v->wordchrs != NULL) {
1661	NEXT();		/* for consistency */
1662	return;
1663    }
1664
1665    left = newstate(v->nfa);
1666    right = newstate(v->nfa);
1667    NOERR();
1668
1669    /*
1670     * Fine point: implemented with [::], and lexer will set REG_ULOCALE.
1671     */
1672
1673    lexword(v);
1674    NEXT();
1675    assert(v->savenow != NULL && SEE('['));
1676    bracket(v, left, right);
1677    assert((v->savenow != NULL && SEE(']')) || ISERR());
1678    NEXT();
1679    NOERR();
1680    v->wordchrs = left;
1681}
1682
1683/*
1684 - subre - allocate a subre
1685 ^ static struct subre *subre(struct vars *, int, int, struct state *,
1686 ^	struct state *);
1687 */
1688static struct subre *
1689subre(
1690    struct vars *v,
1691    int op,
1692    int flags,
1693    struct state *begin,
1694    struct state *end)
1695{
1696    struct subre *ret = v->treefree;
1697
1698    if (ret != NULL) {
1699	v->treefree = ret->left;
1700    } else {
1701	ret = (struct subre *) MALLOC(sizeof(struct subre));
1702	if (ret == NULL) {
1703	    ERR(REG_ESPACE);
1704	    return NULL;
1705	}
1706	ret->chain = v->treechain;
1707	v->treechain = ret;
1708    }
1709
1710    assert(strchr("|.b(=", op) != NULL);
1711
1712    ret->op = op;
1713    ret->flags = flags;
1714    ret->retry = 0;
1715    ret->subno = 0;
1716    ret->min = ret->max = 1;
1717    ret->left = NULL;
1718    ret->right = NULL;
1719    ret->begin = begin;
1720    ret->end = end;
1721    ZAPCNFA(ret->cnfa);
1722
1723    return ret;
1724}
1725
1726/*
1727 - freesubre - free a subRE subtree
1728 ^ static void freesubre(struct vars *, struct subre *);
1729 */
1730static void
1731freesubre(
1732    struct vars *v,		/* might be NULL */
1733    struct subre *sr)
1734{
1735    if (sr == NULL) {
1736	return;
1737    }
1738
1739    if (sr->left != NULL) {
1740	freesubre(v, sr->left);
1741    }
1742    if (sr->right != NULL) {
1743	freesubre(v, sr->right);
1744    }
1745
1746    freesrnode(v, sr);
1747}
1748
1749/*
1750 - freesrnode - free one node in a subRE subtree
1751 ^ static void freesrnode(struct vars *, struct subre *);
1752 */
1753static void
1754freesrnode(
1755    struct vars *v,		/* might be NULL */
1756    struct subre *sr)
1757{
1758    if (sr == NULL) {
1759	return;
1760    }
1761
1762    if (!NULLCNFA(sr->cnfa)) {
1763	freecnfa(&sr->cnfa);
1764    }
1765    sr->flags = 0;
1766
1767    if (v != NULL) {
1768	sr->left = v->treefree;
1769	v->treefree = sr;
1770    } else {
1771	FREE(sr);
1772    }
1773}
1774
1775/*
1776 - optst - optimize a subRE subtree
1777 ^ static void optst(struct vars *, struct subre *);
1778 */
1779static void
1780optst(
1781    struct vars *v,
1782    struct subre *t)
1783{
1784    /*
1785     * DGP (2007-11-13): I assume it was the programmer's intent to eventually
1786     * come back and add code to optimize subRE trees, but the routine coded
1787     * just spends effort traversing the tree and doing nothing. We can do
1788     * nothing with less effort.
1789     */
1790
1791    return;
1792}
1793
1794/*
1795 - numst - number tree nodes (assigning retry indexes)
1796 ^ static int numst(struct subre *, int);
1797 */
1798static int			/* next number */
1799numst(
1800    struct subre *t,
1801    int start)			/* starting point for subtree numbers */
1802{
1803    int i;
1804
1805    assert(t != NULL);
1806
1807    i = start;
1808    t->retry = (short) i++;
1809    if (t->left != NULL) {
1810	i = numst(t->left, i);
1811    }
1812    if (t->right != NULL) {
1813	i = numst(t->right, i);
1814    }
1815    return i;
1816}
1817
1818/*
1819 - markst - mark tree nodes as INUSE
1820 ^ static void markst(struct subre *);
1821 */
1822static void
1823markst(
1824    struct subre *t)
1825{
1826    assert(t != NULL);
1827
1828    t->flags |= INUSE;
1829    if (t->left != NULL) {
1830	markst(t->left);
1831    }
1832    if (t->right != NULL) {
1833	markst(t->right);
1834    }
1835}
1836
1837/*
1838 - cleanst - free any tree nodes not marked INUSE
1839 ^ static void cleanst(struct vars *);
1840 */
1841static void
1842cleanst(
1843    struct vars *v)
1844{
1845    struct subre *t;
1846    struct subre *next;
1847
1848    for (t = v->treechain; t != NULL; t = next) {
1849	next = t->chain;
1850	if (!(t->flags&INUSE)) {
1851	    FREE(t);
1852	}
1853    }
1854    v->treechain = NULL;
1855    v->treefree = NULL;		/* just on general principles */
1856}
1857
1858/*
1859 - nfatree - turn a subRE subtree into a tree of compacted NFAs
1860 ^ static long nfatree(struct vars *, struct subre *, FILE *);
1861 */
1862static long			/* optimize results from top node */
1863nfatree(
1864    struct vars *v,
1865    struct subre *t,
1866    FILE *f)			/* for debug output */
1867{
1868    assert(t != NULL && t->begin != NULL);
1869
1870    if (t->left != NULL) {
1871	(DISCARD) nfatree(v, t->left, f);
1872    }
1873    if (t->right != NULL) {
1874	(DISCARD) nfatree(v, t->right, f);
1875    }
1876
1877    return nfanode(v, t, f);
1878}
1879
1880/*
1881 - nfanode - do one NFA for nfatree
1882 ^ static long nfanode(struct vars *, struct subre *, FILE *);
1883 */
1884static long			/* optimize results */
1885nfanode(
1886    struct vars *v,
1887    struct subre *t,
1888    FILE *f)			/* for debug output */
1889{
1890    struct nfa *nfa;
1891    long ret = 0;
1892    char idbuf[50];
1893
1894    assert(t->begin != NULL);
1895
1896    if (f != NULL) {
1897	fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1898		stid(t, idbuf, sizeof(idbuf)));
1899    }
1900    nfa = newnfa(v, v->cm, v->nfa);
1901    NOERRZ();
1902    dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1903    if (!ISERR()) {
1904	specialcolors(nfa);
1905	ret = optimize(nfa, f);
1906    }
1907    if (!ISERR()) {
1908	compact(nfa, &t->cnfa);
1909    }
1910
1911    freenfa(nfa);
1912    return ret;
1913}
1914
1915/*
1916 - newlacon - allocate a lookahead-constraint subRE
1917 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
1918 */
1919static int			/* lacon number */
1920newlacon(
1921    struct vars *v,
1922    struct state *begin,
1923    struct state *end,
1924    int pos)
1925{
1926    struct subre *sub;
1927    int n;
1928
1929    if (v->nlacons == 0) {
1930	v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
1931	n = 1;		/* skip 0th */
1932	v->nlacons = 2;
1933    } else {
1934	v->lacons = (struct subre *) REALLOC(v->lacons,
1935		(v->nlacons+1)*sizeof(struct subre));
1936	n = v->nlacons++;
1937    }
1938
1939    if (v->lacons == NULL) {
1940	ERR(REG_ESPACE);
1941	return 0;
1942    }
1943
1944    sub = &v->lacons[n];
1945    sub->begin = begin;
1946    sub->end = end;
1947    sub->subno = pos;
1948    ZAPCNFA(sub->cnfa);
1949    return n;
1950}
1951
1952/*
1953 - freelacons - free lookahead-constraint subRE vector
1954 ^ static void freelacons(struct subre *, int);
1955 */
1956static void
1957freelacons(
1958    struct subre *subs,
1959    int n)
1960{
1961    struct subre *sub;
1962    int i;
1963
1964    assert(n > 0);
1965    for (sub=subs+1, i=n-1; i>0; sub++, i--) {	/* no 0th */
1966	if (!NULLCNFA(sub->cnfa)) {
1967	    freecnfa(&sub->cnfa);
1968	}
1969    }
1970    FREE(subs);
1971}
1972
1973/*
1974 - rfree - free a whole RE (insides of regfree)
1975 ^ static void rfree(regex_t *);
1976 */
1977static void
1978rfree(
1979    regex_t *re)
1980{
1981    struct guts *g;
1982
1983    if (re == NULL || re->re_magic != REMAGIC) {
1984	return;
1985    }
1986
1987    re->re_magic = 0;	/* invalidate RE */
1988    g = (struct guts *) re->re_guts;
1989    re->re_guts = NULL;
1990    re->re_fns = NULL;
1991    g->magic = 0;
1992    freecm(&g->cmap);
1993    if (g->tree != NULL) {
1994	freesubre(NULL, g->tree);
1995    }
1996    if (g->lacons != NULL) {
1997	freelacons(g->lacons, g->nlacons);
1998    }
1999    if (!NULLCNFA(g->search)) {
2000	freecnfa(&g->search);
2001    }
2002    FREE(g);
2003}
2004
2005/*
2006 - dump - dump an RE in human-readable form
2007 ^ static void dump(regex_t *, FILE *);
2008 */
2009static void
2010dump(
2011    regex_t *re,
2012    FILE *f)
2013{
2014#ifdef REG_DEBUG
2015    struct guts *g;
2016    int i;
2017
2018    if (re->re_magic != REMAGIC) {
2019	fprintf(f, "bad magic number (0x%x not 0x%x)\n",
2020		re->re_magic, REMAGIC);
2021    }
2022    if (re->re_guts == NULL) {
2023	fprintf(f, "NULL guts!!!\n");
2024	return;
2025    }
2026    g = (struct guts *) re->re_guts;
2027    if (g->magic != GUTSMAGIC) {
2028	fprintf(f, "bad guts magic number (0x%x not 0x%x)\n",
2029		g->magic, GUTSMAGIC);
2030    }
2031
2032    fprintf(f, "\n\n\n========= DUMP ==========\n");
2033    fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2034	    re->re_nsub, re->re_info, re->re_csize, g->ntree);
2035
2036    dumpcolors(&g->cmap, f);
2037    if (!NULLCNFA(g->search)) {
2038	printf("\nsearch:\n");
2039	dumpcnfa(&g->search, f);
2040    }
2041    for (i = 1; i < g->nlacons; i++) {
2042	fprintf(f, "\nla%d (%s):\n", i,
2043		(g->lacons[i].subno) ? "positive" : "negative");
2044	dumpcnfa(&g->lacons[i].cnfa, f);
2045    }
2046    fprintf(f, "\n");
2047    dumpst(g->tree, f, 0);
2048#endif
2049}
2050
2051/*
2052 - dumpst - dump a subRE tree
2053 ^ static void dumpst(struct subre *, FILE *, int);
2054 */
2055static void
2056dumpst(
2057    struct subre *t,
2058    FILE *f,
2059    int nfapresent)		/* is the original NFA still around? */
2060{
2061    if (t == NULL) {
2062	fprintf(f, "null tree\n");
2063    } else {
2064	stdump(t, f, nfapresent);
2065    }
2066    fflush(f);
2067}
2068
2069/*
2070 - stdump - recursive guts of dumpst
2071 ^ static void stdump(struct subre *, FILE *, int);
2072 */
2073static void
2074stdump(
2075    struct subre *t,
2076    FILE *f,
2077    int nfapresent)		/* is the original NFA still around? */
2078{
2079    char idbuf[50];
2080
2081    fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2082    if (t->flags&LONGER) {
2083	fprintf(f, " longest");
2084    }
2085    if (t->flags&SHORTER) {
2086	fprintf(f, " shortest");
2087    }
2088    if (t->flags&MIXED) {
2089	fprintf(f, " hasmixed");
2090    }
2091    if (t->flags&CAP) {
2092	fprintf(f, " hascapture");
2093    }
2094    if (t->flags&BACKR) {
2095	fprintf(f, " hasbackref");
2096    }
2097    if (!(t->flags&INUSE)) {
2098	fprintf(f, " UNUSED");
2099    }
2100    if (t->subno != 0) {
2101	fprintf(f, " (#%d)", t->subno);
2102    }
2103    if (t->min != 1 || t->max != 1) {
2104	fprintf(f, " {%d,", t->min);
2105	if (t->max != INFINITY) {
2106	    fprintf(f, "%d", t->max);
2107	}
2108	fprintf(f, "}");
2109    }
2110    if (nfapresent) {
2111	fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
2112    }
2113    if (t->left != NULL) {
2114	fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2115    }
2116    if (t->right != NULL) {
2117	fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2118    }
2119    if (!NULLCNFA(t->cnfa)) {
2120	fprintf(f, "\n");
2121	dumpcnfa(&t->cnfa, f);
2122    }
2123    fprintf(f, "\n");
2124    if (t->left != NULL) {
2125	stdump(t->left, f, nfapresent);
2126    }
2127    if (t->right != NULL) {
2128	stdump(t->right, f, nfapresent);
2129    }
2130}
2131
2132/*
2133 - stid - identify a subtree node for dumping
2134 ^ static char *stid(struct subre *, char *, size_t);
2135 */
2136static const char *			/* points to buf or constant string */
2137stid(
2138    struct subre *t,
2139    char *buf,
2140    size_t bufsize)
2141{
2142    /*
2143     * Big enough for hex int or decimal t->retry?
2144     */
2145
2146    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
2147	return "unable";
2148    }
2149    if (t->retry != 0) {
2150	sprintf(buf, "%d", t->retry);
2151    } else {
2152	sprintf(buf, "%p", t);
2153    }
2154    return buf;
2155}
2156
2157#include "regc_lex.c"
2158#include "regc_color.c"
2159#include "regc_nfa.c"
2160#include "regc_cvec.c"
2161#include "regc_locale.c"
2162
2163/*
2164 * Local Variables:
2165 * mode: c
2166 * c-basic-offset: 4
2167 * fill-column: 78
2168 * End:
2169 */
2170