1/*
2 * re_*exec and friends - match REs
3 *
4 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
5 *
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results.  The author
9 * thanks all of them.
10 *
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
15 *
16 * I'd appreciate being given credit for this package in the documentation of
17 * software which uses it, but that is not a requirement.
18 *
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "regguts.h"
32
33/*
34 * Lazy-DFA representation.
35 */
36
37struct arcp {			/* "pointer" to an outarc */
38    struct sset *ss;
39    color co;
40};
41
42struct sset {			/* state set */
43    unsigned *states;		/* pointer to bitvector */
44    unsigned hash;		/* hash of bitvector */
45#define	HASH(bv, nw)	(((nw) == 1) ? *(bv) : hash(bv, nw))
46#define	HIT(h,bv,ss,nw)	((ss)->hash == (h) && ((nw) == 1 || \
47	memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
48    int flags;
49#define	STARTER		01	/* the initial state set */
50#define	POSTSTATE	02	/* includes the goal state */
51#define	LOCKED		04	/* locked in cache */
52#define	NOPROGRESS	010	/* zero-progress state set */
53    struct arcp ins;		/* chain of inarcs pointing here */
54    chr *lastseen;		/* last entered on arrival here */
55    struct sset **outs;		/* outarc vector indexed by color */
56    struct arcp *inchain;	/* chain-pointer vector for outarcs */
57};
58
59struct dfa {
60    int nssets;			/* size of cache */
61    int nssused;		/* how many entries occupied yet */
62    int nstates;		/* number of states */
63    int ncolors;		/* length of outarc and inchain vectors */
64    int wordsper;		/* length of state-set bitvectors */
65    struct sset *ssets;		/* state-set cache */
66    unsigned *statesarea;	/* bitvector storage */
67    unsigned *work;		/* pointer to work area within statesarea */
68    struct sset **outsarea;	/* outarc-vector storage */
69    struct arcp *incarea;	/* inchain storage */
70    struct cnfa *cnfa;
71    struct colormap *cm;
72    chr *lastpost;		/* location of last cache-flushed success */
73    chr *lastnopr;		/* location of last cache-flushed NOPROGRESS */
74    struct sset *search;	/* replacement-search-pointer memory */
75    int cptsmalloced;		/* were the areas individually malloced? */
76    char *mallocarea;		/* self, or master malloced area, or NULL */
77};
78
79#define	WORK	1		/* number of work bitvectors needed */
80
81/*
82 * Setup for non-malloc allocation for small cases.
83 */
84
85#define	FEWSTATES	20	/* must be less than UBITS */
86#define	FEWCOLORS	15
87struct smalldfa {
88    struct dfa dfa;
89    struct sset ssets[FEWSTATES*2];
90    unsigned statesarea[FEWSTATES*2 + WORK];
91    struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
92    struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
93};
94#define	DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */
95
96/*
97 * Internal variables, bundled for easy passing around.
98 */
99
100struct vars {
101    regex_t *re;
102    struct guts *g;
103    int eflags;			/* copies of arguments */
104    size_t nmatch;
105    regmatch_t *pmatch;
106    rm_detail_t *details;
107    chr *start;			/* start of string */
108    chr *stop;			/* just past end of string */
109    int err;			/* error code if any (0 none) */
110    regoff_t *mem;		/* memory vector for backtracking */
111    struct smalldfa dfa1;
112    struct smalldfa dfa2;
113};
114#define	VISERR(vv) ((vv)->err != 0)	/* have we seen an error yet? */
115#define	ISERR()	VISERR(v)
116#define	VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e)))
117#define	ERR(e)	VERR(v, e)	/* record an error */
118#define	NOERR()	{if (ISERR()) return v->err;}	/* if error seen, return it */
119#define	OFF(p)	((p) - v->start)
120#define	LOFF(p)	((long)OFF(p))
121
122/*
123 * forward declarations
124 */
125/* =====^!^===== begin forwards =====^!^===== */
126/* automatically gathered by fwd; do not hand-edit */
127/* === regexec.c === */
128int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
129static int find(struct vars *, struct cnfa *, struct colormap *);
130static int cfind(struct vars *, struct cnfa *, struct colormap *);
131static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
132static VOID zapsubs(regmatch_t *, size_t);
133static VOID zapmem(struct vars *, struct subre *);
134static VOID subset(struct vars *, struct subre *, chr *, chr *);
135static int dissect(struct vars *, struct subre *, chr *, chr *);
136static int condissect(struct vars *, struct subre *, chr *, chr *);
137static int altdissect(struct vars *, struct subre *, chr *, chr *);
138static int cdissect(struct vars *, struct subre *, chr *, chr *);
139static int ccondissect(struct vars *, struct subre *, chr *, chr *);
140static int crevdissect(struct vars *, struct subre *, chr *, chr *);
141static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
142static int caltdissect(struct vars *, struct subre *, chr *, chr *);
143/* === rege_dfa.c === */
144static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
145static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
146static chr *lastcold(struct vars *, struct dfa *);
147static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
148static VOID freedfa(struct dfa *);
149static unsigned hash(unsigned *, int);
150static struct sset *initialize(struct vars *, struct dfa *, chr *);
151static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
152static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
153static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
154static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
155/* automatically gathered by fwd; do not hand-edit */
156/* =====^!^===== end forwards =====^!^===== */
157
158/*
159 - exec - match regular expression
160 ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *,
161 ^					size_t, regmatch_t [], int);
162 */
163int
164exec(
165    regex_t *re,
166    CONST chr *string,
167    size_t len,
168    rm_detail_t *details,
169    size_t nmatch,
170    regmatch_t pmatch[],
171    int flags)
172{
173    AllocVars(v);
174    int st;
175    size_t n;
176    int backref;
177#define	LOCALMAT	20
178    regmatch_t mat[LOCALMAT];
179#define	LOCALMEM	40
180    regoff_t mem[LOCALMEM];
181
182    /*
183     * Sanity checks.
184     */
185
186    if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
187	FreeVars(v);
188	return REG_INVARG;
189    }
190    if (re->re_csize != sizeof(chr)) {
191	FreeVars(v);
192	return REG_MIXED;
193    }
194
195    /*
196     * Setup.
197     */
198
199    v->re = re;
200    v->g = (struct guts *)re->re_guts;
201    if ((v->g->cflags&REG_EXPECT) && details == NULL) {
202	FreeVars(v);
203	return REG_INVARG;
204    }
205    if (v->g->info&REG_UIMPOSSIBLE) {
206	FreeVars(v);
207	return REG_NOMATCH;
208    }
209    backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
210    v->eflags = flags;
211    if (v->g->cflags&REG_NOSUB) {
212	nmatch = 0;		/* override client */
213    }
214    v->nmatch = nmatch;
215    if (backref) {
216	/*
217	 * Need work area.
218	 */
219
220	if (v->g->nsub + 1 <= LOCALMAT) {
221	    v->pmatch = mat;
222	} else {
223	    v->pmatch = (regmatch_t *)
224		    MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
225	}
226	if (v->pmatch == NULL) {
227	    FreeVars(v);
228	    return REG_ESPACE;
229	}
230	v->nmatch = v->g->nsub + 1;
231    } else {
232	v->pmatch = pmatch;
233    }
234    v->details = details;
235    v->start = (chr *)string;
236    v->stop = (chr *)string + len;
237    v->err = 0;
238    if (backref) {
239	/*
240	 * Need retry memory.
241	 */
242
243	assert(v->g->ntree >= 0);
244	n = (size_t)v->g->ntree;
245	if (n <= LOCALMEM) {
246	    v->mem = mem;
247	} else {
248	    v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t));
249	}
250	if (v->mem == NULL) {
251	    if (v->pmatch != pmatch && v->pmatch != mat) {
252		FREE(v->pmatch);
253	    }
254	    FreeVars(v);
255	    return REG_ESPACE;
256	}
257    } else {
258	v->mem = NULL;
259    }
260
261    /*
262     * Do it.
263     */
264
265    assert(v->g->tree != NULL);
266    if (backref) {
267	st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
268    } else {
269	st = find(v, &v->g->tree->cnfa, &v->g->cmap);
270    }
271
272    /*
273     * Copy (portion of) match vector over if necessary.
274     */
275
276    if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
277	zapsubs(pmatch, nmatch);
278	n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
279	memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
280    }
281
282    /*
283     * Clean up.
284     */
285
286    if (v->pmatch != pmatch && v->pmatch != mat) {
287	FREE(v->pmatch);
288    }
289    if (v->mem != NULL && v->mem != mem) {
290	FREE(v->mem);
291    }
292    FreeVars(v);
293    return st;
294}
295
296/*
297 - find - find a match for the main NFA (no-complications case)
298 ^ static int find(struct vars *, struct cnfa *, struct colormap *);
299 */
300static int
301find(
302    struct vars *v,
303    struct cnfa *cnfa,
304    struct colormap *cm)
305{
306    struct dfa *s;
307    struct dfa *d;
308    chr *begin;
309    chr *end = NULL;
310    chr *cold;
311    chr *open;			/* Open and close of range of possible
312				 * starts */
313    chr *close;
314    int hitend;
315    int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
316
317    /*
318     * First, a shot with the search RE.
319     */
320
321    s = newdfa(v, &v->g->search, cm, &v->dfa1);
322    assert(!(ISERR() && s != NULL));
323    NOERR();
324    MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
325    cold = NULL;
326    close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
327    freedfa(s);
328    NOERR();
329    if (v->g->cflags&REG_EXPECT) {
330	assert(v->details != NULL);
331	if (cold != NULL) {
332	    v->details->rm_extend.rm_so = OFF(cold);
333	} else {
334	    v->details->rm_extend.rm_so = OFF(v->stop);
335	}
336	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
337    }
338    if (close == NULL) {	/* not found */
339	return REG_NOMATCH;
340    }
341    if (v->nmatch == 0) {	/* found, don't need exact location */
342	return REG_OKAY;
343    }
344
345    /*
346     * Find starting point and match.
347     */
348
349    assert(cold != NULL);
350    open = cold;
351    cold = NULL;
352    MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
353    d = newdfa(v, cnfa, cm, &v->dfa1);
354    assert(!(ISERR() && d != NULL));
355    NOERR();
356    for (begin = open; begin <= close; begin++) {
357	MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
358	if (shorter) {
359	    end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
360	} else {
361	    end = longest(v, d, begin, v->stop, &hitend);
362	}
363	NOERR();
364	if (hitend && cold == NULL) {
365	    cold = begin;
366	}
367	if (end != NULL) {
368	    break;		/* NOTE BREAK OUT */
369	}
370    }
371    assert(end != NULL);	/* search RE succeeded so loop should */
372    freedfa(d);
373
374    /*
375     * And pin down details.
376     */
377
378    assert(v->nmatch > 0);
379    v->pmatch[0].rm_so = OFF(begin);
380    v->pmatch[0].rm_eo = OFF(end);
381    if (v->g->cflags&REG_EXPECT) {
382	if (cold != NULL) {
383	    v->details->rm_extend.rm_so = OFF(cold);
384	} else {
385	    v->details->rm_extend.rm_so = OFF(v->stop);
386	}
387	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
388    }
389    if (v->nmatch == 1) {	/* no need for submatches */
390	return REG_OKAY;
391    }
392
393    /*
394     * Submatches.
395     */
396
397    zapsubs(v->pmatch, v->nmatch);
398    return dissect(v, v->g->tree, begin, end);
399}
400
401/*
402 - cfind - find a match for the main NFA (with complications)
403 ^ static int cfind(struct vars *, struct cnfa *, struct colormap *);
404 */
405static int
406cfind(
407    struct vars *v,
408    struct cnfa *cnfa,
409    struct colormap *cm)
410{
411    struct dfa *s;
412    struct dfa *d;
413    chr *cold = NULL; /* silence gcc 4 warning */
414    int ret;
415
416    s = newdfa(v, &v->g->search, cm, &v->dfa1);
417    NOERR();
418    d = newdfa(v, cnfa, cm, &v->dfa2);
419    if (ISERR()) {
420	assert(d == NULL);
421	freedfa(s);
422	return v->err;
423    }
424
425    ret = cfindloop(v, cnfa, cm, d, s, &cold);
426
427    freedfa(d);
428    freedfa(s);
429    NOERR();
430    if (v->g->cflags&REG_EXPECT) {
431	assert(v->details != NULL);
432	if (cold != NULL) {
433	    v->details->rm_extend.rm_so = OFF(cold);
434	} else {
435	    v->details->rm_extend.rm_so = OFF(v->stop);
436	}
437	v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
438    }
439    return ret;
440}
441
442/*
443 - cfindloop - the heart of cfind
444 ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *,
445 ^	struct dfa *, struct dfa *, chr **);
446 */
447static int
448cfindloop(
449    struct vars *v,
450    struct cnfa *cnfa,
451    struct colormap *cm,
452    struct dfa *d,
453    struct dfa *s,
454    chr **coldp)		/* where to put coldstart pointer */
455{
456    chr *begin;
457    chr *end;
458    chr *cold;
459    chr *open;			/* Open and close of range of possible
460				 * starts */
461    chr *close;
462    chr *estart;
463    chr *estop;
464    int er;
465    int shorter = v->g->tree->flags&SHORTER;
466    int hitend;
467
468    assert(d != NULL && s != NULL);
469    cold = NULL;
470    close = v->start;
471    do {
472	MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
473	close = shortest(v, s, close, close, v->stop, &cold, NULL);
474	if (close == NULL) {
475	    break;		/* NOTE BREAK */
476	}
477	assert(cold != NULL);
478	open = cold;
479	cold = NULL;
480	MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
481	for (begin = open; begin <= close; begin++) {
482	    MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
483	    estart = begin;
484	    estop = v->stop;
485	    for (;;) {
486		if (shorter) {
487		    end = shortest(v, d, begin, estart, estop, NULL, &hitend);
488		} else {
489		    end = longest(v, d, begin, estop, &hitend);
490		}
491		if (hitend && cold == NULL) {
492		    cold = begin;
493		}
494		if (end == NULL) {
495		    break;	/* NOTE BREAK OUT */
496		}
497
498		MDEBUG(("tentative end %ld\n", LOFF(end)));
499		zapsubs(v->pmatch, v->nmatch);
500		zapmem(v, v->g->tree);
501		er = cdissect(v, v->g->tree, begin, end);
502		if (er == REG_OKAY) {
503		    if (v->nmatch > 0) {
504			v->pmatch[0].rm_so = OFF(begin);
505			v->pmatch[0].rm_eo = OFF(end);
506		    }
507		    *coldp = cold;
508		    return REG_OKAY;
509		}
510		if (er != REG_NOMATCH) {
511		    ERR(er);
512		    return er;
513		}
514		if ((shorter) ? end == estop : end == begin) {
515		    /*
516		     * No point in trying again.
517		     */
518
519		    *coldp = cold;
520		    return REG_NOMATCH;
521		}
522
523		/*
524		 * Go around and try again
525		 */
526
527		if (shorter) {
528		    estart = end + 1;
529		} else {
530		    estop = end - 1;
531		}
532	    }
533	}
534    } while (close < v->stop);
535
536    *coldp = cold;
537    return REG_NOMATCH;
538}
539
540/*
541 - zapsubs - initialize the subexpression matches to "no match"
542 ^ static VOID zapsubs(regmatch_t *, size_t);
543 */
544static void
545zapsubs(
546    regmatch_t *p,
547    size_t n)
548{
549    size_t i;
550
551    for (i = n-1; i > 0; i--) {
552	p[i].rm_so = -1;
553	p[i].rm_eo = -1;
554    }
555}
556
557/*
558 - zapmem - initialize the retry memory of a subtree to zeros
559 ^ static VOID zapmem(struct vars *, struct subre *);
560 */
561static void
562zapmem(
563    struct vars *v,
564    struct subre *t)
565{
566    if (t == NULL) {
567	return;
568    }
569
570    assert(v->mem != NULL);
571    v->mem[t->retry] = 0;
572    if (t->op == '(') {
573	assert(t->subno > 0);
574	v->pmatch[t->subno].rm_so = -1;
575		v->pmatch[t->subno].rm_eo = -1;
576    }
577
578    if (t->left != NULL) {
579	zapmem(v, t->left);
580    }
581    if (t->right != NULL) {
582	zapmem(v, t->right);
583    }
584}
585
586/*
587 - subset - set any subexpression relevant to a successful subre
588 ^ static VOID subset(struct vars *, struct subre *, chr *, chr *);
589 */
590static void
591subset(
592    struct vars *v,
593    struct subre *sub,
594    chr *begin,
595    chr *end)
596{
597    int n = sub->subno;
598
599    assert(n > 0);
600    if ((size_t)n >= v->nmatch) {
601	return;
602    }
603
604    MDEBUG(("setting %d\n", n));
605    v->pmatch[n].rm_so = OFF(begin);
606    v->pmatch[n].rm_eo = OFF(end);
607}
608
609/*
610 - dissect - determine subexpression matches (uncomplicated case)
611 ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
612 */
613static int			/* regexec return code */
614dissect(
615    struct vars *v,
616    struct subre *t,
617    chr *begin,			/* beginning of relevant substring */
618    chr *end)			/* end of same */
619{
620    assert(t != NULL);
621    MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
622
623    switch (t->op) {
624    case '=':			/* terminal node */
625	assert(t->left == NULL && t->right == NULL);
626	return REG_OKAY;	/* no action, parent did the work */
627    case '|':			/* alternation */
628	assert(t->left != NULL);
629	return altdissect(v, t, begin, end);
630    case 'b':			/* back ref -- shouldn't be calling us! */
631	return REG_ASSERT;
632    case '.':			/* concatenation */
633	assert(t->left != NULL && t->right != NULL);
634	return condissect(v, t, begin, end);
635    case '(':			/* capturing */
636	assert(t->left != NULL && t->right == NULL);
637	assert(t->subno > 0);
638	subset(v, t, begin, end);
639	return dissect(v, t->left, begin, end);
640    default:
641	return REG_ASSERT;
642    }
643}
644
645/*
646 - condissect - determine concatenation subexpression matches (uncomplicated)
647 ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
648 */
649static int			/* regexec return code */
650condissect(
651    struct vars *v,
652    struct subre *t,
653    chr *begin,			/* beginning of relevant substring */
654    chr *end)			/* end of same */
655{
656    struct dfa *d;
657    struct dfa *d2;
658    chr *mid;
659    int i;
660    int shorter = (t->left->flags&SHORTER) ? 1 : 0;
661    chr *stop = (shorter) ? end : begin;
662
663    assert(t->op == '.');
664    assert(t->left != NULL && t->left->cnfa.nstates > 0);
665    assert(t->right != NULL && t->right->cnfa.nstates > 0);
666
667    d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
668    NOERR();
669    d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
670    if (ISERR()) {
671	assert(d2 == NULL);
672	freedfa(d);
673	return v->err;
674    }
675
676    /*
677     * Pick a tentative midpoint.
678     */
679
680    if (shorter) {
681	mid = shortest(v, d, begin, begin, end, NULL, NULL);
682    } else {
683	mid = longest(v, d, begin, end, NULL);
684    }
685    if (mid == NULL) {
686	freedfa(d);
687	freedfa(d2);
688	return REG_ASSERT;
689    }
690    MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
691
692    /*
693     * Iterate until satisfaction or failure.
694     */
695
696    while (longest(v, d2, mid, end, NULL) != end) {
697	/*
698	 * That midpoint didn't work, find a new one.
699	 */
700
701	if (mid == stop) {
702	    /*
703	     * All possibilities exhausted!
704	     */
705
706	    MDEBUG(("no midpoint!\n"));
707	    freedfa(d);
708	    freedfa(d2);
709	    return REG_ASSERT;
710	}
711	if (shorter) {
712	    mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
713	} else {
714	    mid = longest(v, d, begin, mid-1, NULL);
715	}
716	if (mid == NULL) {
717	    /*
718	     * Failed to find a new one!
719	     */
720
721	    MDEBUG(("failed midpoint!\n"));
722	    freedfa(d);
723	    freedfa(d2);
724	    return REG_ASSERT;
725	}
726	MDEBUG(("new midpoint %ld\n", LOFF(mid)));
727    }
728
729    /*
730     * Satisfaction.
731     */
732
733    MDEBUG(("successful\n"));
734    freedfa(d);
735    freedfa(d2);
736    i = dissect(v, t->left, begin, mid);
737    if (i != REG_OKAY) {
738	return i;
739    }
740    return dissect(v, t->right, mid, end);
741}
742
743/*
744 - altdissect - determine alternative subexpression matches (uncomplicated)
745 ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
746 */
747static int			/* regexec return code */
748altdissect(
749    struct vars *v,
750    struct subre *t,
751    chr *begin,			/* beginning of relevant substring */
752    chr *end)			/* end of same */
753{
754    struct dfa *d;
755    int i;
756
757    assert(t != NULL);
758    assert(t->op == '|');
759
760    for (i = 0; t != NULL; t = t->right, i++) {
761	MDEBUG(("trying %dth\n", i));
762	assert(t->left != NULL && t->left->cnfa.nstates > 0);
763	d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
764	if (ISERR()) {
765	    return v->err;
766	}
767	if (longest(v, d, begin, end, NULL) == end) {
768	    MDEBUG(("success\n"));
769	    freedfa(d);
770	    return dissect(v, t->left, begin, end);
771	}
772	freedfa(d);
773    }
774    return REG_ASSERT;		/* none of them matched?!? */
775}
776
777/*
778 - cdissect - determine subexpression matches (with complications)
779 * The retry memory stores the offset of the trial midpoint from begin, plus 1
780 * so that 0 uniquely means "clean slate".
781 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
782 */
783static int			/* regexec return code */
784cdissect(
785    struct vars *v,
786    struct subre *t,
787    chr *begin,			/* beginning of relevant substring */
788    chr *end)			/* end of same */
789{
790    int er;
791
792    assert(t != NULL);
793    MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
794
795    switch (t->op) {
796    case '=':			/* terminal node */
797	assert(t->left == NULL && t->right == NULL);
798	return REG_OKAY;	/* no action, parent did the work */
799    case '|':			/* alternation */
800	assert(t->left != NULL);
801	return caltdissect(v, t, begin, end);
802    case 'b':			/* back ref -- shouldn't be calling us! */
803	assert(t->left == NULL && t->right == NULL);
804	return cbrdissect(v, t, begin, end);
805    case '.':			/* concatenation */
806	assert(t->left != NULL && t->right != NULL);
807	return ccondissect(v, t, begin, end);
808    case '(':			/* capturing */
809	assert(t->left != NULL && t->right == NULL);
810	assert(t->subno > 0);
811	er = cdissect(v, t->left, begin, end);
812	if (er == REG_OKAY) {
813	    subset(v, t, begin, end);
814	}
815	return er;
816    default:
817	return REG_ASSERT;
818    }
819}
820
821/*
822 - ccondissect - concatenation subexpression matches (with complications)
823 * The retry memory stores the offset of the trial midpoint from begin, plus 1
824 * so that 0 uniquely means "clean slate".
825 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
826 */
827static int			/* regexec return code */
828ccondissect(
829    struct vars *v,
830    struct subre *t,
831    chr *begin,			/* beginning of relevant substring */
832    chr *end)			/* end of same */
833{
834    struct dfa *d, *d2;
835    chr *mid;
836    int er;
837
838    assert(t->op == '.');
839    assert(t->left != NULL && t->left->cnfa.nstates > 0);
840    assert(t->right != NULL && t->right->cnfa.nstates > 0);
841
842    if (t->left->flags&SHORTER) { /* reverse scan */
843	return crevdissect(v, t, begin, end);
844    }
845
846    d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
847    if (ISERR()) {
848	return v->err;
849    }
850    d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
851    if (ISERR()) {
852	freedfa(d);
853	return v->err;
854    }
855    MDEBUG(("cconcat %d\n", t->retry));
856
857    /*
858     * Pick a tentative midpoint.
859     */
860
861    if (v->mem[t->retry] == 0) {
862	mid = longest(v, d, begin, end, NULL);
863	if (mid == NULL) {
864	    freedfa(d);
865	    freedfa(d2);
866	    return REG_NOMATCH;
867	}
868	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
869	v->mem[t->retry] = (mid - begin) + 1;
870    } else {
871	mid = begin + (v->mem[t->retry] - 1);
872	MDEBUG(("working midpoint %ld\n", LOFF(mid)));
873    }
874
875    /*
876     * Iterate until satisfaction or failure.
877     */
878
879    for (;;) {
880	/*
881	 * Try this midpoint on for size.
882	 */
883
884	if (longest(v, d2, mid, end, NULL) == end) {
885	    er = cdissect(v, t->left, begin, mid);
886	    if (er == REG_OKAY) {
887		er = cdissect(v, t->right, mid, end);
888		if (er == REG_OKAY) {
889		    /*
890		     * Satisfaction.
891		     */
892
893		    MDEBUG(("successful\n"));
894		    freedfa(d);
895		    freedfa(d2);
896		    return REG_OKAY;
897		}
898	    }
899	    if ((er != REG_OKAY) && (er != REG_NOMATCH)) {
900		freedfa(d);
901		freedfa(d2);
902		return er;
903	    }
904	}
905
906	/*
907	 * That midpoint didn't work, find a new one.
908	 */
909
910	if (mid == begin) {
911	    /*
912	     * All possibilities exhausted.
913	     */
914
915	    MDEBUG(("%d no midpoint\n", t->retry));
916	    freedfa(d);
917	    freedfa(d2);
918	    return REG_NOMATCH;
919	}
920	mid = longest(v, d, begin, mid-1, NULL);
921	if (mid == NULL) {
922	    /*
923	     * Failed to find a new one.
924	     */
925
926	    MDEBUG(("%d failed midpoint\n", t->retry));
927	    freedfa(d);
928	    freedfa(d2);
929	    return REG_NOMATCH;
930	}
931	MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
932	v->mem[t->retry] = (mid - begin) + 1;
933	zapmem(v, t->left);
934	zapmem(v, t->right);
935    }
936}
937
938/*
939 - crevdissect - determine backref shortest-first subexpression matches
940 * The retry memory stores the offset of the trial midpoint from begin, plus 1
941 * so that 0 uniquely means "clean slate".
942 ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
943 */
944static int			/* regexec return code */
945crevdissect(
946    struct vars *v,
947    struct subre *t,
948    chr *begin,			/* beginning of relevant substring */
949    chr *end)			/* end of same */
950{
951    struct dfa *d;
952    struct dfa *d2;
953    chr *mid;
954    int er;
955
956    assert(t->op == '.');
957    assert(t->left != NULL && t->left->cnfa.nstates > 0);
958    assert(t->right != NULL && t->right->cnfa.nstates > 0);
959    assert(t->left->flags&SHORTER);
960
961    /*
962     * Concatenation -- need to split the substring between parts.
963     */
964
965    d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
966    if (ISERR()) {
967	return v->err;
968    }
969    d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
970    if (ISERR()) {
971	freedfa(d);
972	return v->err;
973    }
974    MDEBUG(("crev %d\n", t->retry));
975
976    /*
977     * Pick a tentative midpoint.
978     */
979
980    if (v->mem[t->retry] == 0) {
981	mid = shortest(v, d, begin, begin, end, NULL, NULL);
982	if (mid == NULL) {
983	    freedfa(d);
984	    freedfa(d2);
985	    return REG_NOMATCH;
986	}
987	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
988	v->mem[t->retry] = (mid - begin) + 1;
989    } else {
990	mid = begin + (v->mem[t->retry] - 1);
991	MDEBUG(("working midpoint %ld\n", LOFF(mid)));
992    }
993
994    /*
995     * Iterate until satisfaction or failure.
996     */
997
998    for (;;) {
999	/*
1000	 * Try this midpoint on for size.
1001	 */
1002
1003	if (longest(v, d2, mid, end, NULL) == end) {
1004	    er = cdissect(v, t->left, begin, mid);
1005	    if (er == REG_OKAY) {
1006		er = cdissect(v, t->right, mid, end);
1007		if (er == REG_OKAY) {
1008		    /*
1009		     * Satisfaction.
1010		     */
1011
1012		    MDEBUG(("successful\n"));
1013		    freedfa(d);
1014		    freedfa(d2);
1015		    return REG_OKAY;
1016		}
1017	    }
1018	    if (er != REG_OKAY && er != REG_NOMATCH) {
1019		freedfa(d);
1020		freedfa(d2);
1021		return er;
1022	    }
1023	}
1024
1025	/*
1026	 * That midpoint didn't work, find a new one.
1027	 */
1028
1029	if (mid == end) {
1030	    /*
1031	     * All possibilities exhausted.
1032	     */
1033
1034	    MDEBUG(("%d no midpoint\n", t->retry));
1035	    freedfa(d);
1036	    freedfa(d2);
1037	    return REG_NOMATCH;
1038	}
1039	mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
1040	if (mid == NULL) {
1041	    /*
1042	     * Failed to find a new one.
1043	     */
1044
1045	    MDEBUG(("%d failed midpoint\n", t->retry));
1046	    freedfa(d);
1047	    freedfa(d2);
1048	    return REG_NOMATCH;
1049	}
1050	MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
1051	v->mem[t->retry] = (mid - begin) + 1;
1052	zapmem(v, t->left);
1053	zapmem(v, t->right);
1054    }
1055}
1056
1057/*
1058 - cbrdissect - determine backref subexpression matches
1059 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
1060 */
1061static int			/* regexec return code */
1062cbrdissect(
1063    struct vars *v,
1064    struct subre *t,
1065    chr *begin,			/* beginning of relevant substring */
1066    chr *end)			/* end of same */
1067{
1068    int i;
1069    int n = t->subno;
1070    size_t len;
1071    chr *paren;
1072    chr *p;
1073    chr *stop;
1074    int min = t->min;
1075    int max = t->max;
1076
1077    assert(t != NULL);
1078    assert(t->op == 'b');
1079    assert(n >= 0);
1080    assert((size_t)n < v->nmatch);
1081
1082    MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
1083
1084    if (v->pmatch[n].rm_so == -1) {
1085	return REG_NOMATCH;
1086    }
1087    paren = v->start + v->pmatch[n].rm_so;
1088    len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1089
1090    /*
1091     * No room to maneuver -- retries are pointless.
1092     */
1093
1094    if (v->mem[t->retry]) {
1095	return REG_NOMATCH;
1096    }
1097    v->mem[t->retry] = 1;
1098
1099    /*
1100     * Special-case zero-length string.
1101     */
1102
1103    if (len == 0) {
1104	if (begin == end) {
1105	    return REG_OKAY;
1106	}
1107	return REG_NOMATCH;
1108    }
1109
1110    /*
1111     * And too-short string.
1112     */
1113
1114    assert(end >= begin);
1115    if ((size_t)(end - begin) < len) {
1116	return REG_NOMATCH;
1117    }
1118    stop = end - len;
1119
1120    /*
1121     * Count occurrences.
1122     */
1123
1124    i = 0;
1125    for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
1126	if ((*v->g->compare)(paren, p, len) != 0) {
1127	    break;
1128	}
1129	i++;
1130    }
1131    MDEBUG(("cbackref found %d\n", i));
1132
1133    /*
1134     * And sort it out.
1135     */
1136
1137    if (p != end) {		/* didn't consume all of it */
1138	return REG_NOMATCH;
1139    }
1140    if (min <= i && (i <= max || max == INFINITY)) {
1141	return REG_OKAY;
1142    }
1143    return REG_NOMATCH;		/* out of range */
1144}
1145
1146/*
1147 - caltdissect - determine alternative subexpression matches (w. complications)
1148 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
1149 */
1150static int			/* regexec return code */
1151caltdissect(
1152    struct vars *v,
1153    struct subre *t,
1154    chr *begin,			/* beginning of relevant substring */
1155    chr *end)			/* end of same */
1156{
1157    struct dfa *d;
1158    int er;
1159#define	UNTRIED	0		/* not yet tried at all */
1160#define	TRYING	1		/* top matched, trying submatches */
1161#define	TRIED	2		/* top didn't match or submatches exhausted */
1162
1163    if (t == NULL) {
1164	return REG_NOMATCH;
1165    }
1166    assert(t->op == '|');
1167    if (v->mem[t->retry] == TRIED) {
1168	return caltdissect(v, t->right, begin, end);
1169    }
1170
1171    MDEBUG(("calt n%d\n", t->retry));
1172    assert(t->left != NULL);
1173
1174    if (v->mem[t->retry] == UNTRIED) {
1175	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
1176	if (ISERR()) {
1177	    return v->err;
1178	}
1179	if (longest(v, d, begin, end, NULL) != end) {
1180	    freedfa(d);
1181	    v->mem[t->retry] = TRIED;
1182	    return caltdissect(v, t->right, begin, end);
1183	}
1184	freedfa(d);
1185	MDEBUG(("calt matched\n"));
1186	v->mem[t->retry] = TRYING;
1187    }
1188
1189    er = cdissect(v, t->left, begin, end);
1190    if (er != REG_NOMATCH) {
1191	return er;
1192    }
1193
1194    v->mem[t->retry] = TRIED;
1195    return caltdissect(v, t->right, begin, end);
1196}
1197
1198#include "rege_dfa.c"
1199
1200/*
1201 * Local Variables:
1202 * mode: c
1203 * c-basic-offset: 4
1204 * fill-column: 78
1205 * End:
1206 */
1207