engine.c revision 62754
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38 *
39 * $FreeBSD: head/lib/libc/regex/engine.c 62754 2000-07-07 07:46:36Z dcs $
40 */
41
42/*
43 * The matching engine and friends.  This file is #included by regexec.c
44 * after suitable #defines of a variety of macros used herein, so that
45 * different state representations can be used without duplicating masses
46 * of code.
47 */
48
49#ifdef SNAMES
50#define	matcher	smatcher
51#define	fast	sfast
52#define	slow	sslow
53#define	dissect	sdissect
54#define	backref	sbackref
55#define	step	sstep
56#define	print	sprint
57#define	at	sat
58#define	match	smat
59#endif
60#ifdef LNAMES
61#define	matcher	lmatcher
62#define	fast	lfast
63#define	slow	lslow
64#define	dissect	ldissect
65#define	backref	lbackref
66#define	step	lstep
67#define	print	lprint
68#define	at	lat
69#define	match	lmat
70#endif
71
72/* another structure passed up and down to avoid zillions of parameters */
73struct match {
74	struct re_guts *g;
75	int eflags;
76	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
77	char *offp;		/* offsets work from here */
78	char *beginp;		/* start of string -- virtual NUL precedes */
79	char *endp;		/* end of string -- virtual NUL here */
80	char *coldp;		/* can be no match starting before here */
81	char **lastpos;		/* [nplus+1] */
82	STATEVARS;
83	states st;		/* current states */
84	states fresh;		/* states for a fresh start */
85	states tmp;		/* temporary */
86	states empty;		/* empty set of states */
87};
88
89/* ========= begin header generated by ./mkh ========= */
90#ifdef __cplusplus
91extern "C" {
92#endif
93
94/* === engine.c === */
95static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
96static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
97static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
98static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
99static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
100static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
101#define	BOL	(OUT+1)
102#define	EOL	(BOL+1)
103#define	BOLEOL	(BOL+2)
104#define	NOTHING	(BOL+3)
105#define	BOW	(BOL+4)
106#define	EOW	(BOL+5)
107#define	CODEMAX	(BOL+5)		/* highest code used */
108#define	NONCHAR(c)	((c) > CHAR_MAX)
109#define	NNONCHAR	(CODEMAX-CHAR_MAX)
110#ifdef REDEBUG
111static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
112#endif
113#ifdef REDEBUG
114static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
115#endif
116#ifdef REDEBUG
117static char *pchar __P((int ch));
118#endif
119
120#ifdef __cplusplus
121}
122#endif
123/* ========= end header generated by ./mkh ========= */
124
125#ifdef REDEBUG
126#define	SP(t, s, c)	print(m, t, s, c, stdout)
127#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
128#define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
129#else
130#define	SP(t, s, c)	/* nothing */
131#define	AT(t, p1, p2, s1, s2)	/* nothing */
132#define	NOTE(s)	/* nothing */
133#endif
134
135/*
136 - matcher - the actual matching engine
137 == static int matcher(register struct re_guts *g, char *string, \
138 ==	size_t nmatch, regmatch_t pmatch[], int eflags);
139 */
140static int			/* 0 success, REG_NOMATCH failure */
141matcher(g, string, nmatch, pmatch, eflags)
142register struct re_guts *g;
143char *string;
144size_t nmatch;
145regmatch_t pmatch[];
146int eflags;
147{
148	register char *endp;
149	register int i;
150	struct match mv;
151	register struct match *m = &mv;
152	register char *dp;
153	register const sopno gf = g->firststate+1;	/* +1 for OEND */
154	register const sopno gl = g->laststate;
155	char *start;
156	char *stop;
157	/* Boyer-Moore algorithms variables */
158	register char *pp;
159	int cj, mj;
160	register char *mustfirst;
161	register char *mustlast;
162	register int *matchjump;
163	register int *charjump;
164
165	/* simplify the situation where possible */
166	if (g->cflags&REG_NOSUB)
167		nmatch = 0;
168	if (eflags&REG_STARTEND) {
169		start = string + pmatch[0].rm_so;
170		stop = string + pmatch[0].rm_eo;
171	} else {
172		start = string;
173		stop = start + strlen(start);
174	}
175	if (stop < start)
176		return(REG_INVARG);
177
178	/* prescreening; this does wonders for this rather slow code */
179	if (g->must != NULL) {
180		if (g->charjump != NULL && g->matchjump != NULL) {
181			mustfirst = g->must;
182			mustlast = g->must + g->mlen - 1;
183			charjump = g->charjump;
184			matchjump = g->matchjump;
185			pp = mustlast;
186			for (dp = start+g->mlen-1; dp < stop;) {
187				/* Fast skip non-matches */
188				while (dp < stop && charjump[*dp])
189					dp += charjump[*dp];
190
191				if (dp >= stop)
192					break;
193
194				/* Greedy matcher */
195				/* We depend on not being used for
196				 * for strings of length 1
197				 */
198				while (*--dp == *--pp && pp != mustfirst);
199
200				if (*dp == *pp)
201					break;
202
203				/* Jump to next possible match */
204				mj = matchjump[pp - mustfirst];
205				cj = charjump[*dp];
206				dp += (cj < mj ? mj : cj);
207				pp = mustlast;
208			}
209			if (pp != mustfirst)
210				return(REG_NOMATCH);
211		} else {
212			for (dp = start; dp < stop; dp++)
213				if (*dp == g->must[0] &&
214				    stop - dp >= g->mlen &&
215				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
216					break;
217			if (dp == stop)		/* we didn't find g->must */
218				return(REG_NOMATCH);
219		}
220	}
221
222	/* match struct setup */
223	m->g = g;
224	m->eflags = eflags;
225	m->pmatch = NULL;
226	m->lastpos = NULL;
227	m->offp = string;
228	m->beginp = start;
229	m->endp = stop;
230	STATESETUP(m, 4);
231	SETUP(m->st);
232	SETUP(m->fresh);
233	SETUP(m->tmp);
234	SETUP(m->empty);
235	CLEAR(m->empty);
236
237	/* Adjust start according to moffset, to speed things up */
238	if (g->moffset > -1)
239		start = dp - g->moffset;
240
241	/* this loop does only one repetition except for backrefs */
242	for (;;) {
243		endp = fast(m, start, stop, gf, gl);
244		if (endp == NULL) {		/* a miss */
245			STATETEARDOWN(m);
246			return(REG_NOMATCH);
247		}
248		if (nmatch == 0 && !g->backrefs)
249			break;		/* no further info needed */
250
251		/* where? */
252		assert(m->coldp != NULL);
253		for (;;) {
254			NOTE("finding start");
255			endp = slow(m, m->coldp, stop, gf, gl);
256			if (endp != NULL)
257				break;
258			assert(m->coldp < m->endp);
259			m->coldp++;
260		}
261		if (nmatch == 1 && !g->backrefs)
262			break;		/* no further info needed */
263
264		/* oh my, he wants the subexpressions... */
265		if (m->pmatch == NULL)
266			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
267							sizeof(regmatch_t));
268		if (m->pmatch == NULL) {
269			STATETEARDOWN(m);
270			return(REG_ESPACE);
271		}
272		for (i = 1; i <= m->g->nsub; i++)
273			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
274		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
275			NOTE("dissecting");
276			dp = dissect(m, m->coldp, endp, gf, gl);
277		} else {
278			if (g->nplus > 0 && m->lastpos == NULL)
279				m->lastpos = (char **)malloc((g->nplus+1) *
280							sizeof(char *));
281			if (g->nplus > 0 && m->lastpos == NULL) {
282				free(m->pmatch);
283				STATETEARDOWN(m);
284				return(REG_ESPACE);
285			}
286			NOTE("backref dissect");
287			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
288		}
289		if (dp != NULL)
290			break;
291
292		/* uh-oh... we couldn't find a subexpression-level match */
293		assert(g->backrefs);	/* must be back references doing it */
294		assert(g->nplus == 0 || m->lastpos != NULL);
295		for (;;) {
296			if (dp != NULL || endp <= m->coldp)
297				break;		/* defeat */
298			NOTE("backoff");
299			endp = slow(m, m->coldp, endp-1, gf, gl);
300			if (endp == NULL)
301				break;		/* defeat */
302			/* try it on a shorter possibility */
303#ifndef NDEBUG
304			for (i = 1; i <= m->g->nsub; i++) {
305				assert(m->pmatch[i].rm_so == -1);
306				assert(m->pmatch[i].rm_eo == -1);
307			}
308#endif
309			NOTE("backoff dissect");
310			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
311		}
312		assert(dp == NULL || dp == endp);
313		if (dp != NULL)		/* found a shorter one */
314			break;
315
316		/* despite initial appearances, there is no match here */
317		NOTE("false alarm");
318		start = m->coldp + 1;	/* recycle starting later */
319		assert(start <= stop);
320	}
321
322	/* fill in the details if requested */
323	if (nmatch > 0) {
324		pmatch[0].rm_so = m->coldp - m->offp;
325		pmatch[0].rm_eo = endp - m->offp;
326	}
327	if (nmatch > 1) {
328		assert(m->pmatch != NULL);
329		for (i = 1; i < nmatch; i++)
330			if (i <= m->g->nsub)
331				pmatch[i] = m->pmatch[i];
332			else {
333				pmatch[i].rm_so = -1;
334				pmatch[i].rm_eo = -1;
335			}
336	}
337
338	if (m->pmatch != NULL)
339		free((char *)m->pmatch);
340	if (m->lastpos != NULL)
341		free((char *)m->lastpos);
342	STATETEARDOWN(m);
343	return(0);
344}
345
346/*
347 - dissect - figure out what matched what, no back references
348 == static char *dissect(register struct match *m, char *start, \
349 ==	char *stop, sopno startst, sopno stopst);
350 */
351static char *			/* == stop (success) always */
352dissect(m, start, stop, startst, stopst)
353register struct match *m;
354char *start;
355char *stop;
356sopno startst;
357sopno stopst;
358{
359	register int i;
360	register sopno ss;	/* start sop of current subRE */
361	register sopno es;	/* end sop of current subRE */
362	register char *sp;	/* start of string matched by it */
363	register char *stp;	/* string matched by it cannot pass here */
364	register char *rest;	/* start of rest of string */
365	register char *tail;	/* string unmatched by rest of RE */
366	register sopno ssub;	/* start sop of subsubRE */
367	register sopno esub;	/* end sop of subsubRE */
368	register char *ssp;	/* start of string matched by subsubRE */
369	register char *sep;	/* end of string matched by subsubRE */
370	register char *oldssp;	/* previous ssp */
371	register char *dp;
372
373	AT("diss", start, stop, startst, stopst);
374	sp = start;
375	for (ss = startst; ss < stopst; ss = es) {
376		/* identify end of subRE */
377		es = ss;
378		switch (OP(m->g->strip[es])) {
379		case OPLUS_:
380		case OQUEST_:
381			es += OPND(m->g->strip[es]);
382			break;
383		case OCH_:
384			while (OP(m->g->strip[es]) != O_CH)
385				es += OPND(m->g->strip[es]);
386			break;
387		}
388		es++;
389
390		/* figure out what it matched */
391		switch (OP(m->g->strip[ss])) {
392		case OEND:
393			assert(nope);
394			break;
395		case OCHAR:
396			sp++;
397			break;
398		case OBOL:
399		case OEOL:
400		case OBOW:
401		case OEOW:
402			break;
403		case OANY:
404		case OANYOF:
405			sp++;
406			break;
407		case OBACK_:
408		case O_BACK:
409			assert(nope);
410			break;
411		/* cases where length of match is hard to find */
412		case OQUEST_:
413			stp = stop;
414			for (;;) {
415				/* how long could this one be? */
416				rest = slow(m, sp, stp, ss, es);
417				assert(rest != NULL);	/* it did match */
418				/* could the rest match the rest? */
419				tail = slow(m, rest, stop, es, stopst);
420				if (tail == stop)
421					break;		/* yes! */
422				/* no -- try a shorter match for this one */
423				stp = rest - 1;
424				assert(stp >= sp);	/* it did work */
425			}
426			ssub = ss + 1;
427			esub = es - 1;
428			/* did innards match? */
429			if (slow(m, sp, rest, ssub, esub) != NULL) {
430				dp = dissect(m, sp, rest, ssub, esub);
431				assert(dp == rest);
432			} else		/* no */
433				assert(sp == rest);
434			sp = rest;
435			break;
436		case OPLUS_:
437			stp = stop;
438			for (;;) {
439				/* how long could this one be? */
440				rest = slow(m, sp, stp, ss, es);
441				assert(rest != NULL);	/* it did match */
442				/* could the rest match the rest? */
443				tail = slow(m, rest, stop, es, stopst);
444				if (tail == stop)
445					break;		/* yes! */
446				/* no -- try a shorter match for this one */
447				stp = rest - 1;
448				assert(stp >= sp);	/* it did work */
449			}
450			ssub = ss + 1;
451			esub = es - 1;
452			ssp = sp;
453			oldssp = ssp;
454			for (;;) {	/* find last match of innards */
455				sep = slow(m, ssp, rest, ssub, esub);
456				if (sep == NULL || sep == ssp)
457					break;	/* failed or matched null */
458				oldssp = ssp;	/* on to next try */
459				ssp = sep;
460			}
461			if (sep == NULL) {
462				/* last successful match */
463				sep = ssp;
464				ssp = oldssp;
465			}
466			assert(sep == rest);	/* must exhaust substring */
467			assert(slow(m, ssp, sep, ssub, esub) == rest);
468			dp = dissect(m, ssp, sep, ssub, esub);
469			assert(dp == sep);
470			sp = rest;
471			break;
472		case OCH_:
473			stp = stop;
474			for (;;) {
475				/* how long could this one be? */
476				rest = slow(m, sp, stp, ss, es);
477				assert(rest != NULL);	/* it did match */
478				/* could the rest match the rest? */
479				tail = slow(m, rest, stop, es, stopst);
480				if (tail == stop)
481					break;		/* yes! */
482				/* no -- try a shorter match for this one */
483				stp = rest - 1;
484				assert(stp >= sp);	/* it did work */
485			}
486			ssub = ss + 1;
487			esub = ss + OPND(m->g->strip[ss]) - 1;
488			assert(OP(m->g->strip[esub]) == OOR1);
489			for (;;) {	/* find first matching branch */
490				if (slow(m, sp, rest, ssub, esub) == rest)
491					break;	/* it matched all of it */
492				/* that one missed, try next one */
493				assert(OP(m->g->strip[esub]) == OOR1);
494				esub++;
495				assert(OP(m->g->strip[esub]) == OOR2);
496				ssub = esub + 1;
497				esub += OPND(m->g->strip[esub]);
498				if (OP(m->g->strip[esub]) == OOR2)
499					esub--;
500				else
501					assert(OP(m->g->strip[esub]) == O_CH);
502			}
503			dp = dissect(m, sp, rest, ssub, esub);
504			assert(dp == rest);
505			sp = rest;
506			break;
507		case O_PLUS:
508		case O_QUEST:
509		case OOR1:
510		case OOR2:
511		case O_CH:
512			assert(nope);
513			break;
514		case OLPAREN:
515			i = OPND(m->g->strip[ss]);
516			assert(0 < i && i <= m->g->nsub);
517			m->pmatch[i].rm_so = sp - m->offp;
518			break;
519		case ORPAREN:
520			i = OPND(m->g->strip[ss]);
521			assert(0 < i && i <= m->g->nsub);
522			m->pmatch[i].rm_eo = sp - m->offp;
523			break;
524		default:		/* uh oh */
525			assert(nope);
526			break;
527		}
528	}
529
530	assert(sp == stop);
531	return(sp);
532}
533
534/*
535 - backref - figure out what matched what, figuring in back references
536 == static char *backref(register struct match *m, char *start, \
537 ==	char *stop, sopno startst, sopno stopst, sopno lev);
538 */
539static char *			/* == stop (success) or NULL (failure) */
540backref(m, start, stop, startst, stopst, lev)
541register struct match *m;
542char *start;
543char *stop;
544sopno startst;
545sopno stopst;
546sopno lev;			/* PLUS nesting level */
547{
548	register int i;
549	register sopno ss;	/* start sop of current subRE */
550	register char *sp;	/* start of string matched by it */
551	register sopno ssub;	/* start sop of subsubRE */
552	register sopno esub;	/* end sop of subsubRE */
553	register char *ssp;	/* start of string matched by subsubRE */
554	register char *dp;
555	register size_t len;
556	register int hard;
557	register sop s;
558	register regoff_t offsave;
559	register cset *cs;
560
561	AT("back", start, stop, startst, stopst);
562	sp = start;
563
564	/* get as far as we can with easy stuff */
565	hard = 0;
566	for (ss = startst; !hard && ss < stopst; ss++)
567		switch (OP(s = m->g->strip[ss])) {
568		case OCHAR:
569			if (sp == stop || *sp++ != (char)OPND(s))
570				return(NULL);
571			break;
572		case OANY:
573			if (sp == stop)
574				return(NULL);
575			sp++;
576			break;
577		case OANYOF:
578			cs = &m->g->sets[OPND(s)];
579			if (sp == stop || !CHIN(cs, *sp++))
580				return(NULL);
581			break;
582		case OBOL:
583			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
584					(sp < m->endp && *(sp-1) == '\n' &&
585						(m->g->cflags&REG_NEWLINE)) )
586				{ /* yes */ }
587			else
588				return(NULL);
589			break;
590		case OEOL:
591			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
592					(sp < m->endp && *sp == '\n' &&
593						(m->g->cflags&REG_NEWLINE)) )
594				{ /* yes */ }
595			else
596				return(NULL);
597			break;
598		case OBOW:
599			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
600					(sp < m->endp && *(sp-1) == '\n' &&
601						(m->g->cflags&REG_NEWLINE)) ||
602					(sp > m->beginp &&
603							!ISWORD(*(sp-1))) ) &&
604					(sp < m->endp && ISWORD(*sp)) )
605				{ /* yes */ }
606			else
607				return(NULL);
608			break;
609		case OEOW:
610			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
611					(sp < m->endp && *sp == '\n' &&
612						(m->g->cflags&REG_NEWLINE)) ||
613					(sp < m->endp && !ISWORD(*sp)) ) &&
614					(sp > m->beginp && ISWORD(*(sp-1))) )
615				{ /* yes */ }
616			else
617				return(NULL);
618			break;
619		case O_QUEST:
620			break;
621		case OOR1:	/* matches null but needs to skip */
622			ss++;
623			s = m->g->strip[ss];
624			do {
625				assert(OP(s) == OOR2);
626				ss += OPND(s);
627			} while (OP(s = m->g->strip[ss]) != O_CH);
628			/* note that the ss++ gets us past the O_CH */
629			break;
630		default:	/* have to make a choice */
631			hard = 1;
632			break;
633		}
634	if (!hard) {		/* that was it! */
635		if (sp != stop)
636			return(NULL);
637		return(sp);
638	}
639	ss--;			/* adjust for the for's final increment */
640
641	/* the hard stuff */
642	AT("hard", sp, stop, ss, stopst);
643	s = m->g->strip[ss];
644	switch (OP(s)) {
645	case OBACK_:		/* the vilest depths */
646		i = OPND(s);
647		assert(0 < i && i <= m->g->nsub);
648		if (m->pmatch[i].rm_eo == -1)
649			return(NULL);
650		assert(m->pmatch[i].rm_so != -1);
651		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
652		assert(stop - m->beginp >= len);
653		if (sp > stop - len)
654			return(NULL);	/* not enough left to match */
655		ssp = m->offp + m->pmatch[i].rm_so;
656		if (memcmp(sp, ssp, len) != 0)
657			return(NULL);
658		while (m->g->strip[ss] != SOP(O_BACK, i))
659			ss++;
660		return(backref(m, sp+len, stop, ss+1, stopst, lev));
661		break;
662	case OQUEST_:		/* to null or not */
663		dp = backref(m, sp, stop, ss+1, stopst, lev);
664		if (dp != NULL)
665			return(dp);	/* not */
666		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
667		break;
668	case OPLUS_:
669		assert(m->lastpos != NULL);
670		assert(lev+1 <= m->g->nplus);
671		m->lastpos[lev+1] = sp;
672		return(backref(m, sp, stop, ss+1, stopst, lev+1));
673		break;
674	case O_PLUS:
675		if (sp == m->lastpos[lev])	/* last pass matched null */
676			return(backref(m, sp, stop, ss+1, stopst, lev-1));
677		/* try another pass */
678		m->lastpos[lev] = sp;
679		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
680		if (dp == NULL)
681			return(backref(m, sp, stop, ss+1, stopst, lev-1));
682		else
683			return(dp);
684		break;
685	case OCH_:		/* find the right one, if any */
686		ssub = ss + 1;
687		esub = ss + OPND(s) - 1;
688		assert(OP(m->g->strip[esub]) == OOR1);
689		for (;;) {	/* find first matching branch */
690			dp = backref(m, sp, stop, ssub, esub, lev);
691			if (dp != NULL)
692				return(dp);
693			/* that one missed, try next one */
694			if (OP(m->g->strip[esub]) == O_CH)
695				return(NULL);	/* there is none */
696			esub++;
697			assert(OP(m->g->strip[esub]) == OOR2);
698			ssub = esub + 1;
699			esub += OPND(m->g->strip[esub]);
700			if (OP(m->g->strip[esub]) == OOR2)
701				esub--;
702			else
703				assert(OP(m->g->strip[esub]) == O_CH);
704		}
705		break;
706	case OLPAREN:		/* must undo assignment if rest fails */
707		i = OPND(s);
708		assert(0 < i && i <= m->g->nsub);
709		offsave = m->pmatch[i].rm_so;
710		m->pmatch[i].rm_so = sp - m->offp;
711		dp = backref(m, sp, stop, ss+1, stopst, lev);
712		if (dp != NULL)
713			return(dp);
714		m->pmatch[i].rm_so = offsave;
715		return(NULL);
716		break;
717	case ORPAREN:		/* must undo assignment if rest fails */
718		i = OPND(s);
719		assert(0 < i && i <= m->g->nsub);
720		offsave = m->pmatch[i].rm_eo;
721		m->pmatch[i].rm_eo = sp - m->offp;
722		dp = backref(m, sp, stop, ss+1, stopst, lev);
723		if (dp != NULL)
724			return(dp);
725		m->pmatch[i].rm_eo = offsave;
726		return(NULL);
727		break;
728	default:		/* uh oh */
729		assert(nope);
730		break;
731	}
732
733	/* "can't happen" */
734	assert(nope);
735	/* NOTREACHED */
736	return "shut up gcc";
737}
738
739/*
740 - fast - step through the string at top speed
741 == static char *fast(register struct match *m, char *start, \
742 ==	char *stop, sopno startst, sopno stopst);
743 */
744static char *			/* where tentative match ended, or NULL */
745fast(m, start, stop, startst, stopst)
746register struct match *m;
747char *start;
748char *stop;
749sopno startst;
750sopno stopst;
751{
752	register states st = m->st;
753	register states fresh = m->fresh;
754	register states tmp = m->tmp;
755	register char *p = start;
756	register int c = (start == m->beginp) ? OUT : *(start-1);
757	register int lastc;	/* previous c */
758	register int flagch;
759	register int i;
760	register char *coldp;	/* last p after which no match was underway */
761
762	CLEAR(st);
763	SET1(st, startst);
764	st = step(m->g, startst, stopst, st, NOTHING, st);
765	ASSIGN(fresh, st);
766	SP("start", st, *p);
767	coldp = NULL;
768	for (;;) {
769		/* next character */
770		lastc = c;
771		c = (p == m->endp) ? OUT : *p;
772		if (EQ(st, fresh))
773			coldp = p;
774
775		/* is there an EOL and/or BOL between lastc and c? */
776		flagch = '\0';
777		i = 0;
778		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
779				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
780			flagch = BOL;
781			i = m->g->nbol;
782		}
783		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
784				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
785			flagch = (flagch == BOL) ? BOLEOL : EOL;
786			i += m->g->neol;
787		}
788		if (i != 0) {
789			for (; i > 0; i--)
790				st = step(m->g, startst, stopst, st, flagch, st);
791			SP("boleol", st, c);
792		}
793
794		/* how about a word boundary? */
795		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
796					(c != OUT && ISWORD(c)) ) {
797			flagch = BOW;
798		}
799		if ( (lastc != OUT && ISWORD(lastc)) &&
800				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
801			flagch = EOW;
802		}
803		if (flagch == BOW || flagch == EOW) {
804			st = step(m->g, startst, stopst, st, flagch, st);
805			SP("boweow", st, c);
806		}
807
808		/* are we done? */
809		if (ISSET(st, stopst) || p == stop)
810			break;		/* NOTE BREAK OUT */
811
812		/* no, we must deal with this character */
813		ASSIGN(tmp, st);
814		ASSIGN(st, fresh);
815		assert(c != OUT);
816		st = step(m->g, startst, stopst, tmp, c, st);
817		SP("aft", st, c);
818		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
819		p++;
820	}
821
822	assert(coldp != NULL);
823	m->coldp = coldp;
824	if (ISSET(st, stopst))
825		return(p+1);
826	else
827		return(NULL);
828}
829
830/*
831 - slow - step through the string more deliberately
832 == static char *slow(register struct match *m, char *start, \
833 ==	char *stop, sopno startst, sopno stopst);
834 */
835static char *			/* where it ended */
836slow(m, start, stop, startst, stopst)
837register struct match *m;
838char *start;
839char *stop;
840sopno startst;
841sopno stopst;
842{
843	register states st = m->st;
844	register states empty = m->empty;
845	register states tmp = m->tmp;
846	register char *p = start;
847	register int c = (start == m->beginp) ? OUT : *(start-1);
848	register int lastc;	/* previous c */
849	register int flagch;
850	register int i;
851	register char *matchp;	/* last p at which a match ended */
852
853	AT("slow", start, stop, startst, stopst);
854	CLEAR(st);
855	SET1(st, startst);
856	SP("sstart", st, *p);
857	st = step(m->g, startst, stopst, st, NOTHING, st);
858	matchp = NULL;
859	for (;;) {
860		/* next character */
861		lastc = c;
862		c = (p == m->endp) ? OUT : *p;
863
864		/* is there an EOL and/or BOL between lastc and c? */
865		flagch = '\0';
866		i = 0;
867		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
868				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
869			flagch = BOL;
870			i = m->g->nbol;
871		}
872		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
873				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
874			flagch = (flagch == BOL) ? BOLEOL : EOL;
875			i += m->g->neol;
876		}
877		if (i != 0) {
878			for (; i > 0; i--)
879				st = step(m->g, startst, stopst, st, flagch, st);
880			SP("sboleol", st, c);
881		}
882
883		/* how about a word boundary? */
884		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
885					(c != OUT && ISWORD(c)) ) {
886			flagch = BOW;
887		}
888		if ( (lastc != OUT && ISWORD(lastc)) &&
889				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
890			flagch = EOW;
891		}
892		if (flagch == BOW || flagch == EOW) {
893			st = step(m->g, startst, stopst, st, flagch, st);
894			SP("sboweow", st, c);
895		}
896
897		/* are we done? */
898		if (ISSET(st, stopst))
899			matchp = p;
900		if (EQ(st, empty) || p == stop)
901			break;		/* NOTE BREAK OUT */
902
903		/* no, we must deal with this character */
904		ASSIGN(tmp, st);
905		ASSIGN(st, empty);
906		assert(c != OUT);
907		st = step(m->g, startst, stopst, tmp, c, st);
908		SP("saft", st, c);
909		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
910		p++;
911	}
912
913	return(matchp);
914}
915
916
917/*
918 - step - map set of states reachable before char to set reachable after
919 == static states step(register struct re_guts *g, sopno start, sopno stop, \
920 ==	register states bef, int ch, register states aft);
921 == #define	BOL	(OUT+1)
922 == #define	EOL	(BOL+1)
923 == #define	BOLEOL	(BOL+2)
924 == #define	NOTHING	(BOL+3)
925 == #define	BOW	(BOL+4)
926 == #define	EOW	(BOL+5)
927 == #define	CODEMAX	(BOL+5)		// highest code used
928 == #define	NONCHAR(c)	((c) > CHAR_MAX)
929 == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
930 */
931static states
932step(g, start, stop, bef, ch, aft)
933register struct re_guts *g;
934sopno start;			/* start state within strip */
935sopno stop;			/* state after stop state within strip */
936register states bef;		/* states reachable before */
937int ch;				/* character or NONCHAR code */
938register states aft;		/* states already known reachable after */
939{
940	register cset *cs;
941	register sop s;
942	register sopno pc;
943	register onestate here;		/* note, macros know this name */
944	register sopno look;
945	register int i;
946
947	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
948		s = g->strip[pc];
949		switch (OP(s)) {
950		case OEND:
951			assert(pc == stop-1);
952			break;
953		case OCHAR:
954			/* only characters can match */
955			assert(!NONCHAR(ch) || ch != (char)OPND(s));
956			if (ch == (char)OPND(s))
957				FWD(aft, bef, 1);
958			break;
959		case OBOL:
960			if (ch == BOL || ch == BOLEOL)
961				FWD(aft, bef, 1);
962			break;
963		case OEOL:
964			if (ch == EOL || ch == BOLEOL)
965				FWD(aft, bef, 1);
966			break;
967		case OBOW:
968			if (ch == BOW)
969				FWD(aft, bef, 1);
970			break;
971		case OEOW:
972			if (ch == EOW)
973				FWD(aft, bef, 1);
974			break;
975		case OANY:
976			if (!NONCHAR(ch))
977				FWD(aft, bef, 1);
978			break;
979		case OANYOF:
980			cs = &g->sets[OPND(s)];
981			if (!NONCHAR(ch) && CHIN(cs, ch))
982				FWD(aft, bef, 1);
983			break;
984		case OBACK_:		/* ignored here */
985		case O_BACK:
986			FWD(aft, aft, 1);
987			break;
988		case OPLUS_:		/* forward, this is just an empty */
989			FWD(aft, aft, 1);
990			break;
991		case O_PLUS:		/* both forward and back */
992			FWD(aft, aft, 1);
993			i = ISSETBACK(aft, OPND(s));
994			BACK(aft, aft, OPND(s));
995			if (!i && ISSETBACK(aft, OPND(s))) {
996				/* oho, must reconsider loop body */
997				pc -= OPND(s) + 1;
998				INIT(here, pc);
999			}
1000			break;
1001		case OQUEST_:		/* two branches, both forward */
1002			FWD(aft, aft, 1);
1003			FWD(aft, aft, OPND(s));
1004			break;
1005		case O_QUEST:		/* just an empty */
1006			FWD(aft, aft, 1);
1007			break;
1008		case OLPAREN:		/* not significant here */
1009		case ORPAREN:
1010			FWD(aft, aft, 1);
1011			break;
1012		case OCH_:		/* mark the first two branches */
1013			FWD(aft, aft, 1);
1014			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1015			FWD(aft, aft, OPND(s));
1016			break;
1017		case OOR1:		/* done a branch, find the O_CH */
1018			if (ISSTATEIN(aft, here)) {
1019				for (look = 1;
1020						OP(s = g->strip[pc+look]) != O_CH;
1021						look += OPND(s))
1022					assert(OP(s) == OOR2);
1023				FWD(aft, aft, look);
1024			}
1025			break;
1026		case OOR2:		/* propagate OCH_'s marking */
1027			FWD(aft, aft, 1);
1028			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1029				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1030				FWD(aft, aft, OPND(s));
1031			}
1032			break;
1033		case O_CH:		/* just empty */
1034			FWD(aft, aft, 1);
1035			break;
1036		default:		/* ooooops... */
1037			assert(nope);
1038			break;
1039		}
1040	}
1041
1042	return(aft);
1043}
1044
1045#ifdef REDEBUG
1046/*
1047 - print - print a set of states
1048 == #ifdef REDEBUG
1049 == static void print(struct match *m, char *caption, states st, \
1050 ==	int ch, FILE *d);
1051 == #endif
1052 */
1053static void
1054print(m, caption, st, ch, d)
1055struct match *m;
1056char *caption;
1057states st;
1058int ch;
1059FILE *d;
1060{
1061	register struct re_guts *g = m->g;
1062	register int i;
1063	register int first = 1;
1064
1065	if (!(m->eflags&REG_TRACE))
1066		return;
1067
1068	fprintf(d, "%s", caption);
1069	if (ch != '\0')
1070		fprintf(d, " %s", pchar(ch));
1071	for (i = 0; i < g->nstates; i++)
1072		if (ISSET(st, i)) {
1073			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1074			first = 0;
1075		}
1076	fprintf(d, "\n");
1077}
1078
1079/*
1080 - at - print current situation
1081 == #ifdef REDEBUG
1082 == static void at(struct match *m, char *title, char *start, char *stop, \
1083 ==						sopno startst, sopno stopst);
1084 == #endif
1085 */
1086static void
1087at(m, title, start, stop, startst, stopst)
1088struct match *m;
1089char *title;
1090char *start;
1091char *stop;
1092sopno startst;
1093sopno stopst;
1094{
1095	if (!(m->eflags&REG_TRACE))
1096		return;
1097
1098	printf("%s %s-", title, pchar(*start));
1099	printf("%s ", pchar(*stop));
1100	printf("%ld-%ld\n", (long)startst, (long)stopst);
1101}
1102
1103#ifndef PCHARDONE
1104#define	PCHARDONE	/* never again */
1105/*
1106 - pchar - make a character printable
1107 == #ifdef REDEBUG
1108 == static char *pchar(int ch);
1109 == #endif
1110 *
1111 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1112 * duplicate here avoids having a debugging-capable regexec.o tied to
1113 * a matching debug.o, and this is convenient.  It all disappears in
1114 * the non-debug compilation anyway, so it doesn't matter much.
1115 */
1116static char *			/* -> representation */
1117pchar(ch)
1118int ch;
1119{
1120	static char pbuf[10];
1121
1122	if (isprint((uch)ch) || ch == ' ')
1123		sprintf(pbuf, "%c", ch);
1124	else
1125		sprintf(pbuf, "\\%o", ch);
1126	return(pbuf);
1127}
1128#endif
1129#endif
1130
1131#undef	matcher
1132#undef	fast
1133#undef	slow
1134#undef	dissect
1135#undef	backref
1136#undef	step
1137#undef	print
1138#undef	at
1139#undef	match
1140