engine.c revision 92986
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: head/lib/libc/regex/engine.c 92986 2002-03-22 21:53:29Z obrien $");
42
43/*
44 * The matching engine and friends.  This file is #included by regexec.c
45 * after suitable #defines of a variety of macros used herein, so that
46 * different state representations can be used without duplicating masses
47 * of code.
48 */
49
50#ifdef SNAMES
51#define	matcher	smatcher
52#define	fast	sfast
53#define	slow	sslow
54#define	dissect	sdissect
55#define	backref	sbackref
56#define	step	sstep
57#define	print	sprint
58#define	at	sat
59#define	match	smat
60#endif
61#ifdef LNAMES
62#define	matcher	lmatcher
63#define	fast	lfast
64#define	slow	lslow
65#define	dissect	ldissect
66#define	backref	lbackref
67#define	step	lstep
68#define	print	lprint
69#define	at	lat
70#define	match	lmat
71#endif
72
73/* another structure passed up and down to avoid zillions of parameters */
74struct match {
75	struct re_guts *g;
76	int eflags;
77	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
78	char *offp;		/* offsets work from here */
79	char *beginp;		/* start of string -- virtual NUL precedes */
80	char *endp;		/* end of string -- virtual NUL here */
81	char *coldp;		/* can be no match starting before here */
82	char **lastpos;		/* [nplus+1] */
83	STATEVARS;
84	states st;		/* current states */
85	states fresh;		/* states for a fresh start */
86	states tmp;		/* temporary */
87	states empty;		/* empty set of states */
88};
89
90/* ========= begin header generated by ./mkh ========= */
91#ifdef __cplusplus
92extern "C" {
93#endif
94
95/* === engine.c === */
96static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
97static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
98static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
99static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
100static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
101static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
102#define	BOL	(OUT+1)
103#define	EOL	(BOL+1)
104#define	BOLEOL	(BOL+2)
105#define	NOTHING	(BOL+3)
106#define	BOW	(BOL+4)
107#define	EOW	(BOL+5)
108#define	CODEMAX	(BOL+5)		/* highest code used */
109#define	NONCHAR(c)	((c) > CHAR_MAX)
110#define	NNONCHAR	(CODEMAX-CHAR_MAX)
111#ifdef REDEBUG
112static void print(struct match *m, char *caption, states st, int ch, FILE *d);
113#endif
114#ifdef REDEBUG
115static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
116#endif
117#ifdef REDEBUG
118static char *pchar(int ch);
119#endif
120
121#ifdef __cplusplus
122}
123#endif
124/* ========= end header generated by ./mkh ========= */
125
126#ifdef REDEBUG
127#define	SP(t, s, c)	print(m, t, s, c, stdout)
128#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
129#define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
130#else
131#define	SP(t, s, c)	/* nothing */
132#define	AT(t, p1, p2, s1, s2)	/* nothing */
133#define	NOTE(s)	/* nothing */
134#endif
135
136/*
137 - matcher - the actual matching engine
138 == static int matcher(struct re_guts *g, char *string, \
139 ==	size_t nmatch, regmatch_t pmatch[], int eflags);
140 */
141static int			/* 0 success, REG_NOMATCH failure */
142matcher(g, string, nmatch, pmatch, eflags)
143struct re_guts *g;
144char *string;
145size_t nmatch;
146regmatch_t pmatch[];
147int eflags;
148{
149	char *endp;
150	int i;
151	struct match mv;
152	struct match *m = &mv;
153	char *dp;
154	const sopno gf = g->firststate+1;	/* +1 for OEND */
155	const sopno gl = g->laststate;
156	char *start;
157	char *stop;
158	/* Boyer-Moore algorithms variables */
159	char *pp;
160	int cj, mj;
161	char *mustfirst;
162	char *mustlast;
163	int *matchjump;
164	int *charjump;
165
166	/* simplify the situation where possible */
167	if (g->cflags&REG_NOSUB)
168		nmatch = 0;
169	if (eflags&REG_STARTEND) {
170		start = string + pmatch[0].rm_so;
171		stop = string + pmatch[0].rm_eo;
172	} else {
173		start = string;
174		stop = start + strlen(start);
175	}
176	if (stop < start)
177		return(REG_INVARG);
178
179	/* prescreening; this does wonders for this rather slow code */
180	if (g->must != NULL) {
181		if (g->charjump != NULL && g->matchjump != NULL) {
182			mustfirst = g->must;
183			mustlast = g->must + g->mlen - 1;
184			charjump = g->charjump;
185			matchjump = g->matchjump;
186			pp = mustlast;
187			for (dp = start+g->mlen-1; dp < stop;) {
188				/* Fast skip non-matches */
189				while (dp < stop && charjump[*dp])
190					dp += charjump[*dp];
191
192				if (dp >= stop)
193					break;
194
195				/* Greedy matcher */
196				/* We depend on not being used for
197				 * for strings of length 1
198				 */
199				while (*--dp == *--pp && pp != mustfirst);
200
201				if (*dp == *pp)
202					break;
203
204				/* Jump to next possible match */
205				mj = matchjump[pp - mustfirst];
206				cj = charjump[*dp];
207				dp += (cj < mj ? mj : cj);
208				pp = mustlast;
209			}
210			if (pp != mustfirst)
211				return(REG_NOMATCH);
212		} else {
213			for (dp = start; dp < stop; dp++)
214				if (*dp == g->must[0] &&
215				    stop - dp >= g->mlen &&
216				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
217					break;
218			if (dp == stop)		/* we didn't find g->must */
219				return(REG_NOMATCH);
220		}
221	}
222
223	/* match struct setup */
224	m->g = g;
225	m->eflags = eflags;
226	m->pmatch = NULL;
227	m->lastpos = NULL;
228	m->offp = string;
229	m->beginp = start;
230	m->endp = stop;
231	STATESETUP(m, 4);
232	SETUP(m->st);
233	SETUP(m->fresh);
234	SETUP(m->tmp);
235	SETUP(m->empty);
236	CLEAR(m->empty);
237
238	/* Adjust start according to moffset, to speed things up */
239	if (g->moffset > -1)
240		start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
241
242	/* this loop does only one repetition except for backrefs */
243	for (;;) {
244		endp = fast(m, start, stop, gf, gl);
245		if (endp == NULL) {		/* a miss */
246			STATETEARDOWN(m);
247			return(REG_NOMATCH);
248		}
249		if (nmatch == 0 && !g->backrefs)
250			break;		/* no further info needed */
251
252		/* where? */
253		assert(m->coldp != NULL);
254		for (;;) {
255			NOTE("finding start");
256			endp = slow(m, m->coldp, stop, gf, gl);
257			if (endp != NULL)
258				break;
259			assert(m->coldp < m->endp);
260			m->coldp++;
261		}
262		if (nmatch == 1 && !g->backrefs)
263			break;		/* no further info needed */
264
265		/* oh my, he wants the subexpressions... */
266		if (m->pmatch == NULL)
267			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
268							sizeof(regmatch_t));
269		if (m->pmatch == NULL) {
270			STATETEARDOWN(m);
271			return(REG_ESPACE);
272		}
273		for (i = 1; i <= m->g->nsub; i++)
274			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
275		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
276			NOTE("dissecting");
277			dp = dissect(m, m->coldp, endp, gf, gl);
278		} else {
279			if (g->nplus > 0 && m->lastpos == NULL)
280				m->lastpos = (char **)malloc((g->nplus+1) *
281							sizeof(char *));
282			if (g->nplus > 0 && m->lastpos == NULL) {
283				free(m->pmatch);
284				STATETEARDOWN(m);
285				return(REG_ESPACE);
286			}
287			NOTE("backref dissect");
288			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
289		}
290		if (dp != NULL)
291			break;
292
293		/* uh-oh... we couldn't find a subexpression-level match */
294		assert(g->backrefs);	/* must be back references doing it */
295		assert(g->nplus == 0 || m->lastpos != NULL);
296		for (;;) {
297			if (dp != NULL || endp <= m->coldp)
298				break;		/* defeat */
299			NOTE("backoff");
300			endp = slow(m, m->coldp, endp-1, gf, gl);
301			if (endp == NULL)
302				break;		/* defeat */
303			/* try it on a shorter possibility */
304#ifndef NDEBUG
305			for (i = 1; i <= m->g->nsub; i++) {
306				assert(m->pmatch[i].rm_so == -1);
307				assert(m->pmatch[i].rm_eo == -1);
308			}
309#endif
310			NOTE("backoff dissect");
311			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
312		}
313		assert(dp == NULL || dp == endp);
314		if (dp != NULL)		/* found a shorter one */
315			break;
316
317		/* despite initial appearances, there is no match here */
318		NOTE("false alarm");
319		start = m->coldp + 1;	/* recycle starting later */
320		assert(start <= stop);
321	}
322
323	/* fill in the details if requested */
324	if (nmatch > 0) {
325		pmatch[0].rm_so = m->coldp - m->offp;
326		pmatch[0].rm_eo = endp - m->offp;
327	}
328	if (nmatch > 1) {
329		assert(m->pmatch != NULL);
330		for (i = 1; i < nmatch; i++)
331			if (i <= m->g->nsub)
332				pmatch[i] = m->pmatch[i];
333			else {
334				pmatch[i].rm_so = -1;
335				pmatch[i].rm_eo = -1;
336			}
337	}
338
339	if (m->pmatch != NULL)
340		free((char *)m->pmatch);
341	if (m->lastpos != NULL)
342		free((char *)m->lastpos);
343	STATETEARDOWN(m);
344	return(0);
345}
346
347/*
348 - dissect - figure out what matched what, no back references
349 == static char *dissect(struct match *m, char *start, \
350 ==	char *stop, sopno startst, sopno stopst);
351 */
352static char *			/* == stop (success) always */
353dissect(m, start, stop, startst, stopst)
354struct match *m;
355char *start;
356char *stop;
357sopno startst;
358sopno stopst;
359{
360	int i;
361	sopno ss;		/* start sop of current subRE */
362	sopno es;		/* end sop of current subRE */
363	char *sp;		/* start of string matched by it */
364	char *stp;		/* string matched by it cannot pass here */
365	char *rest;		/* start of rest of string */
366	char *tail;		/* string unmatched by rest of RE */
367	sopno ssub;		/* start sop of subsubRE */
368	sopno esub;		/* end sop of subsubRE */
369	char *ssp;		/* start of string matched by subsubRE */
370	char *sep;		/* end of string matched by subsubRE */
371	char *oldssp;		/* previous ssp */
372	char *dp;
373
374	AT("diss", start, stop, startst, stopst);
375	sp = start;
376	for (ss = startst; ss < stopst; ss = es) {
377		/* identify end of subRE */
378		es = ss;
379		switch (OP(m->g->strip[es])) {
380		case OPLUS_:
381		case OQUEST_:
382			es += OPND(m->g->strip[es]);
383			break;
384		case OCH_:
385			while (OP(m->g->strip[es]) != O_CH)
386				es += OPND(m->g->strip[es]);
387			break;
388		}
389		es++;
390
391		/* figure out what it matched */
392		switch (OP(m->g->strip[ss])) {
393		case OEND:
394			assert(nope);
395			break;
396		case OCHAR:
397			sp++;
398			break;
399		case OBOL:
400		case OEOL:
401		case OBOW:
402		case OEOW:
403			break;
404		case OANY:
405		case OANYOF:
406			sp++;
407			break;
408		case OBACK_:
409		case O_BACK:
410			assert(nope);
411			break;
412		/* cases where length of match is hard to find */
413		case OQUEST_:
414			stp = stop;
415			for (;;) {
416				/* how long could this one be? */
417				rest = slow(m, sp, stp, ss, es);
418				assert(rest != NULL);	/* it did match */
419				/* could the rest match the rest? */
420				tail = slow(m, rest, stop, es, stopst);
421				if (tail == stop)
422					break;		/* yes! */
423				/* no -- try a shorter match for this one */
424				stp = rest - 1;
425				assert(stp >= sp);	/* it did work */
426			}
427			ssub = ss + 1;
428			esub = es - 1;
429			/* did innards match? */
430			if (slow(m, sp, rest, ssub, esub) != NULL) {
431				dp = dissect(m, sp, rest, ssub, esub);
432				assert(dp == rest);
433			} else		/* no */
434				assert(sp == rest);
435			sp = rest;
436			break;
437		case OPLUS_:
438			stp = stop;
439			for (;;) {
440				/* how long could this one be? */
441				rest = slow(m, sp, stp, ss, es);
442				assert(rest != NULL);	/* it did match */
443				/* could the rest match the rest? */
444				tail = slow(m, rest, stop, es, stopst);
445				if (tail == stop)
446					break;		/* yes! */
447				/* no -- try a shorter match for this one */
448				stp = rest - 1;
449				assert(stp >= sp);	/* it did work */
450			}
451			ssub = ss + 1;
452			esub = es - 1;
453			ssp = sp;
454			oldssp = ssp;
455			for (;;) {	/* find last match of innards */
456				sep = slow(m, ssp, rest, ssub, esub);
457				if (sep == NULL || sep == ssp)
458					break;	/* failed or matched null */
459				oldssp = ssp;	/* on to next try */
460				ssp = sep;
461			}
462			if (sep == NULL) {
463				/* last successful match */
464				sep = ssp;
465				ssp = oldssp;
466			}
467			assert(sep == rest);	/* must exhaust substring */
468			assert(slow(m, ssp, sep, ssub, esub) == rest);
469			dp = dissect(m, ssp, sep, ssub, esub);
470			assert(dp == sep);
471			sp = rest;
472			break;
473		case OCH_:
474			stp = stop;
475			for (;;) {
476				/* how long could this one be? */
477				rest = slow(m, sp, stp, ss, es);
478				assert(rest != NULL);	/* it did match */
479				/* could the rest match the rest? */
480				tail = slow(m, rest, stop, es, stopst);
481				if (tail == stop)
482					break;		/* yes! */
483				/* no -- try a shorter match for this one */
484				stp = rest - 1;
485				assert(stp >= sp);	/* it did work */
486			}
487			ssub = ss + 1;
488			esub = ss + OPND(m->g->strip[ss]) - 1;
489			assert(OP(m->g->strip[esub]) == OOR1);
490			for (;;) {	/* find first matching branch */
491				if (slow(m, sp, rest, ssub, esub) == rest)
492					break;	/* it matched all of it */
493				/* that one missed, try next one */
494				assert(OP(m->g->strip[esub]) == OOR1);
495				esub++;
496				assert(OP(m->g->strip[esub]) == OOR2);
497				ssub = esub + 1;
498				esub += OPND(m->g->strip[esub]);
499				if (OP(m->g->strip[esub]) == OOR2)
500					esub--;
501				else
502					assert(OP(m->g->strip[esub]) == O_CH);
503			}
504			dp = dissect(m, sp, rest, ssub, esub);
505			assert(dp == rest);
506			sp = rest;
507			break;
508		case O_PLUS:
509		case O_QUEST:
510		case OOR1:
511		case OOR2:
512		case O_CH:
513			assert(nope);
514			break;
515		case OLPAREN:
516			i = OPND(m->g->strip[ss]);
517			assert(0 < i && i <= m->g->nsub);
518			m->pmatch[i].rm_so = sp - m->offp;
519			break;
520		case ORPAREN:
521			i = OPND(m->g->strip[ss]);
522			assert(0 < i && i <= m->g->nsub);
523			m->pmatch[i].rm_eo = sp - m->offp;
524			break;
525		default:		/* uh oh */
526			assert(nope);
527			break;
528		}
529	}
530
531	assert(sp == stop);
532	return(sp);
533}
534
535/*
536 - backref - figure out what matched what, figuring in back references
537 == static char *backref(struct match *m, char *start, \
538 ==	char *stop, sopno startst, sopno stopst, sopno lev);
539 */
540static char *			/* == stop (success) or NULL (failure) */
541backref(m, start, stop, startst, stopst, lev)
542struct match *m;
543char *start;
544char *stop;
545sopno startst;
546sopno stopst;
547sopno lev;			/* PLUS nesting level */
548{
549	int i;
550	sopno ss;		/* start sop of current subRE */
551	char *sp;		/* start of string matched by it */
552	sopno ssub;		/* start sop of subsubRE */
553	sopno esub;		/* end sop of subsubRE */
554	char *ssp;		/* start of string matched by subsubRE */
555	char *dp;
556	size_t len;
557	int hard;
558	sop s;
559	regoff_t offsave;
560	cset *cs;
561
562	AT("back", start, stop, startst, stopst);
563	sp = start;
564
565	/* get as far as we can with easy stuff */
566	hard = 0;
567	for (ss = startst; !hard && ss < stopst; ss++)
568		switch (OP(s = m->g->strip[ss])) {
569		case OCHAR:
570			if (sp == stop || *sp++ != (char)OPND(s))
571				return(NULL);
572			break;
573		case OANY:
574			if (sp == stop)
575				return(NULL);
576			sp++;
577			break;
578		case OANYOF:
579			cs = &m->g->sets[OPND(s)];
580			if (sp == stop || !CHIN(cs, *sp++))
581				return(NULL);
582			break;
583		case OBOL:
584			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
585					(sp < m->endp && *(sp-1) == '\n' &&
586						(m->g->cflags&REG_NEWLINE)) )
587				{ /* yes */ }
588			else
589				return(NULL);
590			break;
591		case OEOL:
592			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
593					(sp < m->endp && *sp == '\n' &&
594						(m->g->cflags&REG_NEWLINE)) )
595				{ /* yes */ }
596			else
597				return(NULL);
598			break;
599		case OBOW:
600			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
601					(sp < m->endp && *(sp-1) == '\n' &&
602						(m->g->cflags&REG_NEWLINE)) ||
603					(sp > m->beginp &&
604							!ISWORD(*(sp-1))) ) &&
605					(sp < m->endp && ISWORD(*sp)) )
606				{ /* yes */ }
607			else
608				return(NULL);
609			break;
610		case OEOW:
611			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
612					(sp < m->endp && *sp == '\n' &&
613						(m->g->cflags&REG_NEWLINE)) ||
614					(sp < m->endp && !ISWORD(*sp)) ) &&
615					(sp > m->beginp && ISWORD(*(sp-1))) )
616				{ /* yes */ }
617			else
618				return(NULL);
619			break;
620		case O_QUEST:
621			break;
622		case OOR1:	/* matches null but needs to skip */
623			ss++;
624			s = m->g->strip[ss];
625			do {
626				assert(OP(s) == OOR2);
627				ss += OPND(s);
628			} while (OP(s = m->g->strip[ss]) != O_CH);
629			/* note that the ss++ gets us past the O_CH */
630			break;
631		default:	/* have to make a choice */
632			hard = 1;
633			break;
634		}
635	if (!hard) {		/* that was it! */
636		if (sp != stop)
637			return(NULL);
638		return(sp);
639	}
640	ss--;			/* adjust for the for's final increment */
641
642	/* the hard stuff */
643	AT("hard", sp, stop, ss, stopst);
644	s = m->g->strip[ss];
645	switch (OP(s)) {
646	case OBACK_:		/* the vilest depths */
647		i = OPND(s);
648		assert(0 < i && i <= m->g->nsub);
649		if (m->pmatch[i].rm_eo == -1)
650			return(NULL);
651		assert(m->pmatch[i].rm_so != -1);
652		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
653		assert(stop - m->beginp >= len);
654		if (sp > stop - len)
655			return(NULL);	/* not enough left to match */
656		ssp = m->offp + m->pmatch[i].rm_so;
657		if (memcmp(sp, ssp, len) != 0)
658			return(NULL);
659		while (m->g->strip[ss] != SOP(O_BACK, i))
660			ss++;
661		return(backref(m, sp+len, stop, ss+1, stopst, lev));
662		break;
663	case OQUEST_:		/* to null or not */
664		dp = backref(m, sp, stop, ss+1, stopst, lev);
665		if (dp != NULL)
666			return(dp);	/* not */
667		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
668		break;
669	case OPLUS_:
670		assert(m->lastpos != NULL);
671		assert(lev+1 <= m->g->nplus);
672		m->lastpos[lev+1] = sp;
673		return(backref(m, sp, stop, ss+1, stopst, lev+1));
674		break;
675	case O_PLUS:
676		if (sp == m->lastpos[lev])	/* last pass matched null */
677			return(backref(m, sp, stop, ss+1, stopst, lev-1));
678		/* try another pass */
679		m->lastpos[lev] = sp;
680		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
681		if (dp == NULL)
682			return(backref(m, sp, stop, ss+1, stopst, lev-1));
683		else
684			return(dp);
685		break;
686	case OCH_:		/* find the right one, if any */
687		ssub = ss + 1;
688		esub = ss + OPND(s) - 1;
689		assert(OP(m->g->strip[esub]) == OOR1);
690		for (;;) {	/* find first matching branch */
691			dp = backref(m, sp, stop, ssub, esub, lev);
692			if (dp != NULL)
693				return(dp);
694			/* that one missed, try next one */
695			if (OP(m->g->strip[esub]) == O_CH)
696				return(NULL);	/* there is none */
697			esub++;
698			assert(OP(m->g->strip[esub]) == OOR2);
699			ssub = esub + 1;
700			esub += OPND(m->g->strip[esub]);
701			if (OP(m->g->strip[esub]) == OOR2)
702				esub--;
703			else
704				assert(OP(m->g->strip[esub]) == O_CH);
705		}
706		break;
707	case OLPAREN:		/* must undo assignment if rest fails */
708		i = OPND(s);
709		assert(0 < i && i <= m->g->nsub);
710		offsave = m->pmatch[i].rm_so;
711		m->pmatch[i].rm_so = sp - m->offp;
712		dp = backref(m, sp, stop, ss+1, stopst, lev);
713		if (dp != NULL)
714			return(dp);
715		m->pmatch[i].rm_so = offsave;
716		return(NULL);
717		break;
718	case ORPAREN:		/* must undo assignment if rest fails */
719		i = OPND(s);
720		assert(0 < i && i <= m->g->nsub);
721		offsave = m->pmatch[i].rm_eo;
722		m->pmatch[i].rm_eo = sp - m->offp;
723		dp = backref(m, sp, stop, ss+1, stopst, lev);
724		if (dp != NULL)
725			return(dp);
726		m->pmatch[i].rm_eo = offsave;
727		return(NULL);
728		break;
729	default:		/* uh oh */
730		assert(nope);
731		break;
732	}
733
734	/* "can't happen" */
735	assert(nope);
736	/* NOTREACHED */
737	return "shut up gcc";
738}
739
740/*
741 - fast - step through the string at top speed
742 == static char *fast(struct match *m, char *start, \
743 ==	char *stop, sopno startst, sopno stopst);
744 */
745static char *			/* where tentative match ended, or NULL */
746fast(m, start, stop, startst, stopst)
747struct match *m;
748char *start;
749char *stop;
750sopno startst;
751sopno stopst;
752{
753	states st = m->st;
754	states fresh = m->fresh;
755	states tmp = m->tmp;
756	char *p = start;
757	int c = (start == m->beginp) ? OUT : *(start-1);
758	int lastc;		/* previous c */
759	int flagch;
760	int i;
761	char *coldp;		/* last p after which no match was underway */
762
763	CLEAR(st);
764	SET1(st, startst);
765	st = step(m->g, startst, stopst, st, NOTHING, st);
766	ASSIGN(fresh, st);
767	SP("start", st, *p);
768	coldp = NULL;
769	for (;;) {
770		/* next character */
771		lastc = c;
772		c = (p == m->endp) ? OUT : *p;
773		if (EQ(st, fresh))
774			coldp = p;
775
776		/* is there an EOL and/or BOL between lastc and c? */
777		flagch = '\0';
778		i = 0;
779		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
780				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
781			flagch = BOL;
782			i = m->g->nbol;
783		}
784		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
785				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
786			flagch = (flagch == BOL) ? BOLEOL : EOL;
787			i += m->g->neol;
788		}
789		if (i != 0) {
790			for (; i > 0; i--)
791				st = step(m->g, startst, stopst, st, flagch, st);
792			SP("boleol", st, c);
793		}
794
795		/* how about a word boundary? */
796		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
797					(c != OUT && ISWORD(c)) ) {
798			flagch = BOW;
799		}
800		if ( (lastc != OUT && ISWORD(lastc)) &&
801				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
802			flagch = EOW;
803		}
804		if (flagch == BOW || flagch == EOW) {
805			st = step(m->g, startst, stopst, st, flagch, st);
806			SP("boweow", st, c);
807		}
808
809		/* are we done? */
810		if (ISSET(st, stopst) || p == stop)
811			break;		/* NOTE BREAK OUT */
812
813		/* no, we must deal with this character */
814		ASSIGN(tmp, st);
815		ASSIGN(st, fresh);
816		assert(c != OUT);
817		st = step(m->g, startst, stopst, tmp, c, st);
818		SP("aft", st, c);
819		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
820		p++;
821	}
822
823	assert(coldp != NULL);
824	m->coldp = coldp;
825	if (ISSET(st, stopst))
826		return(p+1);
827	else
828		return(NULL);
829}
830
831/*
832 - slow - step through the string more deliberately
833 == static char *slow(struct match *m, char *start, \
834 ==	char *stop, sopno startst, sopno stopst);
835 */
836static char *			/* where it ended */
837slow(m, start, stop, startst, stopst)
838struct match *m;
839char *start;
840char *stop;
841sopno startst;
842sopno stopst;
843{
844	states st = m->st;
845	states empty = m->empty;
846	states tmp = m->tmp;
847	char *p = start;
848	int c = (start == m->beginp) ? OUT : *(start-1);
849	int lastc;		/* previous c */
850	int flagch;
851	int i;
852	char *matchp;		/* last p at which a match ended */
853
854	AT("slow", start, stop, startst, stopst);
855	CLEAR(st);
856	SET1(st, startst);
857	SP("sstart", st, *p);
858	st = step(m->g, startst, stopst, st, NOTHING, st);
859	matchp = NULL;
860	for (;;) {
861		/* next character */
862		lastc = c;
863		c = (p == m->endp) ? OUT : *p;
864
865		/* is there an EOL and/or BOL between lastc and c? */
866		flagch = '\0';
867		i = 0;
868		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
869				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
870			flagch = BOL;
871			i = m->g->nbol;
872		}
873		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
874				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
875			flagch = (flagch == BOL) ? BOLEOL : EOL;
876			i += m->g->neol;
877		}
878		if (i != 0) {
879			for (; i > 0; i--)
880				st = step(m->g, startst, stopst, st, flagch, st);
881			SP("sboleol", st, c);
882		}
883
884		/* how about a word boundary? */
885		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
886					(c != OUT && ISWORD(c)) ) {
887			flagch = BOW;
888		}
889		if ( (lastc != OUT && ISWORD(lastc)) &&
890				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
891			flagch = EOW;
892		}
893		if (flagch == BOW || flagch == EOW) {
894			st = step(m->g, startst, stopst, st, flagch, st);
895			SP("sboweow", st, c);
896		}
897
898		/* are we done? */
899		if (ISSET(st, stopst))
900			matchp = p;
901		if (EQ(st, empty) || p == stop)
902			break;		/* NOTE BREAK OUT */
903
904		/* no, we must deal with this character */
905		ASSIGN(tmp, st);
906		ASSIGN(st, empty);
907		assert(c != OUT);
908		st = step(m->g, startst, stopst, tmp, c, st);
909		SP("saft", st, c);
910		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
911		p++;
912	}
913
914	return(matchp);
915}
916
917
918/*
919 - step - map set of states reachable before char to set reachable after
920 == static states step(struct re_guts *g, sopno start, sopno stop, \
921 ==	states bef, int ch, states aft);
922 == #define	BOL	(OUT+1)
923 == #define	EOL	(BOL+1)
924 == #define	BOLEOL	(BOL+2)
925 == #define	NOTHING	(BOL+3)
926 == #define	BOW	(BOL+4)
927 == #define	EOW	(BOL+5)
928 == #define	CODEMAX	(BOL+5)		// highest code used
929 == #define	NONCHAR(c)	((c) > CHAR_MAX)
930 == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
931 */
932static states
933step(g, start, stop, bef, ch, aft)
934struct re_guts *g;
935sopno start;			/* start state within strip */
936sopno stop;			/* state after stop state within strip */
937states bef;			/* states reachable before */
938int ch;				/* character or NONCHAR code */
939states aft;			/* states already known reachable after */
940{
941	cset *cs;
942	sop s;
943	sopno pc;
944	onestate here;		/* note, macros know this name */
945	sopno look;
946	int i;
947
948	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
949		s = g->strip[pc];
950		switch (OP(s)) {
951		case OEND:
952			assert(pc == stop-1);
953			break;
954		case OCHAR:
955			/* only characters can match */
956			assert(!NONCHAR(ch) || ch != (char)OPND(s));
957			if (ch == (char)OPND(s))
958				FWD(aft, bef, 1);
959			break;
960		case OBOL:
961			if (ch == BOL || ch == BOLEOL)
962				FWD(aft, bef, 1);
963			break;
964		case OEOL:
965			if (ch == EOL || ch == BOLEOL)
966				FWD(aft, bef, 1);
967			break;
968		case OBOW:
969			if (ch == BOW)
970				FWD(aft, bef, 1);
971			break;
972		case OEOW:
973			if (ch == EOW)
974				FWD(aft, bef, 1);
975			break;
976		case OANY:
977			if (!NONCHAR(ch))
978				FWD(aft, bef, 1);
979			break;
980		case OANYOF:
981			cs = &g->sets[OPND(s)];
982			if (!NONCHAR(ch) && CHIN(cs, ch))
983				FWD(aft, bef, 1);
984			break;
985		case OBACK_:		/* ignored here */
986		case O_BACK:
987			FWD(aft, aft, 1);
988			break;
989		case OPLUS_:		/* forward, this is just an empty */
990			FWD(aft, aft, 1);
991			break;
992		case O_PLUS:		/* both forward and back */
993			FWD(aft, aft, 1);
994			i = ISSETBACK(aft, OPND(s));
995			BACK(aft, aft, OPND(s));
996			if (!i && ISSETBACK(aft, OPND(s))) {
997				/* oho, must reconsider loop body */
998				pc -= OPND(s) + 1;
999				INIT(here, pc);
1000			}
1001			break;
1002		case OQUEST_:		/* two branches, both forward */
1003			FWD(aft, aft, 1);
1004			FWD(aft, aft, OPND(s));
1005			break;
1006		case O_QUEST:		/* just an empty */
1007			FWD(aft, aft, 1);
1008			break;
1009		case OLPAREN:		/* not significant here */
1010		case ORPAREN:
1011			FWD(aft, aft, 1);
1012			break;
1013		case OCH_:		/* mark the first two branches */
1014			FWD(aft, aft, 1);
1015			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1016			FWD(aft, aft, OPND(s));
1017			break;
1018		case OOR1:		/* done a branch, find the O_CH */
1019			if (ISSTATEIN(aft, here)) {
1020				for (look = 1;
1021						OP(s = g->strip[pc+look]) != O_CH;
1022						look += OPND(s))
1023					assert(OP(s) == OOR2);
1024				FWD(aft, aft, look);
1025			}
1026			break;
1027		case OOR2:		/* propagate OCH_'s marking */
1028			FWD(aft, aft, 1);
1029			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1030				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1031				FWD(aft, aft, OPND(s));
1032			}
1033			break;
1034		case O_CH:		/* just empty */
1035			FWD(aft, aft, 1);
1036			break;
1037		default:		/* ooooops... */
1038			assert(nope);
1039			break;
1040		}
1041	}
1042
1043	return(aft);
1044}
1045
1046#ifdef REDEBUG
1047/*
1048 - print - print a set of states
1049 == #ifdef REDEBUG
1050 == static void print(struct match *m, char *caption, states st, \
1051 ==	int ch, FILE *d);
1052 == #endif
1053 */
1054static void
1055print(m, caption, st, ch, d)
1056struct match *m;
1057char *caption;
1058states st;
1059int ch;
1060FILE *d;
1061{
1062	struct re_guts *g = m->g;
1063	int i;
1064	int first = 1;
1065
1066	if (!(m->eflags&REG_TRACE))
1067		return;
1068
1069	fprintf(d, "%s", caption);
1070	if (ch != '\0')
1071		fprintf(d, " %s", pchar(ch));
1072	for (i = 0; i < g->nstates; i++)
1073		if (ISSET(st, i)) {
1074			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1075			first = 0;
1076		}
1077	fprintf(d, "\n");
1078}
1079
1080/*
1081 - at - print current situation
1082 == #ifdef REDEBUG
1083 == static void at(struct match *m, char *title, char *start, char *stop, \
1084 ==						sopno startst, sopno stopst);
1085 == #endif
1086 */
1087static void
1088at(m, title, start, stop, startst, stopst)
1089struct match *m;
1090char *title;
1091char *start;
1092char *stop;
1093sopno startst;
1094sopno stopst;
1095{
1096	if (!(m->eflags&REG_TRACE))
1097		return;
1098
1099	printf("%s %s-", title, pchar(*start));
1100	printf("%s ", pchar(*stop));
1101	printf("%ld-%ld\n", (long)startst, (long)stopst);
1102}
1103
1104#ifndef PCHARDONE
1105#define	PCHARDONE	/* never again */
1106/*
1107 - pchar - make a character printable
1108 == #ifdef REDEBUG
1109 == static char *pchar(int ch);
1110 == #endif
1111 *
1112 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1113 * duplicate here avoids having a debugging-capable regexec.o tied to
1114 * a matching debug.o, and this is convenient.  It all disappears in
1115 * the non-debug compilation anyway, so it doesn't matter much.
1116 */
1117static char *			/* -> representation */
1118pchar(ch)
1119int ch;
1120{
1121	static char pbuf[10];
1122
1123	if (isprint((uch)ch) || ch == ' ')
1124		sprintf(pbuf, "%c", ch);
1125	else
1126		sprintf(pbuf, "\\%o", ch);
1127	return(pbuf);
1128}
1129#endif
1130#endif
1131
1132#undef	matcher
1133#undef	fast
1134#undef	slow
1135#undef	dissect
1136#undef	backref
1137#undef	step
1138#undef	print
1139#undef	at
1140#undef	match
1141