1/*	$OpenBSD: engine.c,v 1.26 2020/12/28 21:41:55 millert Exp $	*/
2
3/*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
36 */
37
38/*
39 * The matching engine and friends.  This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
42 * of code.
43 */
44
45#ifdef SNAMES
46#define	matcher	smatcher
47#define	fast	sfast
48#define	slow	sslow
49#define	dissect	sdissect
50#define	backref	sbackref
51#define	step	sstep
52#define	print	sprint
53#define	at	sat
54#define	match	smat
55#define	nope	snope
56#endif
57#ifdef LNAMES
58#define	matcher	lmatcher
59#define	fast	lfast
60#define	slow	lslow
61#define	dissect	ldissect
62#define	backref	lbackref
63#define	step	lstep
64#define	print	lprint
65#define	at	lat
66#define	match	lmat
67#define	nope	lnope
68#endif
69
70/* another structure passed up and down to avoid zillions of parameters */
71struct match {
72	struct re_guts *g;
73	int eflags;
74	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
75	const char *offp;	/* offsets work from here */
76	const char *beginp;	/* start of string -- virtual NUL precedes */
77	const char *endp;	/* end of string -- virtual NUL here */
78	const char *coldp;	/* can be no match starting before here */
79	const char **lastpos;	/* [nplus+1] */
80	STATEVARS;
81	states st;		/* current states */
82	states fresh;		/* states for a fresh start */
83	states tmp;		/* temporary */
84	states empty;		/* empty set of states */
85};
86
87static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int);
88static const char *dissect(struct match *, const char *, const char *, sopno,
89    sopno);
90static const char *backref(struct match *, const char *, const char *, sopno,
91    sopno, sopno, int);
92static const char *fast(struct match *, const char *, const char *, sopno,
93    sopno);
94static const char *slow(struct match *, const char *, const char *, sopno,
95    sopno);
96static states step(struct re_guts *, sopno, sopno, states, int, states);
97#define MAX_RECURSION	100
98#define	BOL	(OUT+1)
99#define	EOL	(BOL+1)
100#define	BOLEOL	(BOL+2)
101#define	NOTHING	(BOL+3)
102#define	BOW	(BOL+4)
103#define	EOW	(BOL+5)
104/* update nonchars[] array below when adding fake chars here */
105#define	CODEMAX	(BOL+5)		/* highest code used */
106#define	NONCHAR(c)	((c) > CHAR_MAX)
107#define	NNONCHAR	(CODEMAX-CHAR_MAX)
108#ifdef REDEBUG
109static void print(struct match *, const char *, states, int, FILE *);
110#endif
111#ifdef REDEBUG
112static void at(struct match *, const char *, const char *, const char *,
113    sopno, sopno);
114#endif
115#ifdef REDEBUG
116static const char *pchar(int);
117#endif
118
119#ifdef REDEBUG
120#define	SP(t, s, c)	print(m, t, s, c, stdout)
121#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
122#define	NOTE(str)	{ if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
123static int nope = 0;
124#else
125#define	SP(t, s, c)	/* nothing */
126#define	AT(t, p1, p2, s1, s2)	/* nothing */
127#define	NOTE(s)	/* nothing */
128#endif
129
130/*
131 - matcher - the actual matching engine
132 */
133static int			/* 0 success, REG_NOMATCH failure */
134matcher(struct re_guts *g, const char *string, size_t nmatch,
135    regmatch_t pmatch[], int eflags)
136{
137	const char *endp;
138	int i;
139	struct match mv;
140	struct match *m = &mv;
141	const char *dp;
142	const sopno gf = g->firststate+1;	/* +1 for OEND */
143	const sopno gl = g->laststate;
144	const char *start;
145	const char *stop;
146
147	/* simplify the situation where possible */
148	if (g->cflags&REG_NOSUB)
149		nmatch = 0;
150	if (eflags&REG_STARTEND) {
151		start = string + pmatch[0].rm_so;
152		stop = string + pmatch[0].rm_eo;
153	} else {
154		start = string;
155		stop = start + strlen(start);
156	}
157	if (stop < start)
158		return(REG_INVARG);
159
160	/* prescreening; this does wonders for this rather slow code */
161	if (g->must != NULL) {
162		for (dp = start; dp < stop; dp++)
163			if (*dp == g->must[0] && stop - dp >= g->mlen &&
164			    memcmp(dp, g->must, g->mlen) == 0)
165				break;
166		if (dp == stop)		/* we didn't find g->must */
167			return(REG_NOMATCH);
168	}
169
170	/* match struct setup */
171	m->g = g;
172	m->eflags = eflags;
173	m->pmatch = NULL;
174	m->lastpos = NULL;
175	m->offp = string;
176	m->beginp = start;
177	m->endp = stop;
178	STATESETUP(m, 4);
179	SETUP(m->st);
180	SETUP(m->fresh);
181	SETUP(m->tmp);
182	SETUP(m->empty);
183	CLEAR(m->empty);
184
185	/* this loop does only one repetition except for backrefs */
186	for (;;) {
187		endp = fast(m, start, stop, gf, gl);
188		if (endp == NULL) {		/* a miss */
189			free(m->pmatch);
190			free(m->lastpos);
191			STATETEARDOWN(m);
192			return(REG_NOMATCH);
193		}
194		if (nmatch == 0 && !g->backrefs)
195			break;		/* no further info needed */
196
197		/* where? */
198		assert(m->coldp != NULL);
199		for (;;) {
200			NOTE("finding start");
201			endp = slow(m, m->coldp, stop, gf, gl);
202			if (endp != NULL)
203				break;
204			assert(m->coldp < m->endp);
205			m->coldp++;
206		}
207		if (nmatch == 1 && !g->backrefs)
208			break;		/* no further info needed */
209
210		/* oh my, he wants the subexpressions... */
211		if (m->pmatch == NULL)
212			m->pmatch = reallocarray(NULL, m->g->nsub + 1,
213			    sizeof(regmatch_t));
214		if (m->pmatch == NULL) {
215			STATETEARDOWN(m);
216			return(REG_ESPACE);
217		}
218		for (i = 1; i <= m->g->nsub; i++)
219			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
220		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
221			NOTE("dissecting");
222			dp = dissect(m, m->coldp, endp, gf, gl);
223		} else {
224			if (g->nplus > 0 && m->lastpos == NULL)
225				m->lastpos = reallocarray(NULL,
226				    g->nplus+1, sizeof(char *));
227			if (g->nplus > 0 && m->lastpos == NULL) {
228				free(m->pmatch);
229				STATETEARDOWN(m);
230				return(REG_ESPACE);
231			}
232			NOTE("backref dissect");
233			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
234		}
235		if (dp != NULL)
236			break;
237
238		/* uh-oh... we couldn't find a subexpression-level match */
239		assert(g->backrefs);	/* must be back references doing it */
240		assert(g->nplus == 0 || m->lastpos != NULL);
241		for (;;) {
242			if (dp != NULL || endp <= m->coldp)
243				break;		/* defeat */
244			NOTE("backoff");
245			endp = slow(m, m->coldp, endp-1, gf, gl);
246			if (endp == NULL)
247				break;		/* defeat */
248			/* try it on a shorter possibility */
249#ifndef NDEBUG
250			for (i = 1; i <= m->g->nsub; i++) {
251				assert(m->pmatch[i].rm_so == -1);
252				assert(m->pmatch[i].rm_eo == -1);
253			}
254#endif
255			NOTE("backoff dissect");
256			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
257		}
258		assert(dp == NULL || dp == endp);
259		if (dp != NULL)		/* found a shorter one */
260			break;
261
262		/* despite initial appearances, there is no match here */
263		NOTE("false alarm");
264		if (m->coldp == stop)
265			break;
266		start = m->coldp + 1;	/* recycle starting later */
267	}
268
269	/* fill in the details if requested */
270	if (nmatch > 0) {
271		pmatch[0].rm_so = m->coldp - m->offp;
272		pmatch[0].rm_eo = endp - m->offp;
273	}
274	if (nmatch > 1) {
275		assert(m->pmatch != NULL);
276		for (i = 1; i < nmatch; i++)
277			if (i <= m->g->nsub)
278				pmatch[i] = m->pmatch[i];
279			else {
280				pmatch[i].rm_so = -1;
281				pmatch[i].rm_eo = -1;
282			}
283	}
284
285	free(m->pmatch);
286	free(m->lastpos);
287	STATETEARDOWN(m);
288	return(0);
289}
290
291/*
292 - dissect - figure out what matched what, no back references
293 */
294static const char *		/* == stop (success) always */
295dissect(struct match *m, const char *start, const char *stop, sopno startst,
296    sopno stopst)
297{
298	int i;
299	sopno ss;		/* start sop of current subRE */
300	sopno es;		/* end sop of current subRE */
301	const char *sp;		/* start of string matched by it */
302	const char *stp;	/* string matched by it cannot pass here */
303	const char *rest;	/* start of rest of string */
304	const char *tail;	/* string unmatched by rest of RE */
305	sopno ssub;		/* start sop of subsubRE */
306	sopno esub;		/* end sop of subsubRE */
307	const char *ssp;	/* start of string matched by subsubRE */
308	const char *sep;	/* end of string matched by subsubRE */
309	const char *oldssp;	/* previous ssp */
310	const char *dp;
311
312	AT("diss", start, stop, startst, stopst);
313	sp = start;
314	for (ss = startst; ss < stopst; ss = es) {
315		/* identify end of subRE */
316		es = ss;
317		switch (OP(m->g->strip[es])) {
318		case OPLUS_:
319		case OQUEST_:
320			es += OPND(m->g->strip[es]);
321			break;
322		case OCH_:
323			while (OP(m->g->strip[es]) != O_CH)
324				es += OPND(m->g->strip[es]);
325			break;
326		}
327		es++;
328
329		/* figure out what it matched */
330		switch (OP(m->g->strip[ss])) {
331		case OEND:
332			assert(nope);
333			break;
334		case OCHAR:
335			sp++;
336			break;
337		case OBOL:
338		case OEOL:
339		case OBOW:
340		case OEOW:
341			break;
342		case OANY:
343		case OANYOF:
344			sp++;
345			break;
346		case OBACK_:
347		case O_BACK:
348			assert(nope);
349			break;
350		/* cases where length of match is hard to find */
351		case OQUEST_:
352			stp = stop;
353			for (;;) {
354				/* how long could this one be? */
355				rest = slow(m, sp, stp, ss, es);
356				assert(rest != NULL);	/* it did match */
357				/* could the rest match the rest? */
358				tail = slow(m, rest, stop, es, stopst);
359				if (tail == stop)
360					break;		/* yes! */
361				/* no -- try a shorter match for this one */
362				stp = rest - 1;
363				assert(stp >= sp);	/* it did work */
364			}
365			ssub = ss + 1;
366			esub = es - 1;
367			/* did innards match? */
368			if (slow(m, sp, rest, ssub, esub) != NULL) {
369				dp = dissect(m, sp, rest, ssub, esub);
370				assert(dp == rest);
371			} else		/* no */
372				assert(sp == rest);
373			sp = rest;
374			break;
375		case OPLUS_:
376			stp = stop;
377			for (;;) {
378				/* how long could this one be? */
379				rest = slow(m, sp, stp, ss, es);
380				assert(rest != NULL);	/* it did match */
381				/* could the rest match the rest? */
382				tail = slow(m, rest, stop, es, stopst);
383				if (tail == stop)
384					break;		/* yes! */
385				/* no -- try a shorter match for this one */
386				stp = rest - 1;
387				assert(stp >= sp);	/* it did work */
388			}
389			ssub = ss + 1;
390			esub = es - 1;
391			ssp = sp;
392			oldssp = ssp;
393			for (;;) {	/* find last match of innards */
394				sep = slow(m, ssp, rest, ssub, esub);
395				if (sep == NULL || sep == ssp)
396					break;	/* failed or matched null */
397				oldssp = ssp;	/* on to next try */
398				ssp = sep;
399			}
400			if (sep == NULL) {
401				/* last successful match */
402				sep = ssp;
403				ssp = oldssp;
404			}
405			assert(sep == rest);	/* must exhaust substring */
406			assert(slow(m, ssp, sep, ssub, esub) == rest);
407			dp = dissect(m, ssp, sep, ssub, esub);
408			assert(dp == sep);
409			sp = rest;
410			break;
411		case OCH_:
412			stp = stop;
413			for (;;) {
414				/* how long could this one be? */
415				rest = slow(m, sp, stp, ss, es);
416				assert(rest != NULL);	/* it did match */
417				/* could the rest match the rest? */
418				tail = slow(m, rest, stop, es, stopst);
419				if (tail == stop)
420					break;		/* yes! */
421				/* no -- try a shorter match for this one */
422				stp = rest - 1;
423				assert(stp >= sp);	/* it did work */
424			}
425			ssub = ss + 1;
426			esub = ss + OPND(m->g->strip[ss]) - 1;
427			assert(OP(m->g->strip[esub]) == OOR1);
428			for (;;) {	/* find first matching branch */
429				if (slow(m, sp, rest, ssub, esub) == rest)
430					break;	/* it matched all of it */
431				/* that one missed, try next one */
432				assert(OP(m->g->strip[esub]) == OOR1);
433				esub++;
434				assert(OP(m->g->strip[esub]) == OOR2);
435				ssub = esub + 1;
436				esub += OPND(m->g->strip[esub]);
437				if (OP(m->g->strip[esub]) == OOR2)
438					esub--;
439				else
440					assert(OP(m->g->strip[esub]) == O_CH);
441			}
442			dp = dissect(m, sp, rest, ssub, esub);
443			assert(dp == rest);
444			sp = rest;
445			break;
446		case O_PLUS:
447		case O_QUEST:
448		case OOR1:
449		case OOR2:
450		case O_CH:
451			assert(nope);
452			break;
453		case OLPAREN:
454			i = OPND(m->g->strip[ss]);
455			assert(0 < i && i <= m->g->nsub);
456			m->pmatch[i].rm_so = sp - m->offp;
457			break;
458		case ORPAREN:
459			i = OPND(m->g->strip[ss]);
460			assert(0 < i && i <= m->g->nsub);
461			m->pmatch[i].rm_eo = sp - m->offp;
462			break;
463		default:		/* uh oh */
464			assert(nope);
465			break;
466		}
467	}
468
469	assert(sp == stop);
470	return(sp);
471}
472
473/*
474 - backref - figure out what matched what, figuring in back references
475 */
476static const char *		/* == stop (success) or NULL (failure) */
477backref(struct match *m, const char *start, const char *stop, sopno startst,
478    sopno stopst, sopno lev, int rec)			/* PLUS nesting level */
479{
480	int i;
481	sopno ss;	/* start sop of current subRE */
482	const char *sp;	/* start of string matched by it */
483	sopno ssub;	/* start sop of subsubRE */
484	sopno esub;	/* end sop of subsubRE */
485	const char *ssp;/* start of string matched by subsubRE */
486	const char *dp;
487	size_t len;
488	int hard;
489	sop s;
490	regoff_t offsave;
491	cset *cs;
492
493	AT("back", start, stop, startst, stopst);
494	sp = start;
495
496	/* get as far as we can with easy stuff */
497	hard = 0;
498	for (ss = startst; !hard && ss < stopst; ss++)
499		switch (OP(s = m->g->strip[ss])) {
500		case OCHAR:
501			if (sp == stop || *sp++ != (char)OPND(s))
502				return(NULL);
503			break;
504		case OANY:
505			if (sp == stop)
506				return(NULL);
507			sp++;
508			break;
509		case OANYOF:
510			cs = &m->g->sets[OPND(s)];
511			if (sp == stop || !CHIN(cs, *sp++))
512				return(NULL);
513			break;
514		case OBOL:
515			if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
516			    (sp > m->offp && sp < m->endp &&
517			     *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
518				{ /* yes */ }
519			else
520				return(NULL);
521			break;
522		case OEOL:
523			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
524					(sp < m->endp && *sp == '\n' &&
525						(m->g->cflags&REG_NEWLINE)) )
526				{ /* yes */ }
527			else
528				return(NULL);
529			break;
530		case OBOW:
531			if (sp < m->endp && ISWORD(*sp) &&
532			    ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
533			     (sp > m->offp && !ISWORD(*(sp-1)))))
534				{ /* yes */ }
535			else
536				return(NULL);
537			break;
538		case OEOW:
539			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
540					(sp < m->endp && *sp == '\n' &&
541						(m->g->cflags&REG_NEWLINE)) ||
542					(sp < m->endp && !ISWORD(*sp)) ) &&
543					(sp > m->beginp && ISWORD(*(sp-1))) )
544				{ /* yes */ }
545			else
546				return(NULL);
547			break;
548		case O_QUEST:
549			break;
550		case OOR1:	/* matches null but needs to skip */
551			ss++;
552			s = m->g->strip[ss];
553			do {
554				assert(OP(s) == OOR2);
555				ss += OPND(s);
556			} while (OP(s = m->g->strip[ss]) != O_CH);
557			/* note that the ss++ gets us past the O_CH */
558			break;
559		default:	/* have to make a choice */
560			hard = 1;
561			break;
562		}
563	if (!hard) {		/* that was it! */
564		if (sp != stop)
565			return(NULL);
566		return(sp);
567	}
568	ss--;			/* adjust for the for's final increment */
569
570	/* the hard stuff */
571	AT("hard", sp, stop, ss, stopst);
572	s = m->g->strip[ss];
573	switch (OP(s)) {
574	case OBACK_:		/* the vilest depths */
575		i = OPND(s);
576		assert(0 < i && i <= m->g->nsub);
577		if (m->pmatch[i].rm_eo == -1)
578			return(NULL);
579		assert(m->pmatch[i].rm_so != -1);
580		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
581		if (len == 0 && rec++ > MAX_RECURSION)
582			return(NULL);
583		assert(stop - m->beginp >= len);
584		if (sp > stop - len)
585			return(NULL);	/* not enough left to match */
586		ssp = m->offp + m->pmatch[i].rm_so;
587		if (memcmp(sp, ssp, len) != 0)
588			return(NULL);
589		while (m->g->strip[ss] != SOP(O_BACK, i))
590			ss++;
591		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
592		break;
593	case OQUEST_:		/* to null or not */
594		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
595		if (dp != NULL)
596			return(dp);	/* not */
597		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
598		break;
599	case OPLUS_:
600		assert(m->lastpos != NULL);
601		assert(lev+1 <= m->g->nplus);
602		m->lastpos[lev+1] = sp;
603		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
604		break;
605	case O_PLUS:
606		if (sp == m->lastpos[lev])	/* last pass matched null */
607			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
608		/* try another pass */
609		m->lastpos[lev] = sp;
610		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
611		if (dp == NULL)
612			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
613		else
614			return(dp);
615		break;
616	case OCH_:		/* find the right one, if any */
617		ssub = ss + 1;
618		esub = ss + OPND(s) - 1;
619		assert(OP(m->g->strip[esub]) == OOR1);
620		for (;;) {	/* find first matching branch */
621			dp = backref(m, sp, stop, ssub, esub, lev, rec);
622			if (dp != NULL)
623				return(dp);
624			/* that one missed, try next one */
625			if (OP(m->g->strip[esub]) == O_CH)
626				return(NULL);	/* there is none */
627			esub++;
628			assert(OP(m->g->strip[esub]) == OOR2);
629			ssub = esub + 1;
630			esub += OPND(m->g->strip[esub]);
631			if (OP(m->g->strip[esub]) == OOR2)
632				esub--;
633			else
634				assert(OP(m->g->strip[esub]) == O_CH);
635		}
636		break;
637	case OLPAREN:		/* must undo assignment if rest fails */
638		i = OPND(s);
639		assert(0 < i && i <= m->g->nsub);
640		offsave = m->pmatch[i].rm_so;
641		m->pmatch[i].rm_so = sp - m->offp;
642		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
643		if (dp != NULL)
644			return(dp);
645		m->pmatch[i].rm_so = offsave;
646		return(NULL);
647		break;
648	case ORPAREN:		/* must undo assignment if rest fails */
649		i = OPND(s);
650		assert(0 < i && i <= m->g->nsub);
651		offsave = m->pmatch[i].rm_eo;
652		m->pmatch[i].rm_eo = sp - m->offp;
653		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
654		if (dp != NULL)
655			return(dp);
656		m->pmatch[i].rm_eo = offsave;
657		return(NULL);
658		break;
659	default:		/* uh oh */
660		assert(nope);
661		break;
662	}
663
664	/* "can't happen" */
665	assert(nope);
666	/* NOTREACHED */
667	return NULL;
668}
669
670/*
671 - fast - step through the string at top speed
672 */
673static const char *			/* where tentative match ended, or NULL */
674fast(struct match *m, const char *start, const char *stop, sopno startst,
675    sopno stopst)
676{
677	states st = m->st;
678	states fresh = m->fresh;
679	states tmp = m->tmp;
680	const char *p = start;
681	int c;
682	int lastc;		/* previous c */
683	int flagch;
684	int i;
685	const char *coldp;	/* last p after which no match was underway */
686
687	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
688		c = OUT;
689	else
690		c = *(start-1);
691
692	CLEAR(st);
693	SET1(st, startst);
694	st = step(m->g, startst, stopst, st, NOTHING, st);
695	ASSIGN(fresh, st);
696	SP("start", st, *p);
697	coldp = NULL;
698	for (;;) {
699		/* next character */
700		lastc = c;
701		c = (p == m->endp) ? OUT : *p;
702		if (EQ(st, fresh))
703			coldp = p;
704
705		/* is there an EOL and/or BOL between lastc and c? */
706		flagch = '\0';
707		i = 0;
708		if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
709		    (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
710			flagch = BOL;
711			i = m->g->nbol;
712		}
713		if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
714		    (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
715			flagch = (flagch == BOL) ? BOLEOL : EOL;
716			i += m->g->neol;
717		}
718		if (i != 0) {
719			for (; i > 0; i--)
720				st = step(m->g, startst, stopst,
721				    st, flagch, st);
722			SP("boleol", st, c);
723		}
724
725		/* how about a word boundary? */
726		if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
727		    (c != OUT && ISWORD(c)))
728			flagch = BOW;
729		if ((lastc != OUT && ISWORD(lastc)) &&
730		    (flagch == EOL || (c != OUT && !ISWORD(c))))
731			flagch = EOW;
732		if (flagch == BOW || flagch == EOW) {
733			st = step(m->g, startst, stopst, st, flagch, st);
734			SP("boweow", st, c);
735		}
736
737		/* are we done? */
738		if (ISSET(st, stopst) || p == stop)
739			break;		/* NOTE BREAK OUT */
740
741		/* no, we must deal with this character */
742		ASSIGN(tmp, st);
743		ASSIGN(st, fresh);
744		assert(c != OUT);
745		st = step(m->g, startst, stopst, tmp, c, st);
746		SP("aft", st, c);
747		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
748		p++;
749	}
750
751	assert(coldp != NULL);
752	m->coldp = coldp;
753	if (ISSET(st, stopst))
754		return(p+1);
755	else
756		return(NULL);
757}
758
759/*
760 - slow - step through the string more deliberately
761 */
762static const char *		/* where it ended */
763slow(struct match *m, const char *start, const char *stop, sopno startst,
764    sopno stopst)
765{
766	states st = m->st;
767	states empty = m->empty;
768	states tmp = m->tmp;
769	const char *p = start;
770	int c;
771	int lastc;		/* previous c */
772	int flagch;
773	int i;
774	const char *matchp;	/* last p at which a match ended */
775
776	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
777		c = OUT;
778	else
779		c = *(start-1);
780
781	AT("slow", start, stop, startst, stopst);
782	CLEAR(st);
783	SET1(st, startst);
784	SP("sstart", st, *p);
785	st = step(m->g, startst, stopst, st, NOTHING, st);
786	matchp = NULL;
787	for (;;) {
788		/* next character */
789		lastc = c;
790		c = (p == m->endp) ? OUT : *p;
791
792		/* is there an EOL and/or BOL between lastc and c? */
793		flagch = '\0';
794		i = 0;
795		if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
796		    (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
797			flagch = BOL;
798			i = m->g->nbol;
799		}
800		if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
801		    (c == OUT && !(m->eflags&REG_NOTEOL))) {
802			flagch = (flagch == BOL) ? BOLEOL : EOL;
803			i += m->g->neol;
804		}
805		if (i != 0) {
806			for (; i > 0; i--)
807				st = step(m->g, startst, stopst,
808				    st, flagch, st);
809			SP("sboleol", st, c);
810		}
811
812		/* how about a word boundary? */
813		if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
814		    (c != OUT && ISWORD(c)))
815			flagch = BOW;
816		if ((lastc != OUT && ISWORD(lastc)) &&
817		    (flagch == EOL || (c != OUT && !ISWORD(c))))
818			flagch = EOW;
819		if (flagch == BOW || flagch == EOW) {
820			st = step(m->g, startst, stopst, st, flagch, st);
821			SP("sboweow", st, c);
822		}
823
824		/* are we done? */
825		if (ISSET(st, stopst))
826			matchp = p;
827		if (EQ(st, empty) || p == stop)
828			break;		/* NOTE BREAK OUT */
829
830		/* no, we must deal with this character */
831		ASSIGN(tmp, st);
832		ASSIGN(st, empty);
833		assert(c != OUT);
834		st = step(m->g, startst, stopst, tmp, c, st);
835		SP("saft", st, c);
836		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
837		p++;
838	}
839
840	return(matchp);
841}
842
843
844/*
845 - step - map set of states reachable before char to set reachable after
846 */
847static states
848step(struct re_guts *g,
849    sopno start,		/* start state within strip */
850    sopno stop,			/* state after stop state within strip */
851    states bef,			/* states reachable before */
852    int ch,			/* character or NONCHAR code */
853    states aft)			/* states already known reachable after */
854{
855	cset *cs;
856	sop s;
857	sopno pc;
858	onestate here;		/* note, macros know this name */
859	sopno look;
860	int i;
861
862	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
863		s = g->strip[pc];
864		switch (OP(s)) {
865		case OEND:
866			assert(pc == stop-1);
867			break;
868		case OCHAR:
869			/* only characters can match */
870			assert(!NONCHAR(ch) || ch != (char)OPND(s));
871			if (ch == (char)OPND(s))
872				FWD(aft, bef, 1);
873			break;
874		case OBOL:
875			if (ch == BOL || ch == BOLEOL)
876				FWD(aft, bef, 1);
877			break;
878		case OEOL:
879			if (ch == EOL || ch == BOLEOL)
880				FWD(aft, bef, 1);
881			break;
882		case OBOW:
883			if (ch == BOW)
884				FWD(aft, bef, 1);
885			break;
886		case OEOW:
887			if (ch == EOW)
888				FWD(aft, bef, 1);
889			break;
890		case OANY:
891			if (!NONCHAR(ch))
892				FWD(aft, bef, 1);
893			break;
894		case OANYOF:
895			cs = &g->sets[OPND(s)];
896			if (!NONCHAR(ch) && CHIN(cs, ch))
897				FWD(aft, bef, 1);
898			break;
899		case OBACK_:		/* ignored here */
900		case O_BACK:
901			FWD(aft, aft, 1);
902			break;
903		case OPLUS_:		/* forward, this is just an empty */
904			FWD(aft, aft, 1);
905			break;
906		case O_PLUS:		/* both forward and back */
907			FWD(aft, aft, 1);
908			i = ISSETBACK(aft, OPND(s));
909			BACK(aft, aft, OPND(s));
910			if (!i && ISSETBACK(aft, OPND(s))) {
911				/* oho, must reconsider loop body */
912				pc -= OPND(s) + 1;
913				INIT(here, pc);
914			}
915			break;
916		case OQUEST_:		/* two branches, both forward */
917			FWD(aft, aft, 1);
918			FWD(aft, aft, OPND(s));
919			break;
920		case O_QUEST:		/* just an empty */
921			FWD(aft, aft, 1);
922			break;
923		case OLPAREN:		/* not significant here */
924		case ORPAREN:
925			FWD(aft, aft, 1);
926			break;
927		case OCH_:		/* mark the first two branches */
928			FWD(aft, aft, 1);
929			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
930			FWD(aft, aft, OPND(s));
931			break;
932		case OOR1:		/* done a branch, find the O_CH */
933			if (ISSTATEIN(aft, here)) {
934				for (look = 1;
935				    OP(s = g->strip[pc+look]) != O_CH;
936				    look += OPND(s))
937					assert(OP(s) == OOR2);
938				FWD(aft, aft, look + 1);
939			}
940			break;
941		case OOR2:		/* propagate OCH_'s marking */
942			FWD(aft, aft, 1);
943			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
944				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
945				FWD(aft, aft, OPND(s));
946			}
947			break;
948		case O_CH:		/* just empty */
949			FWD(aft, aft, 1);
950			break;
951		default:		/* ooooops... */
952			assert(nope);
953			break;
954		}
955	}
956
957	return(aft);
958}
959
960#ifdef REDEBUG
961/*
962 - print - print a set of states
963 */
964static void
965print(struct match *m, const char *caption, states st, int ch, FILE *d)
966{
967	struct re_guts *g = m->g;
968	int i;
969	int first = 1;
970
971	if (!(m->eflags&REG_TRACE))
972		return;
973
974	(void)fprintf(d, "%s", caption);
975	(void)fprintf(d, " %s", pchar(ch));
976	for (i = 0; i < g->nstates; i++) {
977		if (ISSET(st, i)) {
978			(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
979			first = 0;
980		}
981	}
982	(void)fprintf(d, "\n");
983}
984
985/*
986 - at - print current situation
987 */
988static void
989at(struct match *m, const char *title, const char *start, const char *stop,
990    sopno startst, sopno stopst)
991{
992	if (!(m->eflags&REG_TRACE))
993		return;
994
995	(void)printf("%s %s-", title, pchar(*start));
996	(void)printf("%s ", pchar(*stop));
997	(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
998}
999
1000#ifndef PCHARDONE
1001#define	PCHARDONE	/* never again */
1002static const char *nonchars[] =
1003    { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" };
1004#define	PNONCHAR(c)						\
1005	((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))	\
1006	    ? nonchars[(c) - OUT] : "invalid")
1007
1008/*
1009 - pchar - make a character printable
1010 *
1011 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1012 * duplicate here avoids having a debugging-capable regexec.o tied to
1013 * a matching debug.o, and this is convenient.  It all disappears in
1014 * the non-debug compilation anyway, so it doesn't matter much.
1015 */
1016static const char *		/* -> representation */
1017pchar(int ch)
1018{
1019	static char pbuf[10];
1020
1021	if (NONCHAR(ch)) {
1022		if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0])))
1023			return nonchars[ch - OUT];
1024		return "invalid";
1025	}
1026	if (isprint((unsigned char)ch) || ch == ' ')
1027		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1028	else
1029		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1030	return(pbuf);
1031}
1032#endif
1033#endif
1034
1035#undef	matcher
1036#undef	fast
1037#undef	slow
1038#undef	dissect
1039#undef	backref
1040#undef	step
1041#undef	print
1042#undef	at
1043#undef	match
1044#undef	nope
1045