engine.c revision 62391
1356290Sjkim/*-
2238405Sjkim * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3238405Sjkim * Copyright (c) 1992, 1993, 1994
4238405Sjkim *	The Regents of the University of California.  All rights reserved.
5238405Sjkim *
6238405Sjkim * This code is derived from software contributed to Berkeley by
7238405Sjkim * Henry Spencer.
8238405Sjkim *
9238405Sjkim * Redistribution and use in source and binary forms, with or without
10238405Sjkim * modification, are permitted provided that the following conditions
11238405Sjkim * are met:
12238405Sjkim * 1. Redistributions of source code must retain the above copyright
13238405Sjkim *    notice, this list of conditions and the following disclaimer.
14238405Sjkim * 2. Redistributions in binary form must reproduce the above copyright
15238405Sjkim *    notice, this list of conditions and the following disclaimer in the
16238405Sjkim *    documentation and/or other materials provided with the distribution.
17238405Sjkim * 3. All advertising materials mentioning features or use of this software
18238405Sjkim *    must display the following acknowledgement:
19238405Sjkim *	This product includes software developed by the University of
20238405Sjkim *	California, Berkeley and its contributors.
21238405Sjkim * 4. Neither the name of the University nor the names of its contributors
22238405Sjkim *    may be used to endorse or promote products derived from this software
23238405Sjkim *    without specific prior written permission.
24238405Sjkim *
25238405Sjkim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26238405Sjkim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27238405Sjkim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28238405Sjkim * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29238405Sjkim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30238405Sjkim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31238405Sjkim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32238405Sjkim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33238405Sjkim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34238405Sjkim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35238405Sjkim * SUCH DAMAGE.
36238405Sjkim *
37238405Sjkim *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38238405Sjkim *
39238405Sjkim * $FreeBSD: head/lib/libc/regex/engine.c 62391 2000-07-02 10:58:07Z dcs $
40238405Sjkim */
41276861Sjkim
42276861Sjkim/*
43238405Sjkim * The matching engine and friends.  This file is #included by regexec.c
44238405Sjkim * after suitable #defines of a variety of macros used herein, so that
45238405Sjkim * different state representations can be used without duplicating masses
46238405Sjkim * of code.
47238405Sjkim */
48238405Sjkim
49312826Sjkim#ifdef SNAMES
50238405Sjkim#define	matcher	smatcher
51238405Sjkim#define	fast	sfast
52238405Sjkim#define	slow	sslow
53276861Sjkim#define	dissect	sdissect
54276861Sjkim#define	backref	sbackref
55276861Sjkim#define	step	sstep
56238405Sjkim#define	print	sprint
57344604Sjkim#define	at	sat
58344604Sjkim#define	match	smat
59344604Sjkim#endif
60344604Sjkim#ifdef LNAMES
61344604Sjkim#define	matcher	lmatcher
62344604Sjkim#define	fast	lfast
63238405Sjkim#define	slow	lslow
64344604Sjkim#define	dissect	ldissect
65344604Sjkim#define	backref	lbackref
66344604Sjkim#define	step	lstep
67344604Sjkim#define	print	lprint
68276861Sjkim#define	at	lat
69238405Sjkim#define	match	lmat
70344604Sjkim#endif
71238405Sjkim
72238405Sjkim/* another structure passed up and down to avoid zillions of parameters */
73238405Sjkimstruct match {
74238405Sjkim	struct re_guts *g;
75238405Sjkim	int eflags;
76238405Sjkim	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
77238405Sjkim	char *offp;		/* offsets work from here */
78238405Sjkim	char *beginp;		/* start of string -- virtual NUL precedes */
79238405Sjkim	char *endp;		/* end of string -- virtual NUL here */
80238405Sjkim	char *coldp;		/* can be no match starting before here */
81238405Sjkim	char **lastpos;		/* [nplus+1] */
82238405Sjkim	STATEVARS;
83238405Sjkim	states st;		/* current states */
84238405Sjkim	states fresh;		/* states for a fresh start */
85238405Sjkim	states tmp;		/* temporary */
86238405Sjkim	states empty;		/* empty set of states */
87238405Sjkim};
88238405Sjkim
89238405Sjkim/* ========= begin header generated by ./mkh ========= */
90238405Sjkim#ifdef __cplusplus
91238405Sjkimextern "C" {
92238405Sjkim#endif
93238405Sjkim
94238405Sjkim/* === engine.c === */
95238405Sjkimstatic int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
96238405Sjkimstatic char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
97238405Sjkimstatic char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
98238405Sjkimstatic char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
99238405Sjkimstatic char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
100238405Sjkimstatic states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
101238405Sjkim#define	BOL	(OUT+1)
102238405Sjkim#define	EOL	(BOL+1)
103238405Sjkim#define	BOLEOL	(BOL+2)
104238405Sjkim#define	NOTHING	(BOL+3)
105238405Sjkim#define	BOW	(BOL+4)
106238405Sjkim#define	EOW	(BOL+5)
107238405Sjkim#define	CODEMAX	(BOL+5)		/* highest code used */
108238405Sjkim#define	NONCHAR(c)	((c) > CHAR_MAX)
109238405Sjkim#define	NNONCHAR	(CODEMAX-CHAR_MAX)
110238405Sjkim#ifdef REDEBUG
111238405Sjkimstatic void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
112238405Sjkim#endif
113238405Sjkim#ifdef REDEBUG
114238405Sjkimstatic void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
115238405Sjkim#endif
116238405Sjkim#ifdef REDEBUG
117238405Sjkimstatic char *pchar __P((int ch));
118238405Sjkim#endif
119238405Sjkim
120238405Sjkim#ifdef __cplusplus
121238405Sjkim}
122238405Sjkim#endif
123238405Sjkim/* ========= end header generated by ./mkh ========= */
124238405Sjkim
125238405Sjkim#ifdef REDEBUG
126238405Sjkim#define	SP(t, s, c)	print(m, t, s, c, stdout)
127238405Sjkim#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
128238405Sjkim#define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
129238405Sjkim#else
130238405Sjkim#define	SP(t, s, c)	/* nothing */
131238405Sjkim#define	AT(t, p1, p2, s1, s2)	/* nothing */
132238405Sjkim#define	NOTE(s)	/* nothing */
133238405Sjkim#endif
134238405Sjkim
135238405Sjkim/*
136356290Sjkim - matcher - the actual matching engine
137238405Sjkim == static int matcher(register struct re_guts *g, char *string, \
138238405Sjkim ==	size_t nmatch, regmatch_t pmatch[], int eflags);
139238405Sjkim */
140238405Sjkimstatic int			/* 0 success, REG_NOMATCH failure */
141238405Sjkimmatcher(g, string, nmatch, pmatch, eflags)
142238405Sjkimregister struct re_guts *g;
143238405Sjkimchar *string;
144238405Sjkimsize_t nmatch;
145238405Sjkimregmatch_t pmatch[];
146238405Sjkimint eflags;
147238405Sjkim{
148238405Sjkim	register char *endp;
149238405Sjkim	register int i;
150238405Sjkim	struct match mv;
151238405Sjkim	register struct match *m = &mv;
152238405Sjkim	register char *dp;
153238405Sjkim	register const sopno gf = g->firststate+1;	/* +1 for OEND */
154238405Sjkim	register const sopno gl = g->laststate;
155344604Sjkim	char *start;
156238405Sjkim	char *stop;
157238405Sjkim	/* Boyer-Moore algorithms variables */
158344604Sjkim	register unsigned char *pp;
159238405Sjkim	int cj, mj;
160238405Sjkim	register unsigned char *mustfirst;
161238405Sjkim	register unsigned char *mustlast;
162238405Sjkim	register int mustlen;
163238405Sjkim	register int *matchjump;
164238405Sjkim	register int *charjump;
165238405Sjkim	register unsigned char *bmp;
166238405Sjkim	register unsigned char *bmps;
167344604Sjkim
168238405Sjkim	/* simplify the situation where possible */
169238405Sjkim	if (g->cflags&REG_NOSUB)
170238405Sjkim		nmatch = 0;
171344604Sjkim	if (eflags&REG_STARTEND) {
172238405Sjkim		start = string + pmatch[0].rm_so;
173238405Sjkim		stop = string + pmatch[0].rm_eo;
174238405Sjkim	} else {
175344604Sjkim		start = string;
176238405Sjkim		stop = start + strlen(start);
177238405Sjkim	}
178238405Sjkim	if (stop < start)
179238405Sjkim		return(REG_INVARG);
180337982Sjkim
181238405Sjkim	/* prescreening; this does wonders for this rather slow code */
182238405Sjkim	if (g->must != NULL) {
183238405Sjkim		if (g->charjump != NULL && g->matchjump != NULL) {
184238405Sjkim			mustlen = -g->mlen;
185238405Sjkim			mustfirst = g->must;
186238405Sjkim			mustlast = g->must + g->mlen - 1;
187238405Sjkim			charjump = g->charjump;
188238405Sjkim			matchjump = g->matchjump;
189238405Sjkim			bmps = stop;
190238405Sjkim			pp = mustlast;
191238405Sjkim			for (bmp = start+g->mlen-1; bmp < bmps;) {
192238405Sjkim				/* Fast skip non-matches */
193238405Sjkim				while (bmp < bmps && charjump[*bmp])
194238405Sjkim					bmp += charjump[*bmp];
195238405Sjkim
196238405Sjkim				if (bmp >= bmps)
197238405Sjkim					break;
198238405Sjkim
199238405Sjkim				/* Greedy matcher */
200238405Sjkim				/* We depend on not being used for
201238405Sjkim				 * for strings of length 1
202238405Sjkim				 */
203238405Sjkim				while (*--bmp == *--pp && pp != mustfirst);
204238405Sjkim
205238405Sjkim				if (*bmp == *pp)
206238405Sjkim					break;
207238405Sjkim
208238405Sjkim				/* Jump to next possible match */
209238405Sjkim				mj = matchjump[pp - mustfirst];
210238405Sjkim				cj = charjump[*bmp];
211238405Sjkim				bmp += (cj < mj ? mj : cj);
212238405Sjkim				pp = mustlast;
213238405Sjkim			}
214238405Sjkim			if (pp != mustfirst)
215238405Sjkim				return(REG_NOMATCH);
216238405Sjkim			dp = bmp;
217344604Sjkim		} else {
218344604Sjkim			for (dp = start; dp < stop; dp++)
219344604Sjkim				if (*dp == g->must[0] &&
220344604Sjkim				    stop - dp >= g->mlen &&
221344604Sjkim				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
222344604Sjkim					break;
223238405Sjkim			if (dp == stop)		/* we didn't find g->must */
224238405Sjkim				return(REG_NOMATCH);
225238405Sjkim		}
226	}
227
228	/* match struct setup */
229	m->g = g;
230	m->eflags = eflags;
231	m->pmatch = NULL;
232	m->lastpos = NULL;
233	m->offp = string;
234	m->beginp = start;
235	m->endp = stop;
236	STATESETUP(m, 4);
237	SETUP(m->st);
238	SETUP(m->fresh);
239	SETUP(m->tmp);
240	SETUP(m->empty);
241	CLEAR(m->empty);
242
243	/* Adjust start according to moffset, to speed things up */
244	if (g->moffset > -1)
245		start = dp - g->moffset;
246
247	/* this loop does only one repetition except for backrefs */
248	for (;;) {
249		endp = fast(m, start, stop, gf, gl);
250		if (endp == NULL) {		/* a miss */
251			STATETEARDOWN(m);
252			return(REG_NOMATCH);
253		}
254		if (nmatch == 0 && !g->backrefs)
255			break;		/* no further info needed */
256
257		/* where? */
258		assert(m->coldp != NULL);
259		for (;;) {
260			NOTE("finding start");
261			endp = slow(m, m->coldp, stop, gf, gl);
262			if (endp != NULL)
263				break;
264			assert(m->coldp < m->endp);
265			m->coldp++;
266		}
267		if (nmatch == 1 && !g->backrefs)
268			break;		/* no further info needed */
269
270		/* oh my, he wants the subexpressions... */
271		if (m->pmatch == NULL)
272			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
273							sizeof(regmatch_t));
274		if (m->pmatch == NULL) {
275			STATETEARDOWN(m);
276			return(REG_ESPACE);
277		}
278		for (i = 1; i <= m->g->nsub; i++)
279			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
280		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
281			NOTE("dissecting");
282			dp = dissect(m, m->coldp, endp, gf, gl);
283		} else {
284			if (g->nplus > 0 && m->lastpos == NULL)
285				m->lastpos = (char **)malloc((g->nplus+1) *
286							sizeof(char *));
287			if (g->nplus > 0 && m->lastpos == NULL) {
288				free(m->pmatch);
289				STATETEARDOWN(m);
290				return(REG_ESPACE);
291			}
292			NOTE("backref dissect");
293			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
294		}
295		if (dp != NULL)
296			break;
297
298		/* uh-oh... we couldn't find a subexpression-level match */
299		assert(g->backrefs);	/* must be back references doing it */
300		assert(g->nplus == 0 || m->lastpos != NULL);
301		for (;;) {
302			if (dp != NULL || endp <= m->coldp)
303				break;		/* defeat */
304			NOTE("backoff");
305			endp = slow(m, m->coldp, endp-1, gf, gl);
306			if (endp == NULL)
307				break;		/* defeat */
308			/* try it on a shorter possibility */
309#ifndef NDEBUG
310			for (i = 1; i <= m->g->nsub; i++) {
311				assert(m->pmatch[i].rm_so == -1);
312				assert(m->pmatch[i].rm_eo == -1);
313			}
314#endif
315			NOTE("backoff dissect");
316			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
317		}
318		assert(dp == NULL || dp == endp);
319		if (dp != NULL)		/* found a shorter one */
320			break;
321
322		/* despite initial appearances, there is no match here */
323		NOTE("false alarm");
324		start = m->coldp + 1;	/* recycle starting later */
325		assert(start <= stop);
326	}
327
328	/* fill in the details if requested */
329	if (nmatch > 0) {
330		pmatch[0].rm_so = m->coldp - m->offp;
331		pmatch[0].rm_eo = endp - m->offp;
332	}
333	if (nmatch > 1) {
334		assert(m->pmatch != NULL);
335		for (i = 1; i < nmatch; i++)
336			if (i <= m->g->nsub)
337				pmatch[i] = m->pmatch[i];
338			else {
339				pmatch[i].rm_so = -1;
340				pmatch[i].rm_eo = -1;
341			}
342	}
343
344	if (m->pmatch != NULL)
345		free((char *)m->pmatch);
346	if (m->lastpos != NULL)
347		free((char *)m->lastpos);
348	STATETEARDOWN(m);
349	return(0);
350}
351
352/*
353 - dissect - figure out what matched what, no back references
354 == static char *dissect(register struct match *m, char *start, \
355 ==	char *stop, sopno startst, sopno stopst);
356 */
357static char *			/* == stop (success) always */
358dissect(m, start, stop, startst, stopst)
359register struct match *m;
360char *start;
361char *stop;
362sopno startst;
363sopno stopst;
364{
365	register int i;
366	register sopno ss;	/* start sop of current subRE */
367	register sopno es;	/* end sop of current subRE */
368	register char *sp;	/* start of string matched by it */
369	register char *stp;	/* string matched by it cannot pass here */
370	register char *rest;	/* start of rest of string */
371	register char *tail;	/* string unmatched by rest of RE */
372	register sopno ssub;	/* start sop of subsubRE */
373	register sopno esub;	/* end sop of subsubRE */
374	register char *ssp;	/* start of string matched by subsubRE */
375	register char *sep;	/* end of string matched by subsubRE */
376	register char *oldssp;	/* previous ssp */
377	register char *dp;
378
379	AT("diss", start, stop, startst, stopst);
380	sp = start;
381	for (ss = startst; ss < stopst; ss = es) {
382		/* identify end of subRE */
383		es = ss;
384		switch (OP(m->g->strip[es])) {
385		case OPLUS_:
386		case OQUEST_:
387			es += OPND(m->g->strip[es]);
388			break;
389		case OCH_:
390			while (OP(m->g->strip[es]) != O_CH)
391				es += OPND(m->g->strip[es]);
392			break;
393		}
394		es++;
395
396		/* figure out what it matched */
397		switch (OP(m->g->strip[ss])) {
398		case OEND:
399			assert(nope);
400			break;
401		case OCHAR:
402			sp++;
403			break;
404		case OBOL:
405		case OEOL:
406		case OBOW:
407		case OEOW:
408			break;
409		case OANY:
410		case OANYOF:
411			sp++;
412			break;
413		case OBACK_:
414		case O_BACK:
415			assert(nope);
416			break;
417		/* cases where length of match is hard to find */
418		case OQUEST_:
419			stp = stop;
420			for (;;) {
421				/* how long could this one be? */
422				rest = slow(m, sp, stp, ss, es);
423				assert(rest != NULL);	/* it did match */
424				/* could the rest match the rest? */
425				tail = slow(m, rest, stop, es, stopst);
426				if (tail == stop)
427					break;		/* yes! */
428				/* no -- try a shorter match for this one */
429				stp = rest - 1;
430				assert(stp >= sp);	/* it did work */
431			}
432			ssub = ss + 1;
433			esub = es - 1;
434			/* did innards match? */
435			if (slow(m, sp, rest, ssub, esub) != NULL) {
436				dp = dissect(m, sp, rest, ssub, esub);
437				assert(dp == rest);
438			} else		/* no */
439				assert(sp == rest);
440			sp = rest;
441			break;
442		case OPLUS_:
443			stp = stop;
444			for (;;) {
445				/* how long could this one be? */
446				rest = slow(m, sp, stp, ss, es);
447				assert(rest != NULL);	/* it did match */
448				/* could the rest match the rest? */
449				tail = slow(m, rest, stop, es, stopst);
450				if (tail == stop)
451					break;		/* yes! */
452				/* no -- try a shorter match for this one */
453				stp = rest - 1;
454				assert(stp >= sp);	/* it did work */
455			}
456			ssub = ss + 1;
457			esub = es - 1;
458			ssp = sp;
459			oldssp = ssp;
460			for (;;) {	/* find last match of innards */
461				sep = slow(m, ssp, rest, ssub, esub);
462				if (sep == NULL || sep == ssp)
463					break;	/* failed or matched null */
464				oldssp = ssp;	/* on to next try */
465				ssp = sep;
466			}
467			if (sep == NULL) {
468				/* last successful match */
469				sep = ssp;
470				ssp = oldssp;
471			}
472			assert(sep == rest);	/* must exhaust substring */
473			assert(slow(m, ssp, sep, ssub, esub) == rest);
474			dp = dissect(m, ssp, sep, ssub, esub);
475			assert(dp == sep);
476			sp = rest;
477			break;
478		case OCH_:
479			stp = stop;
480			for (;;) {
481				/* how long could this one be? */
482				rest = slow(m, sp, stp, ss, es);
483				assert(rest != NULL);	/* it did match */
484				/* could the rest match the rest? */
485				tail = slow(m, rest, stop, es, stopst);
486				if (tail == stop)
487					break;		/* yes! */
488				/* no -- try a shorter match for this one */
489				stp = rest - 1;
490				assert(stp >= sp);	/* it did work */
491			}
492			ssub = ss + 1;
493			esub = ss + OPND(m->g->strip[ss]) - 1;
494			assert(OP(m->g->strip[esub]) == OOR1);
495			for (;;) {	/* find first matching branch */
496				if (slow(m, sp, rest, ssub, esub) == rest)
497					break;	/* it matched all of it */
498				/* that one missed, try next one */
499				assert(OP(m->g->strip[esub]) == OOR1);
500				esub++;
501				assert(OP(m->g->strip[esub]) == OOR2);
502				ssub = esub + 1;
503				esub += OPND(m->g->strip[esub]);
504				if (OP(m->g->strip[esub]) == OOR2)
505					esub--;
506				else
507					assert(OP(m->g->strip[esub]) == O_CH);
508			}
509			dp = dissect(m, sp, rest, ssub, esub);
510			assert(dp == rest);
511			sp = rest;
512			break;
513		case O_PLUS:
514		case O_QUEST:
515		case OOR1:
516		case OOR2:
517		case O_CH:
518			assert(nope);
519			break;
520		case OLPAREN:
521			i = OPND(m->g->strip[ss]);
522			assert(0 < i && i <= m->g->nsub);
523			m->pmatch[i].rm_so = sp - m->offp;
524			break;
525		case ORPAREN:
526			i = OPND(m->g->strip[ss]);
527			assert(0 < i && i <= m->g->nsub);
528			m->pmatch[i].rm_eo = sp - m->offp;
529			break;
530		default:		/* uh oh */
531			assert(nope);
532			break;
533		}
534	}
535
536	assert(sp == stop);
537	return(sp);
538}
539
540/*
541 - backref - figure out what matched what, figuring in back references
542 == static char *backref(register struct match *m, char *start, \
543 ==	char *stop, sopno startst, sopno stopst, sopno lev);
544 */
545static char *			/* == stop (success) or NULL (failure) */
546backref(m, start, stop, startst, stopst, lev)
547register struct match *m;
548char *start;
549char *stop;
550sopno startst;
551sopno stopst;
552sopno lev;			/* PLUS nesting level */
553{
554	register int i;
555	register sopno ss;	/* start sop of current subRE */
556	register char *sp;	/* start of string matched by it */
557	register sopno ssub;	/* start sop of subsubRE */
558	register sopno esub;	/* end sop of subsubRE */
559	register char *ssp;	/* start of string matched by subsubRE */
560	register char *dp;
561	register size_t len;
562	register int hard;
563	register sop s;
564	register regoff_t offsave;
565	register cset *cs;
566
567	AT("back", start, stop, startst, stopst);
568	sp = start;
569
570	/* get as far as we can with easy stuff */
571	hard = 0;
572	for (ss = startst; !hard && ss < stopst; ss++)
573		switch (OP(s = m->g->strip[ss])) {
574		case OCHAR:
575			if (sp == stop || *sp++ != (char)OPND(s))
576				return(NULL);
577			break;
578		case OANY:
579			if (sp == stop)
580				return(NULL);
581			sp++;
582			break;
583		case OANYOF:
584			cs = &m->g->sets[OPND(s)];
585			if (sp == stop || !CHIN(cs, *sp++))
586				return(NULL);
587			break;
588		case OBOL:
589			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
590					(sp < m->endp && *(sp-1) == '\n' &&
591						(m->g->cflags&REG_NEWLINE)) )
592				{ /* yes */ }
593			else
594				return(NULL);
595			break;
596		case OEOL:
597			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
598					(sp < m->endp && *sp == '\n' &&
599						(m->g->cflags&REG_NEWLINE)) )
600				{ /* yes */ }
601			else
602				return(NULL);
603			break;
604		case OBOW:
605			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
606					(sp < m->endp && *(sp-1) == '\n' &&
607						(m->g->cflags&REG_NEWLINE)) ||
608					(sp > m->beginp &&
609							!ISWORD(*(sp-1))) ) &&
610					(sp < m->endp && ISWORD(*sp)) )
611				{ /* yes */ }
612			else
613				return(NULL);
614			break;
615		case OEOW:
616			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
617					(sp < m->endp && *sp == '\n' &&
618						(m->g->cflags&REG_NEWLINE)) ||
619					(sp < m->endp && !ISWORD(*sp)) ) &&
620					(sp > m->beginp && ISWORD(*(sp-1))) )
621				{ /* yes */ }
622			else
623				return(NULL);
624			break;
625		case O_QUEST:
626			break;
627		case OOR1:	/* matches null but needs to skip */
628			ss++;
629			s = m->g->strip[ss];
630			do {
631				assert(OP(s) == OOR2);
632				ss += OPND(s);
633			} while (OP(s = m->g->strip[ss]) != O_CH);
634			/* note that the ss++ gets us past the O_CH */
635			break;
636		default:	/* have to make a choice */
637			hard = 1;
638			break;
639		}
640	if (!hard) {		/* that was it! */
641		if (sp != stop)
642			return(NULL);
643		return(sp);
644	}
645	ss--;			/* adjust for the for's final increment */
646
647	/* the hard stuff */
648	AT("hard", sp, stop, ss, stopst);
649	s = m->g->strip[ss];
650	switch (OP(s)) {
651	case OBACK_:		/* the vilest depths */
652		i = OPND(s);
653		assert(0 < i && i <= m->g->nsub);
654		if (m->pmatch[i].rm_eo == -1)
655			return(NULL);
656		assert(m->pmatch[i].rm_so != -1);
657		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
658		assert(stop - m->beginp >= len);
659		if (sp > stop - len)
660			return(NULL);	/* not enough left to match */
661		ssp = m->offp + m->pmatch[i].rm_so;
662		if (memcmp(sp, ssp, len) != 0)
663			return(NULL);
664		while (m->g->strip[ss] != SOP(O_BACK, i))
665			ss++;
666		return(backref(m, sp+len, stop, ss+1, stopst, lev));
667		break;
668	case OQUEST_:		/* to null or not */
669		dp = backref(m, sp, stop, ss+1, stopst, lev);
670		if (dp != NULL)
671			return(dp);	/* not */
672		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
673		break;
674	case OPLUS_:
675		assert(m->lastpos != NULL);
676		assert(lev+1 <= m->g->nplus);
677		m->lastpos[lev+1] = sp;
678		return(backref(m, sp, stop, ss+1, stopst, lev+1));
679		break;
680	case O_PLUS:
681		if (sp == m->lastpos[lev])	/* last pass matched null */
682			return(backref(m, sp, stop, ss+1, stopst, lev-1));
683		/* try another pass */
684		m->lastpos[lev] = sp;
685		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
686		if (dp == NULL)
687			return(backref(m, sp, stop, ss+1, stopst, lev-1));
688		else
689			return(dp);
690		break;
691	case OCH_:		/* find the right one, if any */
692		ssub = ss + 1;
693		esub = ss + OPND(s) - 1;
694		assert(OP(m->g->strip[esub]) == OOR1);
695		for (;;) {	/* find first matching branch */
696			dp = backref(m, sp, stop, ssub, esub, lev);
697			if (dp != NULL)
698				return(dp);
699			/* that one missed, try next one */
700			if (OP(m->g->strip[esub]) == O_CH)
701				return(NULL);	/* there is none */
702			esub++;
703			assert(OP(m->g->strip[esub]) == OOR2);
704			ssub = esub + 1;
705			esub += OPND(m->g->strip[esub]);
706			if (OP(m->g->strip[esub]) == OOR2)
707				esub--;
708			else
709				assert(OP(m->g->strip[esub]) == O_CH);
710		}
711		break;
712	case OLPAREN:		/* must undo assignment if rest fails */
713		i = OPND(s);
714		assert(0 < i && i <= m->g->nsub);
715		offsave = m->pmatch[i].rm_so;
716		m->pmatch[i].rm_so = sp - m->offp;
717		dp = backref(m, sp, stop, ss+1, stopst, lev);
718		if (dp != NULL)
719			return(dp);
720		m->pmatch[i].rm_so = offsave;
721		return(NULL);
722		break;
723	case ORPAREN:		/* must undo assignment if rest fails */
724		i = OPND(s);
725		assert(0 < i && i <= m->g->nsub);
726		offsave = m->pmatch[i].rm_eo;
727		m->pmatch[i].rm_eo = sp - m->offp;
728		dp = backref(m, sp, stop, ss+1, stopst, lev);
729		if (dp != NULL)
730			return(dp);
731		m->pmatch[i].rm_eo = offsave;
732		return(NULL);
733		break;
734	default:		/* uh oh */
735		assert(nope);
736		break;
737	}
738
739	/* "can't happen" */
740	assert(nope);
741	/* NOTREACHED */
742	return "shut up gcc";
743}
744
745/*
746 - fast - step through the string at top speed
747 == static char *fast(register struct match *m, char *start, \
748 ==	char *stop, sopno startst, sopno stopst);
749 */
750static char *			/* where tentative match ended, or NULL */
751fast(m, start, stop, startst, stopst)
752register struct match *m;
753char *start;
754char *stop;
755sopno startst;
756sopno stopst;
757{
758	register states st = m->st;
759	register states fresh = m->fresh;
760	register states tmp = m->tmp;
761	register char *p = start;
762	register int c = (start == m->beginp) ? OUT : *(start-1);
763	register int lastc;	/* previous c */
764	register int flagch;
765	register int i;
766	register char *coldp;	/* last p after which no match was underway */
767
768	CLEAR(st);
769	SET1(st, startst);
770	st = step(m->g, startst, stopst, st, NOTHING, st);
771	ASSIGN(fresh, st);
772	SP("start", st, *p);
773	coldp = NULL;
774	for (;;) {
775		/* next character */
776		lastc = c;
777		c = (p == m->endp) ? OUT : *p;
778		if (EQ(st, fresh))
779			coldp = p;
780
781		/* is there an EOL and/or BOL between lastc and c? */
782		flagch = '\0';
783		i = 0;
784		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
785				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
786			flagch = BOL;
787			i = m->g->nbol;
788		}
789		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
790				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
791			flagch = (flagch == BOL) ? BOLEOL : EOL;
792			i += m->g->neol;
793		}
794		if (i != 0) {
795			for (; i > 0; i--)
796				st = step(m->g, startst, stopst, st, flagch, st);
797			SP("boleol", st, c);
798		}
799
800		/* how about a word boundary? */
801		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
802					(c != OUT && ISWORD(c)) ) {
803			flagch = BOW;
804		}
805		if ( (lastc != OUT && ISWORD(lastc)) &&
806				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
807			flagch = EOW;
808		}
809		if (flagch == BOW || flagch == EOW) {
810			st = step(m->g, startst, stopst, st, flagch, st);
811			SP("boweow", st, c);
812		}
813
814		/* are we done? */
815		if (ISSET(st, stopst) || p == stop)
816			break;		/* NOTE BREAK OUT */
817
818		/* no, we must deal with this character */
819		ASSIGN(tmp, st);
820		ASSIGN(st, fresh);
821		assert(c != OUT);
822		st = step(m->g, startst, stopst, tmp, c, st);
823		SP("aft", st, c);
824		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
825		p++;
826	}
827
828	assert(coldp != NULL);
829	m->coldp = coldp;
830	if (ISSET(st, stopst))
831		return(p+1);
832	else
833		return(NULL);
834}
835
836/*
837 - slow - step through the string more deliberately
838 == static char *slow(register struct match *m, char *start, \
839 ==	char *stop, sopno startst, sopno stopst);
840 */
841static char *			/* where it ended */
842slow(m, start, stop, startst, stopst)
843register struct match *m;
844char *start;
845char *stop;
846sopno startst;
847sopno stopst;
848{
849	register states st = m->st;
850	register states empty = m->empty;
851	register states tmp = m->tmp;
852	register char *p = start;
853	register int c = (start == m->beginp) ? OUT : *(start-1);
854	register int lastc;	/* previous c */
855	register int flagch;
856	register int i;
857	register char *matchp;	/* last p at which a match ended */
858
859	AT("slow", start, stop, startst, stopst);
860	CLEAR(st);
861	SET1(st, startst);
862	SP("sstart", st, *p);
863	st = step(m->g, startst, stopst, st, NOTHING, st);
864	matchp = NULL;
865	for (;;) {
866		/* next character */
867		lastc = c;
868		c = (p == m->endp) ? OUT : *p;
869
870		/* is there an EOL and/or BOL between lastc and c? */
871		flagch = '\0';
872		i = 0;
873		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
874				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
875			flagch = BOL;
876			i = m->g->nbol;
877		}
878		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
879				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
880			flagch = (flagch == BOL) ? BOLEOL : EOL;
881			i += m->g->neol;
882		}
883		if (i != 0) {
884			for (; i > 0; i--)
885				st = step(m->g, startst, stopst, st, flagch, st);
886			SP("sboleol", st, c);
887		}
888
889		/* how about a word boundary? */
890		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
891					(c != OUT && ISWORD(c)) ) {
892			flagch = BOW;
893		}
894		if ( (lastc != OUT && ISWORD(lastc)) &&
895				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
896			flagch = EOW;
897		}
898		if (flagch == BOW || flagch == EOW) {
899			st = step(m->g, startst, stopst, st, flagch, st);
900			SP("sboweow", st, c);
901		}
902
903		/* are we done? */
904		if (ISSET(st, stopst))
905			matchp = p;
906		if (EQ(st, empty) || p == stop)
907			break;		/* NOTE BREAK OUT */
908
909		/* no, we must deal with this character */
910		ASSIGN(tmp, st);
911		ASSIGN(st, empty);
912		assert(c != OUT);
913		st = step(m->g, startst, stopst, tmp, c, st);
914		SP("saft", st, c);
915		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
916		p++;
917	}
918
919	return(matchp);
920}
921
922
923/*
924 - step - map set of states reachable before char to set reachable after
925 == static states step(register struct re_guts *g, sopno start, sopno stop, \
926 ==	register states bef, int ch, register states aft);
927 == #define	BOL	(OUT+1)
928 == #define	EOL	(BOL+1)
929 == #define	BOLEOL	(BOL+2)
930 == #define	NOTHING	(BOL+3)
931 == #define	BOW	(BOL+4)
932 == #define	EOW	(BOL+5)
933 == #define	CODEMAX	(BOL+5)		// highest code used
934 == #define	NONCHAR(c)	((c) > CHAR_MAX)
935 == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
936 */
937static states
938step(g, start, stop, bef, ch, aft)
939register struct re_guts *g;
940sopno start;			/* start state within strip */
941sopno stop;			/* state after stop state within strip */
942register states bef;		/* states reachable before */
943int ch;				/* character or NONCHAR code */
944register states aft;		/* states already known reachable after */
945{
946	register cset *cs;
947	register sop s;
948	register sopno pc;
949	register onestate here;		/* note, macros know this name */
950	register sopno look;
951	register int i;
952
953	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
954		s = g->strip[pc];
955		switch (OP(s)) {
956		case OEND:
957			assert(pc == stop-1);
958			break;
959		case OCHAR:
960			/* only characters can match */
961			assert(!NONCHAR(ch) || ch != (char)OPND(s));
962			if (ch == (char)OPND(s))
963				FWD(aft, bef, 1);
964			break;
965		case OBOL:
966			if (ch == BOL || ch == BOLEOL)
967				FWD(aft, bef, 1);
968			break;
969		case OEOL:
970			if (ch == EOL || ch == BOLEOL)
971				FWD(aft, bef, 1);
972			break;
973		case OBOW:
974			if (ch == BOW)
975				FWD(aft, bef, 1);
976			break;
977		case OEOW:
978			if (ch == EOW)
979				FWD(aft, bef, 1);
980			break;
981		case OANY:
982			if (!NONCHAR(ch))
983				FWD(aft, bef, 1);
984			break;
985		case OANYOF:
986			cs = &g->sets[OPND(s)];
987			if (!NONCHAR(ch) && CHIN(cs, ch))
988				FWD(aft, bef, 1);
989			break;
990		case OBACK_:		/* ignored here */
991		case O_BACK:
992			FWD(aft, aft, 1);
993			break;
994		case OPLUS_:		/* forward, this is just an empty */
995			FWD(aft, aft, 1);
996			break;
997		case O_PLUS:		/* both forward and back */
998			FWD(aft, aft, 1);
999			i = ISSETBACK(aft, OPND(s));
1000			BACK(aft, aft, OPND(s));
1001			if (!i && ISSETBACK(aft, OPND(s))) {
1002				/* oho, must reconsider loop body */
1003				pc -= OPND(s) + 1;
1004				INIT(here, pc);
1005			}
1006			break;
1007		case OQUEST_:		/* two branches, both forward */
1008			FWD(aft, aft, 1);
1009			FWD(aft, aft, OPND(s));
1010			break;
1011		case O_QUEST:		/* just an empty */
1012			FWD(aft, aft, 1);
1013			break;
1014		case OLPAREN:		/* not significant here */
1015		case ORPAREN:
1016			FWD(aft, aft, 1);
1017			break;
1018		case OCH_:		/* mark the first two branches */
1019			FWD(aft, aft, 1);
1020			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1021			FWD(aft, aft, OPND(s));
1022			break;
1023		case OOR1:		/* done a branch, find the O_CH */
1024			if (ISSTATEIN(aft, here)) {
1025				for (look = 1;
1026						OP(s = g->strip[pc+look]) != O_CH;
1027						look += OPND(s))
1028					assert(OP(s) == OOR2);
1029				FWD(aft, aft, look);
1030			}
1031			break;
1032		case OOR2:		/* propagate OCH_'s marking */
1033			FWD(aft, aft, 1);
1034			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1035				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1036				FWD(aft, aft, OPND(s));
1037			}
1038			break;
1039		case O_CH:		/* just empty */
1040			FWD(aft, aft, 1);
1041			break;
1042		default:		/* ooooops... */
1043			assert(nope);
1044			break;
1045		}
1046	}
1047
1048	return(aft);
1049}
1050
1051#ifdef REDEBUG
1052/*
1053 - print - print a set of states
1054 == #ifdef REDEBUG
1055 == static void print(struct match *m, char *caption, states st, \
1056 ==	int ch, FILE *d);
1057 == #endif
1058 */
1059static void
1060print(m, caption, st, ch, d)
1061struct match *m;
1062char *caption;
1063states st;
1064int ch;
1065FILE *d;
1066{
1067	register struct re_guts *g = m->g;
1068	register int i;
1069	register int first = 1;
1070
1071	if (!(m->eflags&REG_TRACE))
1072		return;
1073
1074	fprintf(d, "%s", caption);
1075	if (ch != '\0')
1076		fprintf(d, " %s", pchar(ch));
1077	for (i = 0; i < g->nstates; i++)
1078		if (ISSET(st, i)) {
1079			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1080			first = 0;
1081		}
1082	fprintf(d, "\n");
1083}
1084
1085/*
1086 - at - print current situation
1087 == #ifdef REDEBUG
1088 == static void at(struct match *m, char *title, char *start, char *stop, \
1089 ==						sopno startst, sopno stopst);
1090 == #endif
1091 */
1092static void
1093at(m, title, start, stop, startst, stopst)
1094struct match *m;
1095char *title;
1096char *start;
1097char *stop;
1098sopno startst;
1099sopno stopst;
1100{
1101	if (!(m->eflags&REG_TRACE))
1102		return;
1103
1104	printf("%s %s-", title, pchar(*start));
1105	printf("%s ", pchar(*stop));
1106	printf("%ld-%ld\n", (long)startst, (long)stopst);
1107}
1108
1109#ifndef PCHARDONE
1110#define	PCHARDONE	/* never again */
1111/*
1112 - pchar - make a character printable
1113 == #ifdef REDEBUG
1114 == static char *pchar(int ch);
1115 == #endif
1116 *
1117 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1118 * duplicate here avoids having a debugging-capable regexec.o tied to
1119 * a matching debug.o, and this is convenient.  It all disappears in
1120 * the non-debug compilation anyway, so it doesn't matter much.
1121 */
1122static char *			/* -> representation */
1123pchar(ch)
1124int ch;
1125{
1126	static char pbuf[10];
1127
1128	if (isprint((uch)ch) || ch == ' ')
1129		sprintf(pbuf, "%c", ch);
1130	else
1131		sprintf(pbuf, "\\%o", ch);
1132	return(pbuf);
1133}
1134#endif
1135#endif
1136
1137#undef	matcher
1138#undef	fast
1139#undef	slow
1140#undef	dissect
1141#undef	backref
1142#undef	step
1143#undef	print
1144#undef	at
1145#undef	match
1146