1/*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 *	Copyright (c) 1986 by University of Toronto.
5 *	Written by Henry Spencer.  Not derived from licensed software.
6 *
7 *	Permission is granted to anyone to use this software for any
8 *	purpose on any computer system, and to redistribute it freely,
9 *	subject to the following restrictions:
10 *
11 *	1. The author is not responsible for the consequences of use of
12 *		this software, no matter how awful, even if they arise
13 *		from defects in it.
14 *
15 *	2. The origin of this software must not be misrepresented, either
16 *		by explicit claim or by omission.
17 *
18 *	3. Altered versions must be plainly marked as such, and must not
19 *		be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions.  Serious changes in
23 * regular-expression syntax might require a total rethink.
24 *
25 * *** NOTE: this code has been altered slightly for use in Tcl. ***
26 * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28 */
29
30#include "less.h"
31#if HAVE_STDIO_H
32#include <stdio.h>
33#endif
34#if HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
37#if HAVE_STRING_H
38#include <string.h>
39#endif
40#include "regexp.h"
41
42/*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases.  They are:
46 *
47 * regstart	char that must begin a match; '\0' if none obvious
48 * reganch	is the match anchored (at beginning-of-line only)?
49 * regmust	string (pointer into program) that match must include, or NULL
50 * regmlen	length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot.  Regmust permits fast rejection
54 * of lines that cannot possibly match.  The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is
59 * computing it anyway.
60 */
61
62/*
63 * Structure for regexp "program".  This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology).  Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69 * have one of the subtle syntax dependencies:  an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.)  The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM.  In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure:  the tail of the branch connects
75 * to the thing following the set of BRANCHes.)  The opcodes are:
76 */
77
78/* definition	number	opnd?	meaning */
79#undef EOL
80#define	END	0	/* no	End of program. */
81#define	BOL	1	/* no	Match "" at beginning of line. */
82#define	EOL	2	/* no	Match "" at end of line. */
83#define	ANY	3	/* no	Match any one character. */
84#define	ANYOF	4	/* str	Match any character in this string. */
85#define	ANYBUT	5	/* str	Match any character not in this string. */
86#define	BRANCH	6	/* node	Match this alternative, or the next... */
87#define	BACK	7	/* no	Match "", "next" ptr points backward. */
88#define	EXACTLY	8	/* str	Match this string. */
89#define	NOTHING	9	/* no	Match empty string. */
90#define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
91#define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
92#define	OPEN	20	/* no	Mark this point in input as start of #n. */
93			/*	OPEN+1 is number 1, etc. */
94#define	CLOSE	30	/* no	Analogous to OPEN. */
95
96/*
97 * Opcode notes:
98 *
99 * BRANCH	The set of branches constituting a single choice are hooked
100 *		together with their "next" pointers, since precedence prevents
101 *		anything being concatenated to any individual branch.  The
102 *		"next" pointer of the last BRANCH in a choice points to the
103 *		thing following the whole choice.  This is also where the
104 *		final "next" pointer of each individual branch points; each
105 *		branch starts with the operand node of a BRANCH node.
106 *
107 * BACK		Normal "next" pointers all implicitly point forward; BACK
108 *		exists to make loop structures possible.
109 *
110 * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
111 *		BRANCH structures using BACK.  Simple cases (one character
112 *		per match) are implemented with STAR and PLUS for speed
113 *		and to minimize recursive plunges.
114 *
115 * OPEN,CLOSE	...are numbered at compile time.
116 */
117
118/*
119 * A node is one char of opcode followed by two chars of "next" pointer.
120 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
121 * value is a positive offset from the opcode of the node containing it.
122 * An operand, if any, simply follows the node.  (Note that much of the
123 * code generation knows about this implicit relationship.)
124 *
125 * Using two bytes for the "next" pointer is vast overkill for most things,
126 * but allows patterns to get big without disasters.
127 */
128#define	OP(p)	(*(p))
129#define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130#define	OPERAND(p)	((p) + 3)
131
132/*
133 * See regmagic.h for one further detail of program structure.
134 */
135
136
137/*
138 * Utility definitions.
139 */
140#ifndef CHARBITS
141#define	UCHARAT(p)	((int)*(unsigned char *)(p))
142#else
143#define	UCHARAT(p)	((int)*(p)&CHARBITS)
144#endif
145
146#define	FAIL(m)	{ regerror(m); return(NULL); }
147#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
148#define	META	"^$.[()|?+*\\"
149
150/*
151 * Flags to be passed up and down.
152 */
153#define	HASWIDTH	01	/* Known never to match null string. */
154#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
155#define	SPSTART		04	/* Starts with * or +. */
156#define	WORST		0	/* Worst case. */
157
158/*
159 * Global work variables for regcomp().
160 */
161static char *regparse;		/* Input-scan pointer. */
162static int regnpar;		/* () count. */
163static char regdummy;
164static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
165static long regsize;		/* Code size. */
166
167/*
168 * The first byte of the regexp internal "program" is actually this magic
169 * number; the start node begins in the second byte.
170 */
171#define	MAGIC	0234
172
173
174/*
175 * Forward declarations for regcomp()'s friends.
176 */
177#ifndef STATIC
178#define	STATIC	static
179#endif
180STATIC char *reg();
181STATIC char *regbranch();
182STATIC char *regpiece();
183STATIC char *regatom();
184STATIC char *regnode();
185STATIC char *regnext();
186STATIC void regc();
187STATIC void reginsert();
188STATIC void regtail();
189STATIC void regoptail();
190#ifdef STRCSPN
191STATIC int strcspn();
192#endif
193
194/*
195 - regcomp - compile a regular expression into internal code
196 *
197 * We can't allocate space until we know how big the compiled form will be,
198 * but we can't compile it (and thus know how big it is) until we've got a
199 * place to put the code.  So we cheat:  we compile it twice, once with code
200 * generation turned off and size counting turned on, and once "for real".
201 * This also means that we don't allocate space until we are sure that the
202 * thing really will compile successfully, and we never have to move the
203 * code and thus invalidate pointers into it.  (Note that it has to be in
204 * one piece because free() must be able to free it all.)
205 *
206 * Beware that the optimization-preparation code in here knows about some
207 * of the structure of the compiled regexp.
208 */
209regexp *
210regcomp(exp)
211char *exp;
212{
213	register regexp *r;
214	register char *scan;
215	register char *longest;
216	register int len;
217	int flags;
218
219	if (exp == NULL)
220		FAIL("NULL argument");
221
222	/* First pass: determine size, legality. */
223	regparse = exp;
224	regnpar = 1;
225	regsize = 0L;
226	regcode = &regdummy;
227	regc(MAGIC);
228	if (reg(0, &flags) == NULL)
229		return(NULL);
230
231	/* Small enough for pointer-storage convention? */
232	if (regsize >= 32767L)		/* Probably could be 65535L. */
233		FAIL("regexp too big");
234
235	/* Allocate space. */
236	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237	if (r == NULL)
238		FAIL("out of space");
239
240	/* Second pass: emit code. */
241	regparse = exp;
242	regnpar = 1;
243	regcode = r->program;
244	regc(MAGIC);
245	if (reg(0, &flags) == NULL)
246		return(NULL);
247
248	/* Dig out information for optimizations. */
249	r->regstart = '\0';	/* Worst-case defaults. */
250	r->reganch = 0;
251	r->regmust = NULL;
252	r->regmlen = 0;
253	scan = r->program+1;			/* First BRANCH. */
254	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
255		scan = OPERAND(scan);
256
257		/* Starting-point info. */
258		if (OP(scan) == EXACTLY)
259			r->regstart = *OPERAND(scan);
260		else if (OP(scan) == BOL)
261			r->reganch++;
262
263		/*
264		 * If there's something expensive in the r.e., find the
265		 * longest literal string that must appear and make it the
266		 * regmust.  Resolve ties in favor of later strings, since
267		 * the regstart check works with the beginning of the r.e.
268		 * and avoiding duplication strengthens checking.  Not a
269		 * strong reason, but sufficient in the absence of others.
270		 */
271		if (flags&SPSTART) {
272			longest = NULL;
273			len = 0;
274			for (; scan != NULL; scan = regnext(scan))
275				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
276					longest = OPERAND(scan);
277					len = strlen(OPERAND(scan));
278				}
279			r->regmust = longest;
280			r->regmlen = len;
281		}
282	}
283
284	return(r);
285}
286
287/*
288 - reg - regular expression, i.e. main body or parenthesized thing
289 *
290 * Caller must absorb opening parenthesis.
291 *
292 * Combining parenthesis handling with the base level of regular expression
293 * is a trifle forced, but the need to tie the tails of the branches to what
294 * follows makes it hard to avoid.
295 */
296static char *
297reg(paren, flagp)
298int paren;			/* Parenthesized? */
299int *flagp;
300{
301	register char *ret;
302	register char *br;
303	register char *ender;
304	register int parno = 0;
305	int flags;
306
307	*flagp = HASWIDTH;	/* Tentatively. */
308
309	/* Make an OPEN node, if parenthesized. */
310	if (paren) {
311		if (regnpar >= NSUBEXP)
312			FAIL("too many ()");
313		parno = regnpar;
314		regnpar++;
315		ret = regnode(OPEN+parno);
316	} else
317		ret = NULL;
318
319	/* Pick up the branches, linking them together. */
320	br = regbranch(&flags);
321	if (br == NULL)
322		return(NULL);
323	if (ret != NULL)
324		regtail(ret, br);	/* OPEN -> first. */
325	else
326		ret = br;
327	if (!(flags&HASWIDTH))
328		*flagp &= ~HASWIDTH;
329	*flagp |= flags&SPSTART;
330	while (*regparse == '|') {
331		regparse++;
332		br = regbranch(&flags);
333		if (br == NULL)
334			return(NULL);
335		regtail(ret, br);	/* BRANCH -> BRANCH. */
336		if (!(flags&HASWIDTH))
337			*flagp &= ~HASWIDTH;
338		*flagp |= flags&SPSTART;
339	}
340
341	/* Make a closing node, and hook it on the end. */
342	ender = regnode((paren) ? CLOSE+parno : END);
343	regtail(ret, ender);
344
345	/* Hook the tails of the branches to the closing node. */
346	for (br = ret; br != NULL; br = regnext(br))
347		regoptail(br, ender);
348
349	/* Check for proper termination. */
350	if (paren && *regparse++ != ')') {
351		FAIL("unmatched ()");
352	} else if (!paren && *regparse != '\0') {
353		if (*regparse == ')') {
354			FAIL("unmatched ()");
355		} else
356			FAIL("junk on end");	/* "Can't happen". */
357		/* NOTREACHED */
358	}
359
360	return(ret);
361}
362
363/*
364 - regbranch - one alternative of an | operator
365 *
366 * Implements the concatenation operator.
367 */
368static char *
369regbranch(flagp)
370int *flagp;
371{
372	register char *ret;
373	register char *chain;
374	register char *latest;
375	int flags;
376
377	*flagp = WORST;		/* Tentatively. */
378
379	ret = regnode(BRANCH);
380	chain = NULL;
381	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382		latest = regpiece(&flags);
383		if (latest == NULL)
384			return(NULL);
385		*flagp |= flags&HASWIDTH;
386		if (chain == NULL)	/* First piece. */
387			*flagp |= flags&SPSTART;
388		else
389			regtail(chain, latest);
390		chain = latest;
391	}
392	if (chain == NULL)	/* Loop ran zero times. */
393		(void) regnode(NOTHING);
394
395	return(ret);
396}
397
398/*
399 - regpiece - something followed by possible [*+?]
400 *
401 * Note that the branching code sequences used for ? and the general cases
402 * of * and + are somewhat optimized:  they use the same NOTHING node as
403 * both the endmarker for their branch list and the body of the last branch.
404 * It might seem that this node could be dispensed with entirely, but the
405 * endmarker role is not redundant.
406 */
407static char *
408regpiece(flagp)
409int *flagp;
410{
411	register char *ret;
412	register char op;
413	register char *next;
414	int flags;
415
416	ret = regatom(&flags);
417	if (ret == NULL)
418		return(NULL);
419
420	op = *regparse;
421	if (!ISMULT(op)) {
422		*flagp = flags;
423		return(ret);
424	}
425
426	if (!(flags&HASWIDTH) && op != '?')
427		FAIL("*+ operand could be empty");
428	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
429
430	if (op == '*' && (flags&SIMPLE))
431		reginsert(STAR, ret);
432	else if (op == '*') {
433		/* Emit x* as (x&|), where & means "self". */
434		reginsert(BRANCH, ret);			/* Either x */
435		regoptail(ret, regnode(BACK));		/* and loop */
436		regoptail(ret, ret);			/* back */
437		regtail(ret, regnode(BRANCH));		/* or */
438		regtail(ret, regnode(NOTHING));		/* null. */
439	} else if (op == '+' && (flags&SIMPLE))
440		reginsert(PLUS, ret);
441	else if (op == '+') {
442		/* Emit x+ as x(&|), where & means "self". */
443		next = regnode(BRANCH);			/* Either */
444		regtail(ret, next);
445		regtail(regnode(BACK), ret);		/* loop back */
446		regtail(next, regnode(BRANCH));		/* or */
447		regtail(ret, regnode(NOTHING));		/* null. */
448	} else if (op == '?') {
449		/* Emit x? as (x|) */
450		reginsert(BRANCH, ret);			/* Either x */
451		regtail(ret, regnode(BRANCH));		/* or */
452		next = regnode(NOTHING);		/* null. */
453		regtail(ret, next);
454		regoptail(ret, next);
455	}
456	regparse++;
457	if (ISMULT(*regparse))
458		FAIL("nested *?+");
459
460	return(ret);
461}
462
463/*
464 - regatom - the lowest level
465 *
466 * Optimization:  gobbles an entire sequence of ordinary characters so that
467 * it can turn them into a single node, which is smaller to store and
468 * faster to run.  Backslashed characters are exceptions, each becoming a
469 * separate node; the code is simpler that way and it's not worth fixing.
470 */
471static char *
472regatom(flagp)
473int *flagp;
474{
475	register char *ret;
476	int flags;
477
478	*flagp = WORST;		/* Tentatively. */
479
480	switch (*regparse++) {
481	case '^':
482		ret = regnode(BOL);
483		break;
484	case '$':
485		ret = regnode(EOL);
486		break;
487	case '.':
488		ret = regnode(ANY);
489		*flagp |= HASWIDTH|SIMPLE;
490		break;
491	case '[': {
492			register int clss;
493			register int classend;
494
495			if (*regparse == '^') {	/* Complement of range. */
496				ret = regnode(ANYBUT);
497				regparse++;
498			} else
499				ret = regnode(ANYOF);
500			if (*regparse == ']' || *regparse == '-')
501				regc(*regparse++);
502			while (*regparse != '\0' && *regparse != ']') {
503				if (*regparse == '-') {
504					regparse++;
505					if (*regparse == ']' || *regparse == '\0')
506						regc('-');
507					else {
508						clss = UCHARAT(regparse-2)+1;
509						classend = UCHARAT(regparse);
510						if (clss > classend+1)
511							FAIL("invalid [] range");
512						for (; clss <= classend; clss++)
513							regc(clss);
514						regparse++;
515					}
516				} else
517					regc(*regparse++);
518			}
519			regc('\0');
520			if (*regparse != ']')
521				FAIL("unmatched []");
522			regparse++;
523			*flagp |= HASWIDTH|SIMPLE;
524		}
525		break;
526	case '(':
527		ret = reg(1, &flags);
528		if (ret == NULL)
529			return(NULL);
530		*flagp |= flags&(HASWIDTH|SPSTART);
531		break;
532	case '\0':
533	case '|':
534	case ')':
535		FAIL("internal urp");	/* Supposed to be caught earlier. */
536		/* NOTREACHED */
537		break;
538	case '?':
539	case '+':
540	case '*':
541		FAIL("?+* follows nothing");
542		/* NOTREACHED */
543		break;
544	case '\\':
545		if (*regparse == '\0')
546			FAIL("trailing \\");
547		ret = regnode(EXACTLY);
548		regc(*regparse++);
549		regc('\0');
550		*flagp |= HASWIDTH|SIMPLE;
551		break;
552	default: {
553			register int len;
554			register char ender;
555
556			regparse--;
557			len = strcspn(regparse, META);
558			if (len <= 0)
559				FAIL("internal disaster");
560			ender = *(regparse+len);
561			if (len > 1 && ISMULT(ender))
562				len--;		/* Back off clear of ?+* operand. */
563			*flagp |= HASWIDTH;
564			if (len == 1)
565				*flagp |= SIMPLE;
566			ret = regnode(EXACTLY);
567			while (len > 0) {
568				regc(*regparse++);
569				len--;
570			}
571			regc('\0');
572		}
573		break;
574	}
575
576	return(ret);
577}
578
579/*
580 - regnode - emit a node
581 */
582static char *			/* Location. */
583regnode(op)
584char op;
585{
586	register char *ret;
587	register char *ptr;
588
589	ret = regcode;
590	if (ret == &regdummy) {
591		regsize += 3;
592		return(ret);
593	}
594
595	ptr = ret;
596	*ptr++ = op;
597	*ptr++ = '\0';		/* Null "next" pointer. */
598	*ptr++ = '\0';
599	regcode = ptr;
600
601	return(ret);
602}
603
604/*
605 - regc - emit (if appropriate) a byte of code
606 */
607static void
608regc(b)
609char b;
610{
611	if (regcode != &regdummy)
612		*regcode++ = b;
613	else
614		regsize++;
615}
616
617/*
618 - reginsert - insert an operator in front of already-emitted operand
619 *
620 * Means relocating the operand.
621 */
622static void
623reginsert(op, opnd)
624char op;
625char *opnd;
626{
627	register char *src;
628	register char *dst;
629	register char *place;
630
631	if (regcode == &regdummy) {
632		regsize += 3;
633		return;
634	}
635
636	src = regcode;
637	regcode += 3;
638	dst = regcode;
639	while (src > opnd)
640		*--dst = *--src;
641
642	place = opnd;		/* Op node, where operand used to be. */
643	*place++ = op;
644	*place++ = '\0';
645	*place++ = '\0';
646}
647
648/*
649 - regtail - set the next-pointer at the end of a node chain
650 */
651static void
652regtail(p, val)
653char *p;
654char *val;
655{
656	register char *scan;
657	register char *temp;
658	register int offset;
659
660	if (p == &regdummy)
661		return;
662
663	/* Find last node. */
664	scan = p;
665	for (;;) {
666		temp = regnext(scan);
667		if (temp == NULL)
668			break;
669		scan = temp;
670	}
671
672	if (OP(scan) == BACK)
673		offset = scan - val;
674	else
675		offset = val - scan;
676	*(scan+1) = (offset>>8)&0377;
677	*(scan+2) = offset&0377;
678}
679
680/*
681 - regoptail - regtail on operand of first argument; nop if operandless
682 */
683static void
684regoptail(p, val)
685char *p;
686char *val;
687{
688	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
689	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
690		return;
691	regtail(OPERAND(p), val);
692}
693
694/*
695 * regexec and friends
696 */
697
698/*
699 * Global work variables for regexec().
700 */
701static char *reginput;		/* String-input pointer. */
702static char *regbol;		/* Beginning of input, for ^ check. */
703static char **regstartp;	/* Pointer to startp array. */
704static char **regendp;		/* Ditto for endp. */
705
706/*
707 * Forwards.
708 */
709STATIC int regtry();
710STATIC int regmatch();
711STATIC int regrepeat();
712
713#ifdef DEBUG
714int regnarrate = 0;
715void regdump();
716STATIC char *regprop();
717#endif
718
719/*
720 - regexec - match a regexp against a string
721 */
722int
723regexec2(prog, string, notbol)
724register regexp *prog;
725register char *string;
726int notbol;
727{
728	register char *s;
729
730	/* Be paranoid... */
731	if (prog == NULL || string == NULL) {
732		regerror("NULL parameter");
733		return(0);
734	}
735
736	/* Check validity of program. */
737	if (UCHARAT(prog->program) != MAGIC) {
738		regerror("corrupted program");
739		return(0);
740	}
741
742	/* If there is a "must appear" string, look for it. */
743	if (prog->regmust != NULL) {
744		s = string;
745		while ((s = strchr(s, prog->regmust[0])) != NULL) {
746			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747				break;	/* Found it. */
748			s++;
749		}
750		if (s == NULL)	/* Not present. */
751			return(0);
752	}
753
754	/* Mark beginning of line for ^ . */
755	if (notbol)
756		regbol = NULL;
757	else
758		regbol = string;
759
760	/* Simplest case:  anchored match need be tried only once. */
761	if (prog->reganch)
762		return(regtry(prog, string));
763
764	/* Messy cases:  unanchored match. */
765	s = string;
766	if (prog->regstart != '\0')
767		/* We know what char it must start with. */
768		while ((s = strchr(s, prog->regstart)) != NULL) {
769			if (regtry(prog, s))
770				return(1);
771			s++;
772		}
773	else
774		/* We don't -- general case. */
775		do {
776			if (regtry(prog, s))
777				return(1);
778		} while (*s++ != '\0');
779
780	/* Failure. */
781	return(0);
782}
783
784int
785regexec(prog, string)
786register regexp *prog;
787register char *string;
788{
789	return regexec2(prog, string, 0);
790}
791
792/*
793 - regtry - try match at specific point
794 */
795static int			/* 0 failure, 1 success */
796regtry(prog, string)
797regexp *prog;
798char *string;
799{
800	register int i;
801	register char **sp;
802	register char **ep;
803
804	reginput = string;
805	regstartp = prog->startp;
806	regendp = prog->endp;
807
808	sp = prog->startp;
809	ep = prog->endp;
810	for (i = NSUBEXP; i > 0; i--) {
811		*sp++ = NULL;
812		*ep++ = NULL;
813	}
814	if (regmatch(prog->program + 1)) {
815		prog->startp[0] = string;
816		prog->endp[0] = reginput;
817		return(1);
818	} else
819		return(0);
820}
821
822/*
823 - regmatch - main matching routine
824 *
825 * Conceptually the strategy is simple:  check to see whether the current
826 * node matches, call self recursively to see whether the rest matches,
827 * and then act accordingly.  In practice we make some effort to avoid
828 * recursion, in particular by going through "ordinary" nodes (that don't
829 * need to know whether the rest of the match failed) by a loop instead of
830 * by recursion.
831 */
832static int			/* 0 failure, 1 success */
833regmatch(prog)
834char *prog;
835{
836	register char *scan;	/* Current node. */
837	char *next;		/* Next node. */
838
839	scan = prog;
840#ifdef DEBUG
841	if (scan != NULL && regnarrate)
842		fprintf(stderr, "%s(\n", regprop(scan));
843#endif
844	while (scan != NULL) {
845#ifdef DEBUG
846		if (regnarrate)
847			fprintf(stderr, "%s...\n", regprop(scan));
848#endif
849		next = regnext(scan);
850
851		switch (OP(scan)) {
852		case BOL:
853			if (reginput != regbol)
854				return(0);
855			break;
856		case EOL:
857			if (*reginput != '\0')
858				return(0);
859			break;
860		case ANY:
861			if (*reginput == '\0')
862				return(0);
863			reginput++;
864			break;
865		case EXACTLY: {
866				register int len;
867				register char *opnd;
868
869				opnd = OPERAND(scan);
870				/* Inline the first character, for speed. */
871				if (*opnd != *reginput)
872					return(0);
873				len = strlen(opnd);
874				if (len > 1 && strncmp(opnd, reginput, len) != 0)
875					return(0);
876				reginput += len;
877			}
878			break;
879		case ANYOF:
880 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
881				return(0);
882			reginput++;
883			break;
884		case ANYBUT:
885 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
886				return(0);
887			reginput++;
888			break;
889		case NOTHING:
890			break;
891		case BACK:
892			break;
893		case OPEN+1:
894		case OPEN+2:
895		case OPEN+3:
896		case OPEN+4:
897		case OPEN+5:
898		case OPEN+6:
899		case OPEN+7:
900		case OPEN+8:
901		case OPEN+9: {
902				register int no;
903				register char *save;
904
905				no = OP(scan) - OPEN;
906				save = reginput;
907
908				if (regmatch(next)) {
909					/*
910					 * Don't set startp if some later
911					 * invocation of the same parentheses
912					 * already has.
913					 */
914					if (regstartp[no] == NULL)
915						regstartp[no] = save;
916					return(1);
917				} else
918					return(0);
919			}
920			/* NOTREACHED */
921			break;
922		case CLOSE+1:
923		case CLOSE+2:
924		case CLOSE+3:
925		case CLOSE+4:
926		case CLOSE+5:
927		case CLOSE+6:
928		case CLOSE+7:
929		case CLOSE+8:
930		case CLOSE+9: {
931				register int no;
932				register char *save;
933
934				no = OP(scan) - CLOSE;
935				save = reginput;
936
937				if (regmatch(next)) {
938					/*
939					 * Don't set endp if some later
940					 * invocation of the same parentheses
941					 * already has.
942					 */
943					if (regendp[no] == NULL)
944						regendp[no] = save;
945					return(1);
946				} else
947					return(0);
948			}
949			/* NOTREACHED */
950			break;
951		case BRANCH: {
952				register char *save;
953
954				if (OP(next) != BRANCH)		/* No choice. */
955					next = OPERAND(scan);	/* Avoid recursion. */
956				else {
957					do {
958						save = reginput;
959						if (regmatch(OPERAND(scan)))
960							return(1);
961						reginput = save;
962						scan = regnext(scan);
963					} while (scan != NULL && OP(scan) == BRANCH);
964					return(0);
965					/* NOTREACHED */
966				}
967			}
968			/* NOTREACHED */
969			break;
970		case STAR:
971		case PLUS: {
972				register char nextch;
973				register int no;
974				register char *save;
975				register int min;
976
977				/*
978				 * Lookahead to avoid useless match attempts
979				 * when we know what character comes next.
980				 */
981				nextch = '\0';
982				if (OP(next) == EXACTLY)
983					nextch = *OPERAND(next);
984				min = (OP(scan) == STAR) ? 0 : 1;
985				save = reginput;
986				no = regrepeat(OPERAND(scan));
987				while (no >= min) {
988					/* If it could work, try it. */
989					if (nextch == '\0' || *reginput == nextch)
990						if (regmatch(next))
991							return(1);
992					/* Couldn't or didn't -- back up. */
993					no--;
994					reginput = save + no;
995				}
996				return(0);
997			}
998			/* NOTREACHED */
999			break;
1000		case END:
1001			return(1);	/* Success! */
1002			/* NOTREACHED */
1003			break;
1004		default:
1005			regerror("memory corruption");
1006			return(0);
1007			/* NOTREACHED */
1008			break;
1009		}
1010
1011		scan = next;
1012	}
1013
1014	/*
1015	 * We get here only if there's trouble -- normally "case END" is
1016	 * the terminating point.
1017	 */
1018	regerror("corrupted pointers");
1019	return(0);
1020}
1021
1022/*
1023 - regrepeat - repeatedly match something simple, report how many
1024 */
1025static int
1026regrepeat(p)
1027char *p;
1028{
1029	register int count = 0;
1030	register char *scan;
1031	register char *opnd;
1032
1033	scan = reginput;
1034	opnd = OPERAND(p);
1035	switch (OP(p)) {
1036	case ANY:
1037		count = strlen(scan);
1038		scan += count;
1039		break;
1040	case EXACTLY:
1041		while (*opnd == *scan) {
1042			count++;
1043			scan++;
1044		}
1045		break;
1046	case ANYOF:
1047		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1048			count++;
1049			scan++;
1050		}
1051		break;
1052	case ANYBUT:
1053		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1054			count++;
1055			scan++;
1056		}
1057		break;
1058	default:		/* Oh dear.  Called inappropriately. */
1059		regerror("internal foulup");
1060		count = 0;	/* Best compromise. */
1061		break;
1062	}
1063	reginput = scan;
1064
1065	return(count);
1066}
1067
1068/*
1069 - regnext - dig the "next" pointer out of a node
1070 */
1071static char *
1072regnext(p)
1073register char *p;
1074{
1075	register int offset;
1076
1077	if (p == &regdummy)
1078		return(NULL);
1079
1080	offset = NEXT(p);
1081	if (offset == 0)
1082		return(NULL);
1083
1084	if (OP(p) == BACK)
1085		return(p-offset);
1086	else
1087		return(p+offset);
1088}
1089
1090#ifdef DEBUG
1091
1092STATIC char *regprop();
1093
1094/*
1095 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1096 */
1097void
1098regdump(r)
1099regexp *r;
1100{
1101	register char *s;
1102	register char op = EXACTLY;	/* Arbitrary non-END op. */
1103	register char *next;
1104
1105
1106	s = r->program + 1;
1107	while (op != END) {	/* While that wasn't END last time... */
1108		op = OP(s);
1109		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1110		next = regnext(s);
1111		if (next == NULL)		/* Next ptr. */
1112			printf("(0)");
1113		else
1114			printf("(%d)", (s-r->program)+(next-s));
1115		s += 3;
1116		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1117			/* Literal string, where present. */
1118			while (*s != '\0') {
1119				putchar(*s);
1120				s++;
1121			}
1122			s++;
1123		}
1124		putchar('\n');
1125	}
1126
1127	/* Header fields of interest. */
1128	if (r->regstart != '\0')
1129		printf("start `%c' ", r->regstart);
1130	if (r->reganch)
1131		printf("anchored ");
1132	if (r->regmust != NULL)
1133		printf("must have \"%s\"", r->regmust);
1134	printf("\n");
1135}
1136
1137/*
1138 - regprop - printable representation of opcode
1139 */
1140static char *
1141regprop(op)
1142char *op;
1143{
1144	register char *p;
1145	static char buf[50];
1146
1147	(void) strcpy(buf, ":");
1148
1149	switch (OP(op)) {
1150	case BOL:
1151		p = "BOL";
1152		break;
1153	case EOL:
1154		p = "EOL";
1155		break;
1156	case ANY:
1157		p = "ANY";
1158		break;
1159	case ANYOF:
1160		p = "ANYOF";
1161		break;
1162	case ANYBUT:
1163		p = "ANYBUT";
1164		break;
1165	case BRANCH:
1166		p = "BRANCH";
1167		break;
1168	case EXACTLY:
1169		p = "EXACTLY";
1170		break;
1171	case NOTHING:
1172		p = "NOTHING";
1173		break;
1174	case BACK:
1175		p = "BACK";
1176		break;
1177	case END:
1178		p = "END";
1179		break;
1180	case OPEN+1:
1181	case OPEN+2:
1182	case OPEN+3:
1183	case OPEN+4:
1184	case OPEN+5:
1185	case OPEN+6:
1186	case OPEN+7:
1187	case OPEN+8:
1188	case OPEN+9:
1189		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1190		p = NULL;
1191		break;
1192	case CLOSE+1:
1193	case CLOSE+2:
1194	case CLOSE+3:
1195	case CLOSE+4:
1196	case CLOSE+5:
1197	case CLOSE+6:
1198	case CLOSE+7:
1199	case CLOSE+8:
1200	case CLOSE+9:
1201		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1202		p = NULL;
1203		break;
1204	case STAR:
1205		p = "STAR";
1206		break;
1207	case PLUS:
1208		p = "PLUS";
1209		break;
1210	default:
1211		regerror("corrupted opcode");
1212		break;
1213	}
1214	if (p != NULL)
1215		(void) strcat(buf, p);
1216	return(buf);
1217}
1218#endif
1219
1220/*
1221 * The following is provided for those people who do not have strcspn() in
1222 * their C libraries.  They should get off their butts and do something
1223 * about it; at least one public-domain implementation of those (highly
1224 * useful) string routines has been published on Usenet.
1225 */
1226#ifdef STRCSPN
1227/*
1228 * strcspn - find length of initial segment of s1 consisting entirely
1229 * of characters not from s2
1230 */
1231
1232static int
1233strcspn(s1, s2)
1234char *s1;
1235char *s2;
1236{
1237	register char *scan1;
1238	register char *scan2;
1239	register int count;
1240
1241	count = 0;
1242	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1243		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1244			if (*scan1 == *scan2++)
1245				return(count);
1246		count++;
1247	}
1248	return(count);
1249}
1250#endif
1251