1/*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 *	Copyright (c) 1986 by University of Toronto.
5 *	Written by Henry Spencer.  Not derived from licensed software.
6 *
7 *	Permission is granted to anyone to use this software for any
8 *	purpose on any computer system, and to redistribute it freely,
9 *	subject to the following restrictions:
10 *
11 *	1. The author is not responsible for the consequences of use of
12 *		this software, no matter how awful, even if they arise
13 *		from defects in it.
14 *
15 *	2. The origin of this software must not be misrepresented, either
16 *		by explicit claim or by omission.
17 *
18 *	3. Altered versions must be plainly marked as such, and must not
19 *		be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions.  Serious changes in
23 * regular-expression syntax might require a total rethink.
24 *
25 * *** NOTE: this code has been altered slightly for use in Tcl. ***
26 * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28 */
29
30#include "less.h"
31#if HAVE_STDIO_H
32#include <stdio.h>
33#endif
34#if HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
37#if HAVE_STRING_H
38#include <string.h>
39#endif
40#include "regexp.h"
41
42/*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases.  They are:
46 *
47 * regstart	char that must begin a match; '\0' if none obvious
48 * reganch	is the match anchored (at beginning-of-line only)?
49 * regmust	string (pointer into program) that match must include, or NULL
50 * regmlen	length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot.  Regmust permits fast rejection
54 * of lines that cannot possibly match.  The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is
59 * computing it anyway.
60 */
61
62/*
63 * Structure for regexp "program".  This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology).  Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69 * have one of the subtle syntax dependencies:  an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.)  The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM.  In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure:  the tail of the branch connects
75 * to the thing following the set of BRANCHes.)  The opcodes are:
76 */
77
78/* definition	number	opnd?	meaning */
79#undef EOL
80#define	END	0	/* no	End of program. */
81#define	BOL	1	/* no	Match "" at beginning of line. */
82#define	EOL	2	/* no	Match "" at end of line. */
83#define	ANY	3	/* no	Match any one character. */
84#define	ANYOF	4	/* str	Match any character in this string. */
85#define	ANYBUT	5	/* str	Match any character not in this string. */
86#define	BRANCH	6	/* node	Match this alternative, or the next... */
87#define	BACK	7	/* no	Match "", "next" ptr points backward. */
88#define	EXACTLY	8	/* str	Match this string. */
89#define	NOTHING	9	/* no	Match empty string. */
90#define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
91#define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
92#define	OPEN	20	/* no	Mark this point in input as start of #n. */
93			/*	OPEN+1 is number 1, etc. */
94#define	CLOSE	30	/* no	Analogous to OPEN. */
95
96/*
97 * Opcode notes:
98 *
99 * BRANCH	The set of branches constituting a single choice are hooked
100 *		together with their "next" pointers, since precedence prevents
101 *		anything being concatenated to any individual branch.  The
102 *		"next" pointer of the last BRANCH in a choice points to the
103 *		thing following the whole choice.  This is also where the
104 *		final "next" pointer of each individual branch points; each
105 *		branch starts with the operand node of a BRANCH node.
106 *
107 * BACK		Normal "next" pointers all implicitly point forward; BACK
108 *		exists to make loop structures possible.
109 *
110 * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
111 *		BRANCH structures using BACK.  Simple cases (one character
112 *		per match) are implemented with STAR and PLUS for speed
113 *		and to minimize recursive plunges.
114 *
115 * OPEN,CLOSE	...are numbered at compile time.
116 */
117
118/*
119 * A node is one char of opcode followed by two chars of "next" pointer.
120 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
121 * value is a positive offset from the opcode of the node containing it.
122 * An operand, if any, simply follows the node.  (Note that much of the
123 * code generation knows about this implicit relationship.)
124 *
125 * Using two bytes for the "next" pointer is vast overkill for most things,
126 * but allows patterns to get big without disasters.
127 */
128#define	OP(p)	(*(p))
129#define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130#define	OPERAND(p)	((p) + 3)
131
132/*
133 * See regmagic.h for one further detail of program structure.
134 */
135
136
137/*
138 * Utility definitions.
139 */
140#ifndef CHARBITS
141#define	UCHARAT(p)	((int)*(unsigned char *)(p))
142#else
143#define	UCHARAT(p)	((int)*(p)&CHARBITS)
144#endif
145
146#define	FAIL(m)	{ regerror(m); return(NULL); }
147#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
148#define	META	"^$.[()|?+*\\"
149
150/*
151 * Flags to be passed up and down.
152 */
153#define	HASWIDTH	01	/* Known never to match null string. */
154#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
155#define	SPSTART		04	/* Starts with * or +. */
156#define	WORST		0	/* Worst case. */
157
158/*
159 * Global work variables for regcomp().
160 */
161static char *regparse;		/* Input-scan pointer. */
162static int regnpar;		/* () count. */
163static char regdummy;
164static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
165static long regsize;		/* Code size. */
166
167/*
168 * The first byte of the regexp internal "program" is actually this magic
169 * number; the start node begins in the second byte.
170 */
171#define	MAGIC	0234
172
173
174/*
175 * Forward declarations for regcomp()'s friends.
176 */
177#ifndef STATIC
178#define	STATIC	static
179#endif
180STATIC char *reg();
181STATIC char *regbranch();
182STATIC char *regpiece();
183STATIC char *regatom();
184STATIC char *regnode();
185STATIC char *regnext();
186STATIC void regc();
187STATIC void reginsert();
188STATIC void regtail();
189STATIC void regoptail();
190#ifdef STRCSPN
191STATIC int strcspn();
192#endif
193
194/*
195 - regcomp - compile a regular expression into internal code
196 *
197 * We can't allocate space until we know how big the compiled form will be,
198 * but we can't compile it (and thus know how big it is) until we've got a
199 * place to put the code.  So we cheat:  we compile it twice, once with code
200 * generation turned off and size counting turned on, and once "for real".
201 * This also means that we don't allocate space until we are sure that the
202 * thing really will compile successfully, and we never have to move the
203 * code and thus invalidate pointers into it.  (Note that it has to be in
204 * one piece because free() must be able to free it all.)
205 *
206 * Beware that the optimization-preparation code in here knows about some
207 * of the structure of the compiled regexp.
208 */
209regexp *
210regcomp(exp)
211char *exp;
212{
213	register regexp *r;
214	register char *scan;
215	register char *longest;
216	register int len;
217	int flags;
218
219	if (exp == NULL)
220		FAIL("NULL argument");
221
222	/* First pass: determine size, legality. */
223	regparse = exp;
224	regnpar = 1;
225	regsize = 0L;
226	regcode = &regdummy;
227	regc(MAGIC);
228	if (reg(0, &flags) == NULL)
229		return(NULL);
230
231	/* Small enough for pointer-storage convention? */
232	if (regsize >= 32767L)		/* Probably could be 65535L. */
233		FAIL("regexp too big");
234
235	/* Allocate space. */
236	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237	if (r == NULL)
238		FAIL("out of space");
239
240	/* Second pass: emit code. */
241	regparse = exp;
242	regnpar = 1;
243	regcode = r->program;
244	regc(MAGIC);
245	if (reg(0, &flags) == NULL)
246	{
247		free(r);
248		return(NULL);
249	}
250
251	/* Dig out information for optimizations. */
252	r->regstart = '\0';	/* Worst-case defaults. */
253	r->reganch = 0;
254	r->regmust = NULL;
255	r->regmlen = 0;
256	scan = r->program+1;			/* First BRANCH. */
257	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
258		scan = OPERAND(scan);
259
260		/* Starting-point info. */
261		if (OP(scan) == EXACTLY)
262			r->regstart = *OPERAND(scan);
263		else if (OP(scan) == BOL)
264			r->reganch++;
265
266		/*
267		 * If there's something expensive in the r.e., find the
268		 * longest literal string that must appear and make it the
269		 * regmust.  Resolve ties in favor of later strings, since
270		 * the regstart check works with the beginning of the r.e.
271		 * and avoiding duplication strengthens checking.  Not a
272		 * strong reason, but sufficient in the absence of others.
273		 */
274		if (flags&SPSTART) {
275			longest = NULL;
276			len = 0;
277			for (; scan != NULL; scan = regnext(scan))
278				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
279					longest = OPERAND(scan);
280					len = (int) strlen(OPERAND(scan));
281				}
282			r->regmust = longest;
283			r->regmlen = len;
284		}
285	}
286
287	return(r);
288}
289
290/*
291 - reg - regular expression, i.e. main body or parenthesized thing
292 *
293 * Caller must absorb opening parenthesis.
294 *
295 * Combining parenthesis handling with the base level of regular expression
296 * is a trifle forced, but the need to tie the tails of the branches to what
297 * follows makes it hard to avoid.
298 */
299static char *
300reg(paren, flagp)
301int paren;			/* Parenthesized? */
302int *flagp;
303{
304	register char *ret;
305	register char *br;
306	register char *ender;
307	register int parno = 0;
308	int flags;
309
310	*flagp = HASWIDTH;	/* Tentatively. */
311
312	/* Make an OPEN node, if parenthesized. */
313	if (paren) {
314		if (regnpar >= NSUBEXP)
315			FAIL("too many ()");
316		parno = regnpar;
317		regnpar++;
318		ret = regnode(OPEN+parno);
319	} else
320		ret = NULL;
321
322	/* Pick up the branches, linking them together. */
323	br = regbranch(&flags);
324	if (br == NULL)
325		return(NULL);
326	if (ret != NULL)
327		regtail(ret, br);	/* OPEN -> first. */
328	else
329		ret = br;
330	if (!(flags&HASWIDTH))
331		*flagp &= ~HASWIDTH;
332	*flagp |= flags&SPSTART;
333	while (*regparse == '|') {
334		regparse++;
335		br = regbranch(&flags);
336		if (br == NULL)
337			return(NULL);
338		regtail(ret, br);	/* BRANCH -> BRANCH. */
339		if (!(flags&HASWIDTH))
340			*flagp &= ~HASWIDTH;
341		*flagp |= flags&SPSTART;
342	}
343
344	/* Make a closing node, and hook it on the end. */
345	ender = regnode((paren) ? CLOSE+parno : END);
346	regtail(ret, ender);
347
348	/* Hook the tails of the branches to the closing node. */
349	for (br = ret; br != NULL; br = regnext(br))
350		regoptail(br, ender);
351
352	/* Check for proper termination. */
353	if (paren && *regparse++ != ')') {
354		FAIL("unmatched ()");
355	} else if (!paren && *regparse != '\0') {
356		if (*regparse == ')') {
357			FAIL("unmatched ()");
358		} else
359			FAIL("junk on end");	/* "Can't happen". */
360		/* NOTREACHED */
361	}
362
363	return(ret);
364}
365
366/*
367 - regbranch - one alternative of an | operator
368 *
369 * Implements the concatenation operator.
370 */
371static char *
372regbranch(flagp)
373int *flagp;
374{
375	register char *ret;
376	register char *chain;
377	register char *latest;
378	int flags;
379
380	*flagp = WORST;		/* Tentatively. */
381
382	ret = regnode(BRANCH);
383	chain = NULL;
384	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
385		latest = regpiece(&flags);
386		if (latest == NULL)
387			return(NULL);
388		*flagp |= flags&HASWIDTH;
389		if (chain == NULL)	/* First piece. */
390			*flagp |= flags&SPSTART;
391		else
392			regtail(chain, latest);
393		chain = latest;
394	}
395	if (chain == NULL)	/* Loop ran zero times. */
396		(void) regnode(NOTHING);
397
398	return(ret);
399}
400
401/*
402 - regpiece - something followed by possible [*+?]
403 *
404 * Note that the branching code sequences used for ? and the general cases
405 * of * and + are somewhat optimized:  they use the same NOTHING node as
406 * both the endmarker for their branch list and the body of the last branch.
407 * It might seem that this node could be dispensed with entirely, but the
408 * endmarker role is not redundant.
409 */
410static char *
411regpiece(flagp)
412int *flagp;
413{
414	register char *ret;
415	register char op;
416	register char *next;
417	int flags;
418
419	ret = regatom(&flags);
420	if (ret == NULL)
421		return(NULL);
422
423	op = *regparse;
424	if (!ISMULT(op)) {
425		*flagp = flags;
426		return(ret);
427	}
428
429	if (!(flags&HASWIDTH) && op != '?')
430		FAIL("*+ operand could be empty");
431	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
432
433	if (op == '*' && (flags&SIMPLE))
434		reginsert(STAR, ret);
435	else if (op == '*') {
436		/* Emit x* as (x&|), where & means "self". */
437		reginsert(BRANCH, ret);			/* Either x */
438		regoptail(ret, regnode(BACK));		/* and loop */
439		regoptail(ret, ret);			/* back */
440		regtail(ret, regnode(BRANCH));		/* or */
441		regtail(ret, regnode(NOTHING));		/* null. */
442	} else if (op == '+' && (flags&SIMPLE))
443		reginsert(PLUS, ret);
444	else if (op == '+') {
445		/* Emit x+ as x(&|), where & means "self". */
446		next = regnode(BRANCH);			/* Either */
447		regtail(ret, next);
448		regtail(regnode(BACK), ret);		/* loop back */
449		regtail(next, regnode(BRANCH));		/* or */
450		regtail(ret, regnode(NOTHING));		/* null. */
451	} else if (op == '?') {
452		/* Emit x? as (x|) */
453		reginsert(BRANCH, ret);			/* Either x */
454		regtail(ret, regnode(BRANCH));		/* or */
455		next = regnode(NOTHING);		/* null. */
456		regtail(ret, next);
457		regoptail(ret, next);
458	}
459	regparse++;
460	if (ISMULT(*regparse))
461		FAIL("nested *?+");
462
463	return(ret);
464}
465
466/*
467 - regatom - the lowest level
468 *
469 * Optimization:  gobbles an entire sequence of ordinary characters so that
470 * it can turn them into a single node, which is smaller to store and
471 * faster to run.  Backslashed characters are exceptions, each becoming a
472 * separate node; the code is simpler that way and it's not worth fixing.
473 */
474static char *
475regatom(flagp)
476int *flagp;
477{
478	register char *ret;
479	int flags;
480
481	*flagp = WORST;		/* Tentatively. */
482
483	switch (*regparse++) {
484	case '^':
485		ret = regnode(BOL);
486		break;
487	case '$':
488		ret = regnode(EOL);
489		break;
490	case '.':
491		ret = regnode(ANY);
492		*flagp |= HASWIDTH|SIMPLE;
493		break;
494	case '[': {
495			register int clss;
496			register int classend;
497
498			if (*regparse == '^') {	/* Complement of range. */
499				ret = regnode(ANYBUT);
500				regparse++;
501			} else
502				ret = regnode(ANYOF);
503			if (*regparse == ']' || *regparse == '-')
504				regc(*regparse++);
505			while (*regparse != '\0' && *regparse != ']') {
506				if (*regparse == '-') {
507					regparse++;
508					if (*regparse == ']' || *regparse == '\0')
509						regc('-');
510					else {
511						clss = UCHARAT(regparse-2)+1;
512						classend = UCHARAT(regparse);
513						if (clss > classend+1)
514							FAIL("invalid [] range");
515						for (; clss <= classend; clss++)
516							regc(clss);
517						regparse++;
518					}
519				} else
520					regc(*regparse++);
521			}
522			regc('\0');
523			if (*regparse != ']')
524				FAIL("unmatched []");
525			regparse++;
526			*flagp |= HASWIDTH|SIMPLE;
527		}
528		break;
529	case '(':
530		ret = reg(1, &flags);
531		if (ret == NULL)
532			return(NULL);
533		*flagp |= flags&(HASWIDTH|SPSTART);
534		break;
535	case '\0':
536	case '|':
537	case ')':
538		FAIL("internal urp");	/* Supposed to be caught earlier. */
539		/* NOTREACHED */
540		break;
541	case '?':
542	case '+':
543	case '*':
544		FAIL("?+* follows nothing");
545		/* NOTREACHED */
546		break;
547	case '\\':
548		if (*regparse == '\0')
549			FAIL("trailing \\");
550		ret = regnode(EXACTLY);
551		regc(*regparse++);
552		regc('\0');
553		*flagp |= HASWIDTH|SIMPLE;
554		break;
555	default: {
556			register int len;
557			register char ender;
558
559			regparse--;
560			len = (int) strcspn(regparse, META);
561			if (len <= 0)
562				FAIL("internal disaster");
563			ender = *(regparse+len);
564			if (len > 1 && ISMULT(ender))
565				len--;		/* Back off clear of ?+* operand. */
566			*flagp |= HASWIDTH;
567			if (len == 1)
568				*flagp |= SIMPLE;
569			ret = regnode(EXACTLY);
570			while (len > 0) {
571				regc(*regparse++);
572				len--;
573			}
574			regc('\0');
575		}
576		break;
577	}
578
579	return(ret);
580}
581
582/*
583 - regnode - emit a node
584 */
585static char *			/* Location. */
586regnode(op)
587char op;
588{
589	register char *ret;
590	register char *ptr;
591
592	ret = regcode;
593	if (ret == &regdummy) {
594		regsize += 3;
595		return(ret);
596	}
597
598	ptr = ret;
599	*ptr++ = op;
600	*ptr++ = '\0';		/* Null "next" pointer. */
601	*ptr++ = '\0';
602	regcode = ptr;
603
604	return(ret);
605}
606
607/*
608 - regc - emit (if appropriate) a byte of code
609 */
610static void
611regc(b)
612char b;
613{
614	if (regcode != &regdummy)
615		*regcode++ = b;
616	else
617		regsize++;
618}
619
620/*
621 - reginsert - insert an operator in front of already-emitted operand
622 *
623 * Means relocating the operand.
624 */
625static void
626reginsert(op, opnd)
627char op;
628char *opnd;
629{
630	register char *src;
631	register char *dst;
632	register char *place;
633
634	if (regcode == &regdummy) {
635		regsize += 3;
636		return;
637	}
638
639	src = regcode;
640	regcode += 3;
641	dst = regcode;
642	while (src > opnd)
643		*--dst = *--src;
644
645	place = opnd;		/* Op node, where operand used to be. */
646	*place++ = op;
647	*place++ = '\0';
648	*place++ = '\0';
649}
650
651/*
652 - regtail - set the next-pointer at the end of a node chain
653 */
654static void
655regtail(p, val)
656char *p;
657char *val;
658{
659	register char *scan;
660	register char *temp;
661	register int offset;
662
663	if (p == &regdummy)
664		return;
665
666	/* Find last node. */
667	scan = p;
668	for (;;) {
669		temp = regnext(scan);
670		if (temp == NULL)
671			break;
672		scan = temp;
673	}
674
675	if (OP(scan) == BACK)
676		offset = (int) (scan - val);
677	else
678		offset = (int) (val - scan);
679	*(scan+1) = (offset>>8)&0377;
680	*(scan+2) = offset&0377;
681}
682
683/*
684 - regoptail - regtail on operand of first argument; nop if operandless
685 */
686static void
687regoptail(p, val)
688char *p;
689char *val;
690{
691	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
692	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
693		return;
694	regtail(OPERAND(p), val);
695}
696
697/*
698 * regexec and friends
699 */
700
701/*
702 * Global work variables for regexec().
703 */
704static char *reginput;		/* String-input pointer. */
705static char *regbol;		/* Beginning of input, for ^ check. */
706static char **regstartp;	/* Pointer to startp array. */
707static char **regendp;		/* Ditto for endp. */
708
709/*
710 * Forwards.
711 */
712STATIC int regtry();
713STATIC int regmatch();
714STATIC int regrepeat();
715
716#ifdef DEBUG
717int regnarrate = 0;
718void regdump();
719STATIC char *regprop();
720#endif
721
722/*
723 - regexec - match a regexp against a string
724 */
725int
726regexec2(prog, string, notbol)
727register regexp *prog;
728register char *string;
729int notbol;
730{
731	register char *s;
732
733	/* Be paranoid... */
734	if (prog == NULL || string == NULL) {
735		regerror("NULL parameter");
736		return(0);
737	}
738
739	/* Check validity of program. */
740	if (UCHARAT(prog->program) != MAGIC) {
741		regerror("corrupted program");
742		return(0);
743	}
744
745	/* If there is a "must appear" string, look for it. */
746	if (prog->regmust != NULL) {
747		s = string;
748		while ((s = strchr(s, prog->regmust[0])) != NULL) {
749			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
750				break;	/* Found it. */
751			s++;
752		}
753		if (s == NULL)	/* Not present. */
754			return(0);
755	}
756
757	/* Mark beginning of line for ^ . */
758	if (notbol)
759		regbol = NULL;
760	else
761		regbol = string;
762
763	/* Simplest case:  anchored match need be tried only once. */
764	if (prog->reganch)
765		return(regtry(prog, string));
766
767	/* Messy cases:  unanchored match. */
768	s = string;
769	if (prog->regstart != '\0')
770		/* We know what char it must start with. */
771		while ((s = strchr(s, prog->regstart)) != NULL) {
772			if (regtry(prog, s))
773				return(1);
774			s++;
775		}
776	else
777		/* We don't -- general case. */
778		do {
779			if (regtry(prog, s))
780				return(1);
781		} while (*s++ != '\0');
782
783	/* Failure. */
784	return(0);
785}
786
787int
788regexec(prog, string)
789register regexp *prog;
790register char *string;
791{
792	return regexec2(prog, string, 0);
793}
794
795/*
796 - regtry - try match at specific point
797 */
798static int			/* 0 failure, 1 success */
799regtry(prog, string)
800regexp *prog;
801char *string;
802{
803	register int i;
804	register char **sp;
805	register char **ep;
806
807	reginput = string;
808	regstartp = prog->startp;
809	regendp = prog->endp;
810
811	sp = prog->startp;
812	ep = prog->endp;
813	for (i = NSUBEXP; i > 0; i--) {
814		*sp++ = NULL;
815		*ep++ = NULL;
816	}
817	if (regmatch(prog->program + 1)) {
818		prog->startp[0] = string;
819		prog->endp[0] = reginput;
820		return(1);
821	} else
822		return(0);
823}
824
825/*
826 - regmatch - main matching routine
827 *
828 * Conceptually the strategy is simple:  check to see whether the current
829 * node matches, call self recursively to see whether the rest matches,
830 * and then act accordingly.  In practice we make some effort to avoid
831 * recursion, in particular by going through "ordinary" nodes (that don't
832 * need to know whether the rest of the match failed) by a loop instead of
833 * by recursion.
834 */
835static int			/* 0 failure, 1 success */
836regmatch(prog)
837char *prog;
838{
839	register char *scan;	/* Current node. */
840	char *next;		/* Next node. */
841
842	scan = prog;
843#ifdef DEBUG
844	if (scan != NULL && regnarrate)
845		fprintf(stderr, "%s(\n", regprop(scan));
846#endif
847	while (scan != NULL) {
848#ifdef DEBUG
849		if (regnarrate)
850			fprintf(stderr, "%s...\n", regprop(scan));
851#endif
852		next = regnext(scan);
853
854		switch (OP(scan)) {
855		case BOL:
856			if (reginput != regbol)
857				return(0);
858			break;
859		case EOL:
860			if (*reginput != '\0')
861				return(0);
862			break;
863		case ANY:
864			if (*reginput == '\0')
865				return(0);
866			reginput++;
867			break;
868		case EXACTLY: {
869				register int len;
870				register char *opnd;
871
872				opnd = OPERAND(scan);
873				/* Inline the first character, for speed. */
874				if (*opnd != *reginput)
875					return(0);
876				len = (int) strlen(opnd);
877				if (len > 1 && strncmp(opnd, reginput, len) != 0)
878					return(0);
879				reginput += len;
880			}
881			break;
882		case ANYOF:
883 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
884				return(0);
885			reginput++;
886			break;
887		case ANYBUT:
888 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
889				return(0);
890			reginput++;
891			break;
892		case NOTHING:
893			break;
894		case BACK:
895			break;
896		case OPEN+1:
897		case OPEN+2:
898		case OPEN+3:
899		case OPEN+4:
900		case OPEN+5:
901		case OPEN+6:
902		case OPEN+7:
903		case OPEN+8:
904		case OPEN+9: {
905				register int no;
906				register char *save;
907
908				no = OP(scan) - OPEN;
909				save = reginput;
910
911				if (regmatch(next)) {
912					/*
913					 * Don't set startp if some later
914					 * invocation of the same parentheses
915					 * already has.
916					 */
917					if (regstartp[no] == NULL)
918						regstartp[no] = save;
919					return(1);
920				} else
921					return(0);
922			}
923			/* NOTREACHED */
924			break;
925		case CLOSE+1:
926		case CLOSE+2:
927		case CLOSE+3:
928		case CLOSE+4:
929		case CLOSE+5:
930		case CLOSE+6:
931		case CLOSE+7:
932		case CLOSE+8:
933		case CLOSE+9: {
934				register int no;
935				register char *save;
936
937				no = OP(scan) - CLOSE;
938				save = reginput;
939
940				if (regmatch(next)) {
941					/*
942					 * Don't set endp if some later
943					 * invocation of the same parentheses
944					 * already has.
945					 */
946					if (regendp[no] == NULL)
947						regendp[no] = save;
948					return(1);
949				} else
950					return(0);
951			}
952			/* NOTREACHED */
953			break;
954		case BRANCH: {
955				register char *save;
956
957				if (OP(next) != BRANCH)		/* No choice. */
958					next = OPERAND(scan);	/* Avoid recursion. */
959				else {
960					do {
961						save = reginput;
962						if (regmatch(OPERAND(scan)))
963							return(1);
964						reginput = save;
965						scan = regnext(scan);
966					} while (scan != NULL && OP(scan) == BRANCH);
967					return(0);
968					/* NOTREACHED */
969				}
970			}
971			/* NOTREACHED */
972			break;
973		case STAR:
974		case PLUS: {
975				register char nextch;
976				register int no;
977				register char *save;
978				register int min;
979
980				/*
981				 * Lookahead to avoid useless match attempts
982				 * when we know what character comes next.
983				 */
984				nextch = '\0';
985				if (OP(next) == EXACTLY)
986					nextch = *OPERAND(next);
987				min = (OP(scan) == STAR) ? 0 : 1;
988				save = reginput;
989				no = regrepeat(OPERAND(scan));
990				while (no >= min) {
991					/* If it could work, try it. */
992					if (nextch == '\0' || *reginput == nextch)
993						if (regmatch(next))
994							return(1);
995					/* Couldn't or didn't -- back up. */
996					no--;
997					reginput = save + no;
998				}
999				return(0);
1000			}
1001			/* NOTREACHED */
1002			break;
1003		case END:
1004			return(1);	/* Success! */
1005			/* NOTREACHED */
1006			break;
1007		default:
1008			regerror("memory corruption");
1009			return(0);
1010			/* NOTREACHED */
1011			break;
1012		}
1013
1014		scan = next;
1015	}
1016
1017	/*
1018	 * We get here only if there's trouble -- normally "case END" is
1019	 * the terminating point.
1020	 */
1021	regerror("corrupted pointers");
1022	return(0);
1023}
1024
1025/*
1026 - regrepeat - repeatedly match something simple, report how many
1027 */
1028static int
1029regrepeat(p)
1030char *p;
1031{
1032	register int count = 0;
1033	register char *scan;
1034	register char *opnd;
1035
1036	scan = reginput;
1037	opnd = OPERAND(p);
1038	switch (OP(p)) {
1039	case ANY:
1040		count = (int) strlen(scan);
1041		scan += count;
1042		break;
1043	case EXACTLY:
1044		while (*opnd == *scan) {
1045			count++;
1046			scan++;
1047		}
1048		break;
1049	case ANYOF:
1050		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1051			count++;
1052			scan++;
1053		}
1054		break;
1055	case ANYBUT:
1056		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1057			count++;
1058			scan++;
1059		}
1060		break;
1061	default:		/* Oh dear.  Called inappropriately. */
1062		regerror("internal foulup");
1063		count = 0;	/* Best compromise. */
1064		break;
1065	}
1066	reginput = scan;
1067
1068	return(count);
1069}
1070
1071/*
1072 - regnext - dig the "next" pointer out of a node
1073 */
1074static char *
1075regnext(p)
1076register char *p;
1077{
1078	register int offset;
1079
1080	if (p == &regdummy)
1081		return(NULL);
1082
1083	offset = NEXT(p);
1084	if (offset == 0)
1085		return(NULL);
1086
1087	if (OP(p) == BACK)
1088		return(p-offset);
1089	else
1090		return(p+offset);
1091}
1092
1093#ifdef DEBUG
1094
1095STATIC char *regprop();
1096
1097/*
1098 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1099 */
1100void
1101regdump(r)
1102regexp *r;
1103{
1104	register char *s;
1105	register char op = EXACTLY;	/* Arbitrary non-END op. */
1106	register char *next;
1107
1108
1109	s = r->program + 1;
1110	while (op != END) {	/* While that wasn't END last time... */
1111		op = OP(s);
1112		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1113		next = regnext(s);
1114		if (next == NULL)		/* Next ptr. */
1115			printf("(0)");
1116		else
1117			printf("(%d)", (s-r->program)+(next-s));
1118		s += 3;
1119		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1120			/* Literal string, where present. */
1121			while (*s != '\0') {
1122				putchar(*s);
1123				s++;
1124			}
1125			s++;
1126		}
1127		putchar('\n');
1128	}
1129
1130	/* Header fields of interest. */
1131	if (r->regstart != '\0')
1132		printf("start `%c' ", r->regstart);
1133	if (r->reganch)
1134		printf("anchored ");
1135	if (r->regmust != NULL)
1136		printf("must have \"%s\"", r->regmust);
1137	printf("\n");
1138}
1139
1140/*
1141 - regprop - printable representation of opcode
1142 */
1143static char *
1144regprop(op)
1145char *op;
1146{
1147	register char *p;
1148	static char buf[50];
1149
1150	(void) strcpy(buf, ":");
1151
1152	switch (OP(op)) {
1153	case BOL:
1154		p = "BOL";
1155		break;
1156	case EOL:
1157		p = "EOL";
1158		break;
1159	case ANY:
1160		p = "ANY";
1161		break;
1162	case ANYOF:
1163		p = "ANYOF";
1164		break;
1165	case ANYBUT:
1166		p = "ANYBUT";
1167		break;
1168	case BRANCH:
1169		p = "BRANCH";
1170		break;
1171	case EXACTLY:
1172		p = "EXACTLY";
1173		break;
1174	case NOTHING:
1175		p = "NOTHING";
1176		break;
1177	case BACK:
1178		p = "BACK";
1179		break;
1180	case END:
1181		p = "END";
1182		break;
1183	case OPEN+1:
1184	case OPEN+2:
1185	case OPEN+3:
1186	case OPEN+4:
1187	case OPEN+5:
1188	case OPEN+6:
1189	case OPEN+7:
1190	case OPEN+8:
1191	case OPEN+9:
1192		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1193		p = NULL;
1194		break;
1195	case CLOSE+1:
1196	case CLOSE+2:
1197	case CLOSE+3:
1198	case CLOSE+4:
1199	case CLOSE+5:
1200	case CLOSE+6:
1201	case CLOSE+7:
1202	case CLOSE+8:
1203	case CLOSE+9:
1204		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1205		p = NULL;
1206		break;
1207	case STAR:
1208		p = "STAR";
1209		break;
1210	case PLUS:
1211		p = "PLUS";
1212		break;
1213	default:
1214		regerror("corrupted opcode");
1215		break;
1216	}
1217	if (p != NULL)
1218		(void) strcat(buf, p);
1219	return(buf);
1220}
1221#endif
1222
1223/*
1224 * The following is provided for those people who do not have strcspn() in
1225 * their C libraries.  They should get off their butts and do something
1226 * about it; at least one public-domain implementation of those (highly
1227 * useful) string routines has been published on Usenet.
1228 */
1229#ifdef STRCSPN
1230/*
1231 * strcspn - find length of initial segment of s1 consisting entirely
1232 * of characters not from s2
1233 */
1234
1235static int
1236strcspn(s1, s2)
1237char *s1;
1238char *s2;
1239{
1240	register char *scan1;
1241	register char *scan2;
1242	register int count;
1243
1244	count = 0;
1245	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1246		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1247			if (*scan1 == *scan2++)
1248				return(count);
1249		count++;
1250	}
1251	return(count);
1252}
1253#endif
1254