1/*	$NetBSD: regexp.c,v 1.4 2023/10/06 05:49:49 simonb Exp $	*/
2
3/*
4 * regcomp and regexec -- regsub and regerror are elsewhere
5 *
6 *	Copyright (c) 1986 by University of Toronto.
7 *	Written by Henry Spencer.  Not derived from licensed software.
8 *
9 *	Permission is granted to anyone to use this software for any
10 *	purpose on any computer system, and to redistribute it freely,
11 *	subject to the following restrictions:
12 *
13 *	1. The author is not responsible for the consequences of use of
14 *		this software, no matter how awful, even if they arise
15 *		from defects in it.
16 *
17 *	2. The origin of this software must not be misrepresented, either
18 *		by explicit claim or by omission.
19 *
20 *	3. Altered versions must be plainly marked as such, and must not
21 *		be misrepresented as being the original software.
22 *
23 * Beware that some of this code is subtly aware of the way operator
24 * precedence is structured in regular expressions.  Serious changes in
25 * regular-expression syntax might require a total rethink.
26 *
27 * *** NOTE: this code has been altered slightly for use in Tcl. ***
28 * Slightly modified by David MacKenzie to undo most of the changes for TCL.
29 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
30 */
31
32#include "less.h"
33#if HAVE_STDIO_H
34#include <stdio.h>
35#endif
36#if HAVE_STDLIB_H
37#include <stdlib.h>
38#endif
39#if HAVE_STRING_H
40#include <string.h>
41#endif
42#include "regexp.h"
43
44/*
45 * The "internal use only" fields in regexp.h are present to pass info from
46 * compile to execute that permits the execute phase to run lots faster on
47 * simple cases.  They are:
48 *
49 * regstart	char that must begin a match; '\0' if none obvious
50 * reganch	is the match anchored (at beginning-of-line only)?
51 * regmust	string (pointer into program) that match must include, or NULL
52 * regmlen	length of regmust string
53 *
54 * Regstart and reganch permit very fast decisions on suitable starting points
55 * for a match, cutting down the work a lot.  Regmust permits fast rejection
56 * of lines that cannot possibly match.  The regmust tests are costly enough
57 * that regcomp() supplies a regmust only if the r.e. contains something
58 * potentially expensive (at present, the only such thing detected is * or +
59 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
60 * supplied because the test in regexec() needs it and regcomp() is
61 * computing it anyway.
62 */
63
64/*
65 * Structure for regexp "program".  This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology).  Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
71 * have one of the subtle syntax dependencies:  an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.)  The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM.  In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure:  the tail of the branch connects
77 * to the thing following the set of BRANCHes.)  The opcodes are:
78 */
79
80/* definition	number	opnd?	meaning */
81#undef EOL
82#define	END	0	/* no	End of program. */
83#define	BOL	1	/* no	Match "" at beginning of line. */
84#define	EOL	2	/* no	Match "" at end of line. */
85#define	ANY	3	/* no	Match any one character. */
86#define	ANYOF	4	/* str	Match any character in this string. */
87#define	ANYBUT	5	/* str	Match any character not in this string. */
88#define	BRANCH	6	/* node	Match this alternative, or the next... */
89#define	BACK	7	/* no	Match "", "next" ptr points backward. */
90#define	EXACTLY	8	/* str	Match this string. */
91#define	NOTHING	9	/* no	Match empty string. */
92#define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
93#define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
94#define	OPEN	20	/* no	Mark this point in input as start of #n. */
95			/*	OPEN+1 is number 1, etc. */
96#define	CLOSE	30	/* no	Analogous to OPEN. */
97
98/*
99 * Opcode notes:
100 *
101 * BRANCH	The set of branches constituting a single choice are hooked
102 *		together with their "next" pointers, since precedence prevents
103 *		anything being concatenated to any individual branch.  The
104 *		"next" pointer of the last BRANCH in a choice points to the
105 *		thing following the whole choice.  This is also where the
106 *		final "next" pointer of each individual branch points; each
107 *		branch starts with the operand node of a BRANCH node.
108 *
109 * BACK		Normal "next" pointers all implicitly point forward; BACK
110 *		exists to make loop structures possible.
111 *
112 * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
113 *		BRANCH structures using BACK.  Simple cases (one character
114 *		per match) are implemented with STAR and PLUS for speed
115 *		and to minimize recursive plunges.
116 *
117 * OPEN,CLOSE	...are numbered at compile time.
118 */
119
120/*
121 * A node is one char of opcode followed by two chars of "next" pointer.
122 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
123 * value is a positive offset from the opcode of the node containing it.
124 * An operand, if any, simply follows the node.  (Note that much of the
125 * code generation knows about this implicit relationship.)
126 *
127 * Using two bytes for the "next" pointer is vast overkill for most things,
128 * but allows patterns to get big without disasters.
129 */
130#define	OP(p)	(*(p))
131#define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
132#define	OPERAND(p)	((p) + 3)
133
134/*
135 * See regmagic.h for one further detail of program structure.
136 */
137
138
139/*
140 * Utility definitions.
141 */
142#ifndef CHARBITS
143#define	UCHARAT(p)	((int)*(unsigned char *)(p))
144#else
145#define	UCHARAT(p)	((int)*(p)&CHARBITS)
146#endif
147
148#define	FAIL(m)	{ regerror(m); return(NULL); }
149#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
150#define	META	"^$.[()|?+*\\"
151
152/*
153 * Flags to be passed up and down.
154 */
155#define	HASWIDTH	01	/* Known never to match null string. */
156#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
157#define	SPSTART		04	/* Starts with * or +. */
158#define	WORST		0	/* Worst case. */
159
160/*
161 * Global work variables for regcomp().
162 */
163static char *regparse;		/* Input-scan pointer. */
164static int regnpar;		/* () count. */
165static char regdummy;
166static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
167static long regsize;		/* Code size. */
168
169/*
170 * The first byte of the regexp internal "program" is actually this magic
171 * number; the start node begins in the second byte.
172 */
173#define	MAGIC	0234
174
175
176/*
177 * Forward declarations for regcomp()'s friends.
178 */
179#ifndef STATIC
180#define	STATIC	static
181#endif
182STATIC char *reg();
183STATIC char *regbranch();
184STATIC char *regpiece();
185STATIC char *regatom();
186STATIC char *regnode();
187STATIC char *regnext();
188STATIC void regc();
189STATIC void reginsert();
190STATIC void regtail();
191STATIC void regoptail();
192#ifdef STRCSPN
193STATIC int strcspn();
194#endif
195
196/*
197 - regcomp - compile a regular expression into internal code
198 *
199 * We can't allocate space until we know how big the compiled form will be,
200 * but we can't compile it (and thus know how big it is) until we've got a
201 * place to put the code.  So we cheat:  we compile it twice, once with code
202 * generation turned off and size counting turned on, and once "for real".
203 * This also means that we don't allocate space until we are sure that the
204 * thing really will compile successfully, and we never have to move the
205 * code and thus invalidate pointers into it.  (Note that it has to be in
206 * one piece because free() must be able to free it all.)
207 *
208 * Beware that the optimization-preparation code in here knows about some
209 * of the structure of the compiled regexp.
210 */
211regexp *
212regcomp(exp)
213char *exp;
214{
215	register regexp *r;
216	register char *scan;
217	register char *longest;
218	register int len;
219	int flags;
220
221	if (exp == NULL)
222		FAIL("NULL argument");
223
224	/* First pass: determine size, legality. */
225	regparse = exp;
226	regnpar = 1;
227	regsize = 0L;
228	regcode = &regdummy;
229	regc(MAGIC);
230	if (reg(0, &flags) == NULL)
231		return(NULL);
232
233	/* Small enough for pointer-storage convention? */
234	if (regsize >= 32767L)		/* Probably could be 65535L. */
235		FAIL("regexp too big");
236
237	/* Allocate space. */
238	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
239	if (r == NULL)
240		FAIL("out of space");
241
242	/* Second pass: emit code. */
243	regparse = exp;
244	regnpar = 1;
245	regcode = r->program;
246	regc(MAGIC);
247	if (reg(0, &flags) == NULL)
248	{
249		free(r);
250		return(NULL);
251	}
252
253	/* Dig out information for optimizations. */
254	r->regstart = '\0';	/* Worst-case defaults. */
255	r->reganch = 0;
256	r->regmust = NULL;
257	r->regmlen = 0;
258	scan = r->program+1;			/* First BRANCH. */
259	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
260		scan = OPERAND(scan);
261
262		/* Starting-point info. */
263		if (OP(scan) == EXACTLY)
264			r->regstart = *OPERAND(scan);
265		else if (OP(scan) == BOL)
266			r->reganch++;
267
268		/*
269		 * If there's something expensive in the r.e., find the
270		 * longest literal string that must appear and make it the
271		 * regmust.  Resolve ties in favor of later strings, since
272		 * the regstart check works with the beginning of the r.e.
273		 * and avoiding duplication strengthens checking.  Not a
274		 * strong reason, but sufficient in the absence of others.
275		 */
276		if (flags&SPSTART) {
277			longest = NULL;
278			len = 0;
279			for (; scan != NULL; scan = regnext(scan))
280				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
281					longest = OPERAND(scan);
282					len = (int) strlen(OPERAND(scan));
283				}
284			r->regmust = longest;
285			r->regmlen = len;
286		}
287	}
288
289	return(r);
290}
291
292/*
293 - reg - regular expression, i.e. main body or parenthesized thing
294 *
295 * Caller must absorb opening parenthesis.
296 *
297 * Combining parenthesis handling with the base level of regular expression
298 * is a trifle forced, but the need to tie the tails of the branches to what
299 * follows makes it hard to avoid.
300 */
301static char *
302reg(paren, flagp)
303int paren;			/* Parenthesized? */
304int *flagp;
305{
306	register char *ret;
307	register char *br;
308	register char *ender;
309	register int parno = 0;
310	int flags;
311
312	*flagp = HASWIDTH;	/* Tentatively. */
313
314	/* Make an OPEN node, if parenthesized. */
315	if (paren) {
316		if (regnpar >= NSUBEXP)
317			FAIL("too many ()");
318		parno = regnpar;
319		regnpar++;
320		ret = regnode(OPEN+parno);
321	} else
322		ret = NULL;
323
324	/* Pick up the branches, linking them together. */
325	br = regbranch(&flags);
326	if (br == NULL)
327		return(NULL);
328	if (ret != NULL)
329		regtail(ret, br);	/* OPEN -> first. */
330	else
331		ret = br;
332	if (!(flags&HASWIDTH))
333		*flagp &= ~HASWIDTH;
334	*flagp |= flags&SPSTART;
335	while (*regparse == '|') {
336		regparse++;
337		br = regbranch(&flags);
338		if (br == NULL)
339			return(NULL);
340		regtail(ret, br);	/* BRANCH -> BRANCH. */
341		if (!(flags&HASWIDTH))
342			*flagp &= ~HASWIDTH;
343		*flagp |= flags&SPSTART;
344	}
345
346	/* Make a closing node, and hook it on the end. */
347	ender = regnode((paren) ? CLOSE+parno : END);
348	regtail(ret, ender);
349
350	/* Hook the tails of the branches to the closing node. */
351	for (br = ret; br != NULL; br = regnext(br))
352		regoptail(br, ender);
353
354	/* Check for proper termination. */
355	if (paren && *regparse++ != ')') {
356		FAIL("unmatched ()");
357	} else if (!paren && *regparse != '\0') {
358		if (*regparse == ')') {
359			FAIL("unmatched ()");
360		} else
361			FAIL("junk on end");	/* "Can't happen". */
362		/* NOTREACHED */
363	}
364
365	return(ret);
366}
367
368/*
369 - regbranch - one alternative of an | operator
370 *
371 * Implements the concatenation operator.
372 */
373static char *
374regbranch(flagp)
375int *flagp;
376{
377	register char *ret;
378	register char *chain;
379	register char *latest;
380	int flags;
381
382	*flagp = WORST;		/* Tentatively. */
383
384	ret = regnode(BRANCH);
385	chain = NULL;
386	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
387		latest = regpiece(&flags);
388		if (latest == NULL)
389			return(NULL);
390		*flagp |= flags&HASWIDTH;
391		if (chain == NULL)	/* First piece. */
392			*flagp |= flags&SPSTART;
393		else
394			regtail(chain, latest);
395		chain = latest;
396	}
397	if (chain == NULL)	/* Loop ran zero times. */
398		(void) regnode(NOTHING);
399
400	return(ret);
401}
402
403/*
404 - regpiece - something followed by possible [*+?]
405 *
406 * Note that the branching code sequences used for ? and the general cases
407 * of * and + are somewhat optimized:  they use the same NOTHING node as
408 * both the endmarker for their branch list and the body of the last branch.
409 * It might seem that this node could be dispensed with entirely, but the
410 * endmarker role is not redundant.
411 */
412static char *
413regpiece(flagp)
414int *flagp;
415{
416	register char *ret;
417	register char op;
418	register char *next;
419	int flags;
420
421	ret = regatom(&flags);
422	if (ret == NULL)
423		return(NULL);
424
425	op = *regparse;
426	if (!ISMULT(op)) {
427		*flagp = flags;
428		return(ret);
429	}
430
431	if (!(flags&HASWIDTH) && op != '?')
432		FAIL("*+ operand could be empty");
433	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
434
435	if (op == '*' && (flags&SIMPLE))
436		reginsert(STAR, ret);
437	else if (op == '*') {
438		/* Emit x* as (x&|), where & means "self". */
439		reginsert(BRANCH, ret);			/* Either x */
440		regoptail(ret, regnode(BACK));		/* and loop */
441		regoptail(ret, ret);			/* back */
442		regtail(ret, regnode(BRANCH));		/* or */
443		regtail(ret, regnode(NOTHING));		/* null. */
444	} else if (op == '+' && (flags&SIMPLE))
445		reginsert(PLUS, ret);
446	else if (op == '+') {
447		/* Emit x+ as x(&|), where & means "self". */
448		next = regnode(BRANCH);			/* Either */
449		regtail(ret, next);
450		regtail(regnode(BACK), ret);		/* loop back */
451		regtail(next, regnode(BRANCH));		/* or */
452		regtail(ret, regnode(NOTHING));		/* null. */
453	} else if (op == '?') {
454		/* Emit x? as (x|) */
455		reginsert(BRANCH, ret);			/* Either x */
456		regtail(ret, regnode(BRANCH));		/* or */
457		next = regnode(NOTHING);		/* null. */
458		regtail(ret, next);
459		regoptail(ret, next);
460	}
461	regparse++;
462	if (ISMULT(*regparse))
463		FAIL("nested *?+");
464
465	return(ret);
466}
467
468/*
469 - regatom - the lowest level
470 *
471 * Optimization:  gobbles an entire sequence of ordinary characters so that
472 * it can turn them into a single node, which is smaller to store and
473 * faster to run.  Backslashed characters are exceptions, each becoming a
474 * separate node; the code is simpler that way and it's not worth fixing.
475 */
476static char *
477regatom(flagp)
478int *flagp;
479{
480	register char *ret;
481	int flags;
482
483	*flagp = WORST;		/* Tentatively. */
484
485	switch (*regparse++) {
486	case '^':
487		ret = regnode(BOL);
488		break;
489	case '$':
490		ret = regnode(EOL);
491		break;
492	case '.':
493		ret = regnode(ANY);
494		*flagp |= HASWIDTH|SIMPLE;
495		break;
496	case '[': {
497			register int clss;
498			register int classend;
499
500			if (*regparse == '^') {	/* Complement of range. */
501				ret = regnode(ANYBUT);
502				regparse++;
503			} else
504				ret = regnode(ANYOF);
505			if (*regparse == ']' || *regparse == '-')
506				regc(*regparse++);
507			while (*regparse != '\0' && *regparse != ']') {
508				if (*regparse == '-') {
509					regparse++;
510					if (*regparse == ']' || *regparse == '\0')
511						regc('-');
512					else {
513						clss = UCHARAT(regparse-2)+1;
514						classend = UCHARAT(regparse);
515						if (clss > classend+1)
516							FAIL("invalid [] range");
517						for (; clss <= classend; clss++)
518							regc(clss);
519						regparse++;
520					}
521				} else
522					regc(*regparse++);
523			}
524			regc('\0');
525			if (*regparse != ']')
526				FAIL("unmatched []");
527			regparse++;
528			*flagp |= HASWIDTH|SIMPLE;
529		}
530		break;
531	case '(':
532		ret = reg(1, &flags);
533		if (ret == NULL)
534			return(NULL);
535		*flagp |= flags&(HASWIDTH|SPSTART);
536		break;
537	case '\0':
538	case '|':
539	case ')':
540		FAIL("internal urp");	/* Supposed to be caught earlier. */
541		/* NOTREACHED */
542		break;
543	case '?':
544	case '+':
545	case '*':
546		FAIL("?+* follows nothing");
547		/* NOTREACHED */
548		break;
549	case '\\':
550		if (*regparse == '\0')
551			FAIL("trailing \\");
552		ret = regnode(EXACTLY);
553		regc(*regparse++);
554		regc('\0');
555		*flagp |= HASWIDTH|SIMPLE;
556		break;
557	default: {
558			register int len;
559			register char ender;
560
561			regparse--;
562			len = (int) strcspn(regparse, META);
563			if (len <= 0)
564				FAIL("internal disaster");
565			ender = *(regparse+len);
566			if (len > 1 && ISMULT(ender))
567				len--;		/* Back off clear of ?+* operand. */
568			*flagp |= HASWIDTH;
569			if (len == 1)
570				*flagp |= SIMPLE;
571			ret = regnode(EXACTLY);
572			while (len > 0) {
573				regc(*regparse++);
574				len--;
575			}
576			regc('\0');
577		}
578		break;
579	}
580
581	return(ret);
582}
583
584/*
585 - regnode - emit a node
586 */
587static char *			/* Location. */
588regnode(op)
589char op;
590{
591	register char *ret;
592	register char *ptr;
593
594	ret = regcode;
595	if (ret == &regdummy) {
596		regsize += 3;
597		return(ret);
598	}
599
600	ptr = ret;
601	*ptr++ = op;
602	*ptr++ = '\0';		/* Null "next" pointer. */
603	*ptr++ = '\0';
604	regcode = ptr;
605
606	return(ret);
607}
608
609/*
610 - regc - emit (if appropriate) a byte of code
611 */
612static void
613regc(b)
614char b;
615{
616	if (regcode != &regdummy)
617		*regcode++ = b;
618	else
619		regsize++;
620}
621
622/*
623 - reginsert - insert an operator in front of already-emitted operand
624 *
625 * Means relocating the operand.
626 */
627static void
628reginsert(op, opnd)
629char op;
630char *opnd;
631{
632	register char *src;
633	register char *dst;
634	register char *place;
635
636	if (regcode == &regdummy) {
637		regsize += 3;
638		return;
639	}
640
641	src = regcode;
642	regcode += 3;
643	dst = regcode;
644	while (src > opnd)
645		*--dst = *--src;
646
647	place = opnd;		/* Op node, where operand used to be. */
648	*place++ = op;
649	*place++ = '\0';
650	*place++ = '\0';
651}
652
653/*
654 - regtail - set the next-pointer at the end of a node chain
655 */
656static void
657regtail(p, val)
658char *p;
659char *val;
660{
661	register char *scan;
662	register char *temp;
663	register int offset;
664
665	if (p == &regdummy)
666		return;
667
668	/* Find last node. */
669	scan = p;
670	for (;;) {
671		temp = regnext(scan);
672		if (temp == NULL)
673			break;
674		scan = temp;
675	}
676
677	if (OP(scan) == BACK)
678		offset = (int) (scan - val);
679	else
680		offset = (int) (val - scan);
681	*(scan+1) = (offset>>8)&0377;
682	*(scan+2) = offset&0377;
683}
684
685/*
686 - regoptail - regtail on operand of first argument; nop if operandless
687 */
688static void
689regoptail(p, val)
690char *p;
691char *val;
692{
693	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
694	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
695		return;
696	regtail(OPERAND(p), val);
697}
698
699/*
700 * regexec and friends
701 */
702
703/*
704 * Global work variables for regexec().
705 */
706static char *reginput;		/* String-input pointer. */
707static char *regbol;		/* Beginning of input, for ^ check. */
708static char **regstartp;	/* Pointer to startp array. */
709static char **regendp;		/* Ditto for endp. */
710
711/*
712 * Forwards.
713 */
714STATIC int regtry();
715STATIC int regmatch();
716STATIC int regrepeat();
717
718#ifdef DEBUG
719int regnarrate = 0;
720void regdump();
721STATIC char *regprop();
722#endif
723
724/*
725 - regexec - match a regexp against a string
726 */
727int
728regexec2(prog, string, notbol)
729register regexp *prog;
730register char *string;
731int notbol;
732{
733	register char *s;
734
735	/* Be paranoid... */
736	if (prog == NULL || string == NULL) {
737		regerror("NULL parameter");
738		return(0);
739	}
740
741	/* Check validity of program. */
742	if (UCHARAT(prog->program) != MAGIC) {
743		regerror("corrupted program");
744		return(0);
745	}
746
747	/* If there is a "must appear" string, look for it. */
748	if (prog->regmust != NULL) {
749		s = string;
750		while ((s = strchr(s, prog->regmust[0])) != NULL) {
751			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
752				break;	/* Found it. */
753			s++;
754		}
755		if (s == NULL)	/* Not present. */
756			return(0);
757	}
758
759	/* Mark beginning of line for ^ . */
760	if (notbol)
761		regbol = NULL;
762	else
763		regbol = string;
764
765	/* Simplest case:  anchored match need be tried only once. */
766	if (prog->reganch)
767		return(regtry(prog, string));
768
769	/* Messy cases:  unanchored match. */
770	s = string;
771	if (prog->regstart != '\0')
772		/* We know what char it must start with. */
773		while ((s = strchr(s, prog->regstart)) != NULL) {
774			if (regtry(prog, s))
775				return(1);
776			s++;
777		}
778	else
779		/* We don't -- general case. */
780		do {
781			if (regtry(prog, s))
782				return(1);
783		} while (*s++ != '\0');
784
785	/* Failure. */
786	return(0);
787}
788
789int
790regexec(prog, string)
791register regexp *prog;
792register char *string;
793{
794	return regexec2(prog, string, 0);
795}
796
797/*
798 - regtry - try match at specific point
799 */
800static int			/* 0 failure, 1 success */
801regtry(prog, string)
802regexp *prog;
803char *string;
804{
805	register int i;
806	register char **sp;
807	register char **ep;
808
809	reginput = string;
810	regstartp = prog->startp;
811	regendp = prog->endp;
812
813	sp = prog->startp;
814	ep = prog->endp;
815	for (i = NSUBEXP; i > 0; i--) {
816		*sp++ = NULL;
817		*ep++ = NULL;
818	}
819	if (regmatch(prog->program + 1)) {
820		prog->startp[0] = string;
821		prog->endp[0] = reginput;
822		return(1);
823	} else
824		return(0);
825}
826
827/*
828 - regmatch - main matching routine
829 *
830 * Conceptually the strategy is simple:  check to see whether the current
831 * node matches, call self recursively to see whether the rest matches,
832 * and then act accordingly.  In practice we make some effort to avoid
833 * recursion, in particular by going through "ordinary" nodes (that don't
834 * need to know whether the rest of the match failed) by a loop instead of
835 * by recursion.
836 */
837static int			/* 0 failure, 1 success */
838regmatch(prog)
839char *prog;
840{
841	register char *scan;	/* Current node. */
842	char *next;		/* Next node. */
843
844	scan = prog;
845#ifdef DEBUG
846	if (scan != NULL && regnarrate)
847		fprintf(stderr, "%s(\n", regprop(scan));
848#endif
849	while (scan != NULL) {
850#ifdef DEBUG
851		if (regnarrate)
852			fprintf(stderr, "%s...\n", regprop(scan));
853#endif
854		next = regnext(scan);
855
856		switch (OP(scan)) {
857		case BOL:
858			if (reginput != regbol)
859				return(0);
860			break;
861		case EOL:
862			if (*reginput != '\0')
863				return(0);
864			break;
865		case ANY:
866			if (*reginput == '\0')
867				return(0);
868			reginput++;
869			break;
870		case EXACTLY: {
871				register int len;
872				register char *opnd;
873
874				opnd = OPERAND(scan);
875				/* Inline the first character, for speed. */
876				if (*opnd != *reginput)
877					return(0);
878				len = (int) strlen(opnd);
879				if (len > 1 && strncmp(opnd, reginput, len) != 0)
880					return(0);
881				reginput += len;
882			}
883			break;
884		case ANYOF:
885 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
886				return(0);
887			reginput++;
888			break;
889		case ANYBUT:
890 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
891				return(0);
892			reginput++;
893			break;
894		case NOTHING:
895			break;
896		case BACK:
897			break;
898		case OPEN+1:
899		case OPEN+2:
900		case OPEN+3:
901		case OPEN+4:
902		case OPEN+5:
903		case OPEN+6:
904		case OPEN+7:
905		case OPEN+8:
906		case OPEN+9: {
907				register int no;
908				register char *save;
909
910				no = OP(scan) - OPEN;
911				save = reginput;
912
913				if (regmatch(next)) {
914					/*
915					 * Don't set startp if some later
916					 * invocation of the same parentheses
917					 * already has.
918					 */
919					if (regstartp[no] == NULL)
920						regstartp[no] = save;
921					return(1);
922				} else
923					return(0);
924			}
925			/* NOTREACHED */
926			break;
927		case CLOSE+1:
928		case CLOSE+2:
929		case CLOSE+3:
930		case CLOSE+4:
931		case CLOSE+5:
932		case CLOSE+6:
933		case CLOSE+7:
934		case CLOSE+8:
935		case CLOSE+9: {
936				register int no;
937				register char *save;
938
939				no = OP(scan) - CLOSE;
940				save = reginput;
941
942				if (regmatch(next)) {
943					/*
944					 * Don't set endp if some later
945					 * invocation of the same parentheses
946					 * already has.
947					 */
948					if (regendp[no] == NULL)
949						regendp[no] = save;
950					return(1);
951				} else
952					return(0);
953			}
954			/* NOTREACHED */
955			break;
956		case BRANCH: {
957				register char *save;
958
959				if (OP(next) != BRANCH)		/* No choice. */
960					next = OPERAND(scan);	/* Avoid recursion. */
961				else {
962					do {
963						save = reginput;
964						if (regmatch(OPERAND(scan)))
965							return(1);
966						reginput = save;
967						scan = regnext(scan);
968					} while (scan != NULL && OP(scan) == BRANCH);
969					return(0);
970					/* NOTREACHED */
971				}
972			}
973			/* NOTREACHED */
974			break;
975		case STAR:
976		case PLUS: {
977				register char nextch;
978				register int no;
979				register char *save;
980				register int min;
981
982				/*
983				 * Lookahead to avoid useless match attempts
984				 * when we know what character comes next.
985				 */
986				nextch = '\0';
987				if (OP(next) == EXACTLY)
988					nextch = *OPERAND(next);
989				min = (OP(scan) == STAR) ? 0 : 1;
990				save = reginput;
991				no = regrepeat(OPERAND(scan));
992				while (no >= min) {
993					/* If it could work, try it. */
994					if (nextch == '\0' || *reginput == nextch)
995						if (regmatch(next))
996							return(1);
997					/* Couldn't or didn't -- back up. */
998					no--;
999					reginput = save + no;
1000				}
1001				return(0);
1002			}
1003			/* NOTREACHED */
1004			break;
1005		case END:
1006			return(1);	/* Success! */
1007			/* NOTREACHED */
1008			break;
1009		default:
1010			regerror("memory corruption");
1011			return(0);
1012			/* NOTREACHED */
1013			break;
1014		}
1015
1016		scan = next;
1017	}
1018
1019	/*
1020	 * We get here only if there's trouble -- normally "case END" is
1021	 * the terminating point.
1022	 */
1023	regerror("corrupted pointers");
1024	return(0);
1025}
1026
1027/*
1028 - regrepeat - repeatedly match something simple, report how many
1029 */
1030static int
1031regrepeat(p)
1032char *p;
1033{
1034	register int count = 0;
1035	register char *scan;
1036	register char *opnd;
1037
1038	scan = reginput;
1039	opnd = OPERAND(p);
1040	switch (OP(p)) {
1041	case ANY:
1042		count = (int) strlen(scan);
1043		scan += count;
1044		break;
1045	case EXACTLY:
1046		while (*opnd == *scan) {
1047			count++;
1048			scan++;
1049		}
1050		break;
1051	case ANYOF:
1052		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1053			count++;
1054			scan++;
1055		}
1056		break;
1057	case ANYBUT:
1058		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1059			count++;
1060			scan++;
1061		}
1062		break;
1063	default:		/* Oh dear.  Called inappropriately. */
1064		regerror("internal foulup");
1065		count = 0;	/* Best compromise. */
1066		break;
1067	}
1068	reginput = scan;
1069
1070	return(count);
1071}
1072
1073/*
1074 - regnext - dig the "next" pointer out of a node
1075 */
1076static char *
1077regnext(p)
1078register char *p;
1079{
1080	register int offset;
1081
1082	if (p == &regdummy)
1083		return(NULL);
1084
1085	offset = NEXT(p);
1086	if (offset == 0)
1087		return(NULL);
1088
1089	if (OP(p) == BACK)
1090		return(p-offset);
1091	else
1092		return(p+offset);
1093}
1094
1095#ifdef DEBUG
1096
1097STATIC char *regprop();
1098
1099/*
1100 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1101 */
1102void
1103regdump(r)
1104regexp *r;
1105{
1106	register char *s;
1107	register char op = EXACTLY;	/* Arbitrary non-END op. */
1108	register char *next;
1109
1110
1111	s = r->program + 1;
1112	while (op != END) {	/* While that wasn't END last time... */
1113		op = OP(s);
1114		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1115		next = regnext(s);
1116		if (next == NULL)		/* Next ptr. */
1117			printf("(0)");
1118		else
1119			printf("(%d)", (s-r->program)+(next-s));
1120		s += 3;
1121		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1122			/* Literal string, where present. */
1123			while (*s != '\0') {
1124				putchar(*s);
1125				s++;
1126			}
1127			s++;
1128		}
1129		putchar('\n');
1130	}
1131
1132	/* Header fields of interest. */
1133	if (r->regstart != '\0')
1134		printf("start `%c' ", r->regstart);
1135	if (r->reganch)
1136		printf("anchored ");
1137	if (r->regmust != NULL)
1138		printf("must have \"%s\"", r->regmust);
1139	printf("\n");
1140}
1141
1142/*
1143 - regprop - printable representation of opcode
1144 */
1145static char *
1146regprop(op)
1147char *op;
1148{
1149	register char *p;
1150	static char buf[50];
1151
1152	(void) strcpy(buf, ":");
1153
1154	switch (OP(op)) {
1155	case BOL:
1156		p = "BOL";
1157		break;
1158	case EOL:
1159		p = "EOL";
1160		break;
1161	case ANY:
1162		p = "ANY";
1163		break;
1164	case ANYOF:
1165		p = "ANYOF";
1166		break;
1167	case ANYBUT:
1168		p = "ANYBUT";
1169		break;
1170	case BRANCH:
1171		p = "BRANCH";
1172		break;
1173	case EXACTLY:
1174		p = "EXACTLY";
1175		break;
1176	case NOTHING:
1177		p = "NOTHING";
1178		break;
1179	case BACK:
1180		p = "BACK";
1181		break;
1182	case END:
1183		p = "END";
1184		break;
1185	case OPEN+1:
1186	case OPEN+2:
1187	case OPEN+3:
1188	case OPEN+4:
1189	case OPEN+5:
1190	case OPEN+6:
1191	case OPEN+7:
1192	case OPEN+8:
1193	case OPEN+9:
1194		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1195		p = NULL;
1196		break;
1197	case CLOSE+1:
1198	case CLOSE+2:
1199	case CLOSE+3:
1200	case CLOSE+4:
1201	case CLOSE+5:
1202	case CLOSE+6:
1203	case CLOSE+7:
1204	case CLOSE+8:
1205	case CLOSE+9:
1206		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1207		p = NULL;
1208		break;
1209	case STAR:
1210		p = "STAR";
1211		break;
1212	case PLUS:
1213		p = "PLUS";
1214		break;
1215	default:
1216		regerror("corrupted opcode");
1217		break;
1218	}
1219	if (p != NULL)
1220		(void) strcat(buf, p);
1221	return(buf);
1222}
1223#endif
1224
1225/*
1226 * The following is provided for those people who do not have strcspn() in
1227 * their C libraries.  They should get off their butts and do something
1228 * about it; at least one public-domain implementation of those (highly
1229 * useful) string routines has been published on Usenet.
1230 */
1231#ifdef STRCSPN
1232/*
1233 * strcspn - find length of initial segment of s1 consisting entirely
1234 * of characters not from s2
1235 */
1236
1237static int
1238strcspn(s1, s2)
1239char *s1;
1240char *s2;
1241{
1242	register char *scan1;
1243	register char *scan2;
1244	register int count;
1245
1246	count = 0;
1247	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1248		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1249			if (*scan1 == *scan2++)
1250				return(count);
1251		count++;
1252	}
1253	return(count);
1254}
1255#endif
1256