regcomp.c revision 131973
11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
61573Srgrimes * This code is derived from software contributed to Berkeley by
71573Srgrimes * Henry Spencer.
81573Srgrimes *
91573Srgrimes * Redistribution and use in source and binary forms, with or without
101573Srgrimes * modification, are permitted provided that the following conditions
111573Srgrimes * are met:
121573Srgrimes * 1. Redistributions of source code must retain the above copyright
131573Srgrimes *    notice, this list of conditions and the following disclaimer.
141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
151573Srgrimes *    notice, this list of conditions and the following disclaimer in the
161573Srgrimes *    documentation and/or other materials provided with the distribution.
171573Srgrimes * 3. All advertising materials mentioning features or use of this software
181573Srgrimes *    must display the following acknowledgement:
191573Srgrimes *	This product includes software developed by the University of
201573Srgrimes *	California, Berkeley and its contributors.
211573Srgrimes * 4. Neither the name of the University nor the names of its contributors
221573Srgrimes *    may be used to endorse or promote products derived from this software
231573Srgrimes *    without specific prior written permission.
241573Srgrimes *
251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
281573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
351573Srgrimes * SUCH DAMAGE.
361573Srgrimes *
371573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
381573Srgrimes */
391573Srgrimes
401573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
411573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
421573Srgrimes#endif /* LIBC_SCCS and not lint */
4392986Sobrien#include <sys/cdefs.h>
4492986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 131973 2004-07-11 05:58:31Z tjr $");
451573Srgrimes
461573Srgrimes#include <sys/types.h>
471573Srgrimes#include <stdio.h>
481573Srgrimes#include <string.h>
491573Srgrimes#include <ctype.h>
501573Srgrimes#include <limits.h>
511573Srgrimes#include <stdlib.h>
521573Srgrimes#include <regex.h>
531573Srgrimes
5419277Sache#include "collate.h"
5519277Sache
561573Srgrimes#include "utils.h"
571573Srgrimes#include "regex2.h"
581573Srgrimes
591573Srgrimes#include "cclass.h"
601573Srgrimes#include "cname.h"
611573Srgrimes
621573Srgrimes/*
631573Srgrimes * parse structure, passed up and down to avoid global variables and
641573Srgrimes * other clumsinesses
651573Srgrimes */
661573Srgrimesstruct parse {
671573Srgrimes	char *next;		/* next character in RE */
681573Srgrimes	char *end;		/* end of string (-> NUL normally) */
691573Srgrimes	int error;		/* has an error been seen? */
701573Srgrimes	sop *strip;		/* malloced strip */
711573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
721573Srgrimes	sopno slen;		/* malloced strip length (used) */
731573Srgrimes	int ncsalloc;		/* number of csets allocated */
741573Srgrimes	struct re_guts *g;
751573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
761573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
771573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
781573Srgrimes};
791573Srgrimes
801573Srgrimes/* ========= begin header generated by ./mkh ========= */
811573Srgrimes#ifdef __cplusplus
821573Srgrimesextern "C" {
831573Srgrimes#endif
841573Srgrimes
851573Srgrimes/* === regcomp.c === */
8692905Sobrienstatic void p_ere(struct parse *p, int stop);
8792905Sobrienstatic void p_ere_exp(struct parse *p);
8892905Sobrienstatic void p_str(struct parse *p);
8992905Sobrienstatic void p_bre(struct parse *p, int end1, int end2);
9092905Sobrienstatic int p_simp_re(struct parse *p, int starordinary);
9192905Sobrienstatic int p_count(struct parse *p);
9292905Sobrienstatic void p_bracket(struct parse *p);
9392905Sobrienstatic void p_b_term(struct parse *p, cset *cs);
9492905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs);
9592905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs);
9692905Sobrienstatic char p_b_symbol(struct parse *p);
9792905Sobrienstatic char p_b_coll_elem(struct parse *p, int endc);
9892905Sobrienstatic char othercase(int ch);
9992905Sobrienstatic void bothcases(struct parse *p, int ch);
10092905Sobrienstatic void ordinary(struct parse *p, int ch);
10192905Sobrienstatic void nonnewline(struct parse *p);
10292905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to);
10392905Sobrienstatic int seterr(struct parse *p, int e);
10492905Sobrienstatic cset *allocset(struct parse *p);
10592905Sobrienstatic void freeset(struct parse *p, cset *cs);
10692905Sobrienstatic int freezeset(struct parse *p, cset *cs);
10792905Sobrienstatic int firstch(struct parse *p, cset *cs);
10892905Sobrienstatic int nch(struct parse *p, cset *cs);
10992905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish);
11092905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd);
11192905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
11292905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value);
11392905Sobrienstatic void enlarge(struct parse *p, sopno size);
11492905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g);
11592905Sobrienstatic void findmust(struct parse *p, struct re_guts *g);
116131973Stjrstatic int altoffset(sop *scan, int offset);
11792905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g);
11892905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g);
11992905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g);
1201573Srgrimes
1211573Srgrimes#ifdef __cplusplus
1221573Srgrimes}
1231573Srgrimes#endif
1241573Srgrimes/* ========= end header generated by ./mkh ========= */
1251573Srgrimes
1261573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1271573Srgrimes
1281573Srgrimes/*
1291573Srgrimes * macros for use with parse structure
1301573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1311573Srgrimes */
1321573Srgrimes#define	PEEK()	(*p->next)
1331573Srgrimes#define	PEEK2()	(*(p->next+1))
1341573Srgrimes#define	MORE()	(p->next < p->end)
1351573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1361573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1371573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1381573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1391573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1401573Srgrimes#define	NEXT()	(p->next++)
1411573Srgrimes#define	NEXT2()	(p->next += 2)
1421573Srgrimes#define	NEXTn(n)	(p->next += (n))
1431573Srgrimes#define	GETNEXT()	(*p->next++)
1441573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1451573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1461573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1471573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1481573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1491573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1501573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1511573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1521573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1531573Srgrimes#define	HERE()		(p->slen)
1541573Srgrimes#define	THERE()		(p->slen - 1)
1551573Srgrimes#define	THERETHERE()	(p->slen - 2)
1561573Srgrimes#define	DROP(n)	(p->slen -= (n))
1571573Srgrimes
1581573Srgrimes#ifndef NDEBUG
1591573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1601573Srgrimes#else
1611573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1621573Srgrimes#endif
1631573Srgrimes
16462232Sdcs/* Macro used by computejump()/computematchjump() */
16562232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
16662232Sdcs
1671573Srgrimes/*
1681573Srgrimes - regcomp - interface for parser and compilation
1691573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1701573Srgrimes = #define	REG_BASIC	0000
1711573Srgrimes = #define	REG_EXTENDED	0001
1721573Srgrimes = #define	REG_ICASE	0002
1731573Srgrimes = #define	REG_NOSUB	0004
1741573Srgrimes = #define	REG_NEWLINE	0010
1751573Srgrimes = #define	REG_NOSPEC	0020
1761573Srgrimes = #define	REG_PEND	0040
1771573Srgrimes = #define	REG_DUMP	0200
1781573Srgrimes */
1791573Srgrimesint				/* 0 success, otherwise REG_something */
1801573Srgrimesregcomp(preg, pattern, cflags)
181104358Smikeregex_t * __restrict preg;
182104358Smikeconst char * __restrict pattern;
1831573Srgrimesint cflags;
1841573Srgrimes{
1851573Srgrimes	struct parse pa;
18692889Sobrien	struct re_guts *g;
18792889Sobrien	struct parse *p = &pa;
18892889Sobrien	int i;
18992889Sobrien	size_t len;
1901573Srgrimes#ifdef REDEBUG
1911573Srgrimes#	define	GOODFLAGS(f)	(f)
1921573Srgrimes#else
1931573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
1941573Srgrimes#endif
1951573Srgrimes
1961573Srgrimes	cflags = GOODFLAGS(cflags);
1971573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
1981573Srgrimes		return(REG_INVARG);
1991573Srgrimes
2001573Srgrimes	if (cflags&REG_PEND) {
2011573Srgrimes		if (preg->re_endp < pattern)
2021573Srgrimes			return(REG_INVARG);
2031573Srgrimes		len = preg->re_endp - pattern;
2041573Srgrimes	} else
2051573Srgrimes		len = strlen((char *)pattern);
2061573Srgrimes
2071573Srgrimes	/* do the mallocs early so failure handling is easy */
208131973Stjr	g = (struct re_guts *)malloc(sizeof(struct re_guts));
2091573Srgrimes	if (g == NULL)
2101573Srgrimes		return(REG_ESPACE);
2111573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
2121573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2131573Srgrimes	p->slen = 0;
2141573Srgrimes	if (p->strip == NULL) {
2151573Srgrimes		free((char *)g);
2161573Srgrimes		return(REG_ESPACE);
2171573Srgrimes	}
2181573Srgrimes
2191573Srgrimes	/* set things up */
2201573Srgrimes	p->g = g;
2211573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2221573Srgrimes	p->end = p->next + len;
2231573Srgrimes	p->error = 0;
2241573Srgrimes	p->ncsalloc = 0;
2251573Srgrimes	for (i = 0; i < NPAREN; i++) {
2261573Srgrimes		p->pbegin[i] = 0;
2271573Srgrimes		p->pend[i] = 0;
2281573Srgrimes	}
2291573Srgrimes	g->csetsize = NC;
2301573Srgrimes	g->sets = NULL;
2311573Srgrimes	g->setbits = NULL;
2321573Srgrimes	g->ncsets = 0;
2331573Srgrimes	g->cflags = cflags;
2341573Srgrimes	g->iflags = 0;
2351573Srgrimes	g->nbol = 0;
2361573Srgrimes	g->neol = 0;
2371573Srgrimes	g->must = NULL;
23862391Sdcs	g->moffset = -1;
23962263Sdcs	g->charjump = NULL;
24062263Sdcs	g->matchjump = NULL;
2411573Srgrimes	g->mlen = 0;
2421573Srgrimes	g->nsub = 0;
2431573Srgrimes	g->backrefs = 0;
2441573Srgrimes
2451573Srgrimes	/* do it */
2461573Srgrimes	EMIT(OEND, 0);
2471573Srgrimes	g->firststate = THERE();
2481573Srgrimes	if (cflags&REG_EXTENDED)
2491573Srgrimes		p_ere(p, OUT);
2501573Srgrimes	else if (cflags&REG_NOSPEC)
2511573Srgrimes		p_str(p);
2521573Srgrimes	else
2531573Srgrimes		p_bre(p, OUT, OUT);
2541573Srgrimes	EMIT(OEND, 0);
2551573Srgrimes	g->laststate = THERE();
2561573Srgrimes
2571573Srgrimes	/* tidy up loose ends and fill things in */
2581573Srgrimes	stripsnug(p, g);
2591573Srgrimes	findmust(p, g);
26062232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
26162232Sdcs	 * than three characters
26262232Sdcs	 */
26362232Sdcs	if(g->mlen > 3) {
26462232Sdcs		computejumps(p, g);
26562232Sdcs		computematchjumps(p, g);
26662755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
26762232Sdcs			free(g->charjump);
26862232Sdcs			g->charjump = NULL;
26962232Sdcs		}
27062232Sdcs	}
2711573Srgrimes	g->nplus = pluscount(p, g);
2721573Srgrimes	g->magic = MAGIC2;
2731573Srgrimes	preg->re_nsub = g->nsub;
2741573Srgrimes	preg->re_g = g;
2751573Srgrimes	preg->re_magic = MAGIC1;
2761573Srgrimes#ifndef REDEBUG
2771573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2781573Srgrimes	if (g->iflags&BAD)
2791573Srgrimes		SETERROR(REG_ASSERT);
2801573Srgrimes#endif
2811573Srgrimes
2821573Srgrimes	/* win or lose, we're done */
2831573Srgrimes	if (p->error != 0)	/* lose */
2841573Srgrimes		regfree(preg);
2851573Srgrimes	return(p->error);
2861573Srgrimes}
2871573Srgrimes
2881573Srgrimes/*
2891573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
29092889Sobrien == static void p_ere(struct parse *p, int stop);
2911573Srgrimes */
2921573Srgrimesstatic void
2931573Srgrimesp_ere(p, stop)
29492889Sobrienstruct parse *p;
2951573Srgrimesint stop;			/* character this ERE should end at */
2961573Srgrimes{
29792889Sobrien	char c;
29892889Sobrien	sopno prevback;
29992889Sobrien	sopno prevfwd;
30092889Sobrien	sopno conc;
30192889Sobrien	int first = 1;		/* is this the first alternative? */
3021573Srgrimes
3031573Srgrimes	for (;;) {
3041573Srgrimes		/* do a bunch of concatenated expressions */
3051573Srgrimes		conc = HERE();
3061573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3071573Srgrimes			p_ere_exp(p);
30817141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3091573Srgrimes
3101573Srgrimes		if (!EAT('|'))
3111573Srgrimes			break;		/* NOTE BREAK OUT */
3121573Srgrimes
3131573Srgrimes		if (first) {
3141573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3151573Srgrimes			prevfwd = conc;
3161573Srgrimes			prevback = conc;
3171573Srgrimes			first = 0;
3181573Srgrimes		}
3191573Srgrimes		ASTERN(OOR1, prevback);
3201573Srgrimes		prevback = THERE();
3211573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3221573Srgrimes		prevfwd = HERE();
3231573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3241573Srgrimes	}
3251573Srgrimes
3261573Srgrimes	if (!first) {		/* tail-end fixups */
3271573Srgrimes		AHEAD(prevfwd);
3281573Srgrimes		ASTERN(O_CH, prevback);
3291573Srgrimes	}
3301573Srgrimes
3311573Srgrimes	assert(!MORE() || SEE(stop));
3321573Srgrimes}
3331573Srgrimes
3341573Srgrimes/*
3351573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
33692889Sobrien == static void p_ere_exp(struct parse *p);
3371573Srgrimes */
3381573Srgrimesstatic void
3391573Srgrimesp_ere_exp(p)
34092889Sobrienstruct parse *p;
3411573Srgrimes{
34292889Sobrien	char c;
34392889Sobrien	sopno pos;
34492889Sobrien	int count;
34592889Sobrien	int count2;
34692889Sobrien	sopno subno;
3471573Srgrimes	int wascaret = 0;
3481573Srgrimes
3491573Srgrimes	assert(MORE());		/* caller should have ensured this */
3501573Srgrimes	c = GETNEXT();
3511573Srgrimes
3521573Srgrimes	pos = HERE();
3531573Srgrimes	switch (c) {
3541573Srgrimes	case '(':
35517141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3561573Srgrimes		p->g->nsub++;
3571573Srgrimes		subno = p->g->nsub;
3581573Srgrimes		if (subno < NPAREN)
3591573Srgrimes			p->pbegin[subno] = HERE();
3601573Srgrimes		EMIT(OLPAREN, subno);
3611573Srgrimes		if (!SEE(')'))
3621573Srgrimes			p_ere(p, ')');
3631573Srgrimes		if (subno < NPAREN) {
3641573Srgrimes			p->pend[subno] = HERE();
3651573Srgrimes			assert(p->pend[subno] != 0);
3661573Srgrimes		}
3671573Srgrimes		EMIT(ORPAREN, subno);
36817141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3691573Srgrimes		break;
3701573Srgrimes#ifndef POSIX_MISTAKE
3711573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3721573Srgrimes		/*
3731573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3741573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3751573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3761573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3771573Srgrimes		 * we can get it fixed.
3781573Srgrimes		 */
3791573Srgrimes		SETERROR(REG_EPAREN);
3801573Srgrimes		break;
3811573Srgrimes#endif
3821573Srgrimes	case '^':
3831573Srgrimes		EMIT(OBOL, 0);
3841573Srgrimes		p->g->iflags |= USEBOL;
3851573Srgrimes		p->g->nbol++;
3861573Srgrimes		wascaret = 1;
3871573Srgrimes		break;
3881573Srgrimes	case '$':
3891573Srgrimes		EMIT(OEOL, 0);
3901573Srgrimes		p->g->iflags |= USEEOL;
3911573Srgrimes		p->g->neol++;
3921573Srgrimes		break;
3931573Srgrimes	case '|':
3941573Srgrimes		SETERROR(REG_EMPTY);
3951573Srgrimes		break;
3961573Srgrimes	case '*':
3971573Srgrimes	case '+':
3981573Srgrimes	case '?':
3991573Srgrimes		SETERROR(REG_BADRPT);
4001573Srgrimes		break;
4011573Srgrimes	case '.':
4021573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4031573Srgrimes			nonnewline(p);
4041573Srgrimes		else
4051573Srgrimes			EMIT(OANY, 0);
4061573Srgrimes		break;
4071573Srgrimes	case '[':
4081573Srgrimes		p_bracket(p);
4091573Srgrimes		break;
4101573Srgrimes	case '\\':
41117141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
4121573Srgrimes		c = GETNEXT();
4131573Srgrimes		ordinary(p, c);
4141573Srgrimes		break;
4151573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
41649094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4171573Srgrimes		/* FALLTHROUGH */
4181573Srgrimes	default:
4191573Srgrimes		ordinary(p, c);
4201573Srgrimes		break;
4211573Srgrimes	}
4221573Srgrimes
4231573Srgrimes	if (!MORE())
4241573Srgrimes		return;
4251573Srgrimes	c = PEEK();
4261573Srgrimes	/* we call { a repetition if followed by a digit */
4271573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
42849094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4291573Srgrimes		return;		/* no repetition, we're done */
4301573Srgrimes	NEXT();
4311573Srgrimes
43217141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4331573Srgrimes	switch (c) {
4341573Srgrimes	case '*':	/* implemented as +? */
4351573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4361573Srgrimes		INSERT(OPLUS_, pos);
4371573Srgrimes		ASTERN(O_PLUS, pos);
4381573Srgrimes		INSERT(OQUEST_, pos);
4391573Srgrimes		ASTERN(O_QUEST, pos);
4401573Srgrimes		break;
4411573Srgrimes	case '+':
4421573Srgrimes		INSERT(OPLUS_, pos);
4431573Srgrimes		ASTERN(O_PLUS, pos);
4441573Srgrimes		break;
4451573Srgrimes	case '?':
4461573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4471573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4481573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4491573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4501573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4511573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4521573Srgrimes		ASTERN(O_CH, THERETHERE());
4531573Srgrimes		break;
4541573Srgrimes	case '{':
4551573Srgrimes		count = p_count(p);
4561573Srgrimes		if (EAT(',')) {
45749094Sache			if (isdigit((uch)PEEK())) {
4581573Srgrimes				count2 = p_count(p);
45917141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4601573Srgrimes			} else		/* single number with comma */
4611573Srgrimes				count2 = INFINITY;
4621573Srgrimes		} else		/* just a single number */
4631573Srgrimes			count2 = count;
4641573Srgrimes		repeat(p, pos, count, count2);
4651573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4661573Srgrimes			while (MORE() && PEEK() != '}')
4671573Srgrimes				NEXT();
46817141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
4691573Srgrimes			SETERROR(REG_BADBR);
4701573Srgrimes		}
4711573Srgrimes		break;
4721573Srgrimes	}
4731573Srgrimes
4741573Srgrimes	if (!MORE())
4751573Srgrimes		return;
4761573Srgrimes	c = PEEK();
4771573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
47849094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
4791573Srgrimes		return;
4801573Srgrimes	SETERROR(REG_BADRPT);
4811573Srgrimes}
4821573Srgrimes
4831573Srgrimes/*
4841573Srgrimes - p_str - string (no metacharacters) "parser"
48592889Sobrien == static void p_str(struct parse *p);
4861573Srgrimes */
4871573Srgrimesstatic void
4881573Srgrimesp_str(p)
48992889Sobrienstruct parse *p;
4901573Srgrimes{
49117141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
4921573Srgrimes	while (MORE())
4931573Srgrimes		ordinary(p, GETNEXT());
4941573Srgrimes}
4951573Srgrimes
4961573Srgrimes/*
4971573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
49892889Sobrien == static void p_bre(struct parse *p, int end1, \
49992889Sobrien ==	int end2);
5001573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
5011573Srgrimes *
5021573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
503131973Stjr * taken as an ordinary character and then revised to be an anchor.
5041573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5051573Srgrimes */
5061573Srgrimesstatic void
5071573Srgrimesp_bre(p, end1, end2)
50892889Sobrienstruct parse *p;
50992889Sobrienint end1;			/* first terminating character */
51092889Sobrienint end2;			/* second terminating character */
5111573Srgrimes{
51292889Sobrien	sopno start = HERE();
51392889Sobrien	int first = 1;			/* first subexpression? */
51492889Sobrien	int wasdollar = 0;
5151573Srgrimes
5161573Srgrimes	if (EAT('^')) {
5171573Srgrimes		EMIT(OBOL, 0);
5181573Srgrimes		p->g->iflags |= USEBOL;
5191573Srgrimes		p->g->nbol++;
5201573Srgrimes	}
5211573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5221573Srgrimes		wasdollar = p_simp_re(p, first);
5231573Srgrimes		first = 0;
5241573Srgrimes	}
5251573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5261573Srgrimes		DROP(1);
5271573Srgrimes		EMIT(OEOL, 0);
5281573Srgrimes		p->g->iflags |= USEEOL;
5291573Srgrimes		p->g->neol++;
5301573Srgrimes	}
5311573Srgrimes
53217141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5331573Srgrimes}
5341573Srgrimes
5351573Srgrimes/*
5361573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
53792889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5381573Srgrimes */
5391573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
5401573Srgrimesp_simp_re(p, starordinary)
54192889Sobrienstruct parse *p;
5421573Srgrimesint starordinary;		/* is a leading * an ordinary character? */
5431573Srgrimes{
54492889Sobrien	int c;
54592889Sobrien	int count;
54692889Sobrien	int count2;
54792889Sobrien	sopno pos;
54892889Sobrien	int i;
54992889Sobrien	sopno subno;
5501573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5511573Srgrimes
5521573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5531573Srgrimes
5541573Srgrimes	assert(MORE());		/* caller should have ensured this */
5551573Srgrimes	c = GETNEXT();
5561573Srgrimes	if (c == '\\') {
55717141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
55849094Sache		c = BACKSL | GETNEXT();
5591573Srgrimes	}
5601573Srgrimes	switch (c) {
5611573Srgrimes	case '.':
5621573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5631573Srgrimes			nonnewline(p);
5641573Srgrimes		else
5651573Srgrimes			EMIT(OANY, 0);
5661573Srgrimes		break;
5671573Srgrimes	case '[':
5681573Srgrimes		p_bracket(p);
5691573Srgrimes		break;
5701573Srgrimes	case BACKSL|'{':
5711573Srgrimes		SETERROR(REG_BADRPT);
5721573Srgrimes		break;
5731573Srgrimes	case BACKSL|'(':
5741573Srgrimes		p->g->nsub++;
5751573Srgrimes		subno = p->g->nsub;
5761573Srgrimes		if (subno < NPAREN)
5771573Srgrimes			p->pbegin[subno] = HERE();
5781573Srgrimes		EMIT(OLPAREN, subno);
5791573Srgrimes		/* the MORE here is an error heuristic */
5801573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
5811573Srgrimes			p_bre(p, '\\', ')');
5821573Srgrimes		if (subno < NPAREN) {
5831573Srgrimes			p->pend[subno] = HERE();
5841573Srgrimes			assert(p->pend[subno] != 0);
5851573Srgrimes		}
5861573Srgrimes		EMIT(ORPAREN, subno);
58717141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
5881573Srgrimes		break;
5891573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
5901573Srgrimes	case BACKSL|'}':
5911573Srgrimes		SETERROR(REG_EPAREN);
5921573Srgrimes		break;
5931573Srgrimes	case BACKSL|'1':
5941573Srgrimes	case BACKSL|'2':
5951573Srgrimes	case BACKSL|'3':
5961573Srgrimes	case BACKSL|'4':
5971573Srgrimes	case BACKSL|'5':
5981573Srgrimes	case BACKSL|'6':
5991573Srgrimes	case BACKSL|'7':
6001573Srgrimes	case BACKSL|'8':
6011573Srgrimes	case BACKSL|'9':
6021573Srgrimes		i = (c&~BACKSL) - '0';
6031573Srgrimes		assert(i < NPAREN);
6041573Srgrimes		if (p->pend[i] != 0) {
6051573Srgrimes			assert(i <= p->g->nsub);
6061573Srgrimes			EMIT(OBACK_, i);
6071573Srgrimes			assert(p->pbegin[i] != 0);
6081573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6091573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6101573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6111573Srgrimes			EMIT(O_BACK, i);
6121573Srgrimes		} else
6131573Srgrimes			SETERROR(REG_ESUBREG);
6141573Srgrimes		p->g->backrefs = 1;
6151573Srgrimes		break;
6161573Srgrimes	case '*':
61717141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6181573Srgrimes		/* FALLTHROUGH */
6191573Srgrimes	default:
62049094Sache		ordinary(p, (char)c);
6211573Srgrimes		break;
6221573Srgrimes	}
6231573Srgrimes
6241573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6251573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6261573Srgrimes		INSERT(OPLUS_, pos);
6271573Srgrimes		ASTERN(O_PLUS, pos);
6281573Srgrimes		INSERT(OQUEST_, pos);
6291573Srgrimes		ASTERN(O_QUEST, pos);
6301573Srgrimes	} else if (EATTWO('\\', '{')) {
6311573Srgrimes		count = p_count(p);
6321573Srgrimes		if (EAT(',')) {
63349094Sache			if (MORE() && isdigit((uch)PEEK())) {
6341573Srgrimes				count2 = p_count(p);
63517141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6361573Srgrimes			} else		/* single number with comma */
6371573Srgrimes				count2 = INFINITY;
6381573Srgrimes		} else		/* just a single number */
6391573Srgrimes			count2 = count;
6401573Srgrimes		repeat(p, pos, count, count2);
6411573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6421573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6431573Srgrimes				NEXT();
64417141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6451573Srgrimes			SETERROR(REG_BADBR);
6461573Srgrimes		}
64749094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6481573Srgrimes		return(1);
6491573Srgrimes
6501573Srgrimes	return(0);
6511573Srgrimes}
6521573Srgrimes
6531573Srgrimes/*
6541573Srgrimes - p_count - parse a repetition count
65592889Sobrien == static int p_count(struct parse *p);
6561573Srgrimes */
6571573Srgrimesstatic int			/* the value */
6581573Srgrimesp_count(p)
65992889Sobrienstruct parse *p;
6601573Srgrimes{
66192889Sobrien	int count = 0;
66292889Sobrien	int ndigits = 0;
6631573Srgrimes
66449094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6651573Srgrimes		count = count*10 + (GETNEXT() - '0');
6661573Srgrimes		ndigits++;
6671573Srgrimes	}
6681573Srgrimes
66917141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6701573Srgrimes	return(count);
6711573Srgrimes}
6721573Srgrimes
6731573Srgrimes/*
6741573Srgrimes - p_bracket - parse a bracketed character list
67592889Sobrien == static void p_bracket(struct parse *p);
6761573Srgrimes *
6771573Srgrimes * Note a significant property of this code:  if the allocset() did SETERROR,
6781573Srgrimes * no set operations are done.
6791573Srgrimes */
6801573Srgrimesstatic void
6811573Srgrimesp_bracket(p)
68292889Sobrienstruct parse *p;
6831573Srgrimes{
68492889Sobrien	cset *cs = allocset(p);
68592889Sobrien	int invert = 0;
6861573Srgrimes
6871573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
6881573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
6891573Srgrimes		EMIT(OBOW, 0);
6901573Srgrimes		NEXTn(6);
6911573Srgrimes		return;
6921573Srgrimes	}
6931573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
6941573Srgrimes		EMIT(OEOW, 0);
6951573Srgrimes		NEXTn(6);
6961573Srgrimes		return;
6971573Srgrimes	}
6981573Srgrimes
6991573Srgrimes	if (EAT('^'))
7001573Srgrimes		invert++;	/* make note to invert set at end */
7011573Srgrimes	if (EAT(']'))
7021573Srgrimes		CHadd(cs, ']');
7031573Srgrimes	else if (EAT('-'))
7041573Srgrimes		CHadd(cs, '-');
7051573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7061573Srgrimes		p_b_term(p, cs);
7071573Srgrimes	if (EAT('-'))
7081573Srgrimes		CHadd(cs, '-');
70917141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7101573Srgrimes
7111573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7121573Srgrimes		return;
7131573Srgrimes
7141573Srgrimes	if (p->g->cflags&REG_ICASE) {
71592889Sobrien		int i;
71692889Sobrien		int ci;
7171573Srgrimes
7181573Srgrimes		for (i = p->g->csetsize - 1; i >= 0; i--)
7191573Srgrimes			if (CHIN(cs, i) && isalpha(i)) {
7201573Srgrimes				ci = othercase(i);
7211573Srgrimes				if (ci != i)
7221573Srgrimes					CHadd(cs, ci);
7231573Srgrimes			}
7241573Srgrimes	}
7251573Srgrimes	if (invert) {
72692889Sobrien		int i;
7271573Srgrimes
7281573Srgrimes		for (i = p->g->csetsize - 1; i >= 0; i--)
7291573Srgrimes			if (CHIN(cs, i))
7301573Srgrimes				CHsub(cs, i);
7311573Srgrimes			else
7321573Srgrimes				CHadd(cs, i);
7331573Srgrimes		if (p->g->cflags&REG_NEWLINE)
7341573Srgrimes			CHsub(cs, '\n');
7351573Srgrimes	}
7361573Srgrimes
7371573Srgrimes	if (nch(p, cs) == 1) {		/* optimize singleton sets */
7381573Srgrimes		ordinary(p, firstch(p, cs));
7391573Srgrimes		freeset(p, cs);
7401573Srgrimes	} else
7411573Srgrimes		EMIT(OANYOF, freezeset(p, cs));
7421573Srgrimes}
7431573Srgrimes
7441573Srgrimes/*
7451573Srgrimes - p_b_term - parse one term of a bracketed character list
74692889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7471573Srgrimes */
7481573Srgrimesstatic void
7491573Srgrimesp_b_term(p, cs)
75092889Sobrienstruct parse *p;
75192889Sobriencset *cs;
7521573Srgrimes{
75392889Sobrien	char c;
75492889Sobrien	char start, finish;
75592889Sobrien	int i;
7561573Srgrimes
7571573Srgrimes	/* classify what we've got */
7581573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7591573Srgrimes	case '[':
7601573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7611573Srgrimes		break;
7621573Srgrimes	case '-':
7631573Srgrimes		SETERROR(REG_ERANGE);
7641573Srgrimes		return;			/* NOTE RETURN */
7651573Srgrimes		break;
7661573Srgrimes	default:
7671573Srgrimes		c = '\0';
7681573Srgrimes		break;
7691573Srgrimes	}
7701573Srgrimes
7711573Srgrimes	switch (c) {
7721573Srgrimes	case ':':		/* character class */
7731573Srgrimes		NEXT2();
77417141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7751573Srgrimes		c = PEEK();
77617141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7771573Srgrimes		p_b_cclass(p, cs);
77817141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
77917141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7801573Srgrimes		break;
7811573Srgrimes	case '=':		/* equivalence class */
7821573Srgrimes		NEXT2();
78317141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7841573Srgrimes		c = PEEK();
78517141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
7861573Srgrimes		p_b_eclass(p, cs);
78717141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
78817141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
7891573Srgrimes		break;
7901573Srgrimes	default:		/* symbol, ordinary character, or range */
7911573Srgrimes		start = p_b_symbol(p);
7921573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
7931573Srgrimes			/* range */
7941573Srgrimes			NEXT();
7951573Srgrimes			if (EAT('-'))
7961573Srgrimes				finish = '-';
7971573Srgrimes			else
7981573Srgrimes				finish = p_b_symbol(p);
7991573Srgrimes		} else
8001573Srgrimes			finish = start;
80117514Sache		if (start == finish)
80217514Sache			CHadd(cs, start);
80317514Sache		else {
80424637Sache			if (__collate_load_error) {
80524637Sache				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
80624637Sache				for (i = (uch)start; i <= (uch)finish; i++)
80717514Sache					CHadd(cs, i);
80824637Sache			} else {
80924637Sache				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
81024637Sache				for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
81124637Sache					if (   __collate_range_cmp(start, i) <= 0
81224637Sache					    && __collate_range_cmp(i, finish) <= 0
81324637Sache					   )
81424637Sache						CHadd(cs, i);
81524637Sache				}
81617514Sache			}
81717514Sache		}
8181573Srgrimes		break;
8191573Srgrimes	}
8201573Srgrimes}
8211573Srgrimes
8221573Srgrimes/*
8231573Srgrimes - p_b_cclass - parse a character-class name and deal with it
82492889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8251573Srgrimes */
8261573Srgrimesstatic void
8271573Srgrimesp_b_cclass(p, cs)
82892889Sobrienstruct parse *p;
82992889Sobriencset *cs;
8301573Srgrimes{
83192889Sobrien	int c;
83292889Sobrien	char *sp = p->next;
83392889Sobrien	struct cclass *cp;
83492889Sobrien	size_t len;
8351573Srgrimes
83617508Sache	while (MORE() && isalpha((uch)PEEK()))
8371573Srgrimes		NEXT();
8381573Srgrimes	len = p->next - sp;
8391573Srgrimes	for (cp = cclasses; cp->name != NULL; cp++)
8401573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
8411573Srgrimes			break;
8421573Srgrimes	if (cp->name == NULL) {
8431573Srgrimes		/* oops, didn't find it */
8441573Srgrimes		SETERROR(REG_ECTYPE);
8451573Srgrimes		return;
8461573Srgrimes	}
8471573Srgrimes
84817508Sache	switch (cp->fidx) {
84917508Sache	case CALNUM:
85017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
85117508Sache			if (isalnum((uch)c))
85217508Sache				CHadd(cs, c);
85317508Sache		break;
85417508Sache	case CALPHA:
85517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
85617508Sache			if (isalpha((uch)c))
85717508Sache				CHadd(cs, c);
85817508Sache		break;
85917508Sache	case CBLANK:
86017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
86117508Sache			if (isblank((uch)c))
86217508Sache				CHadd(cs, c);
86317508Sache		break;
86417508Sache	case CCNTRL:
86517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
86617508Sache			if (iscntrl((uch)c))
86717508Sache				CHadd(cs, c);
86817508Sache		break;
86917508Sache	case CDIGIT:
87017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
87117508Sache			if (isdigit((uch)c))
87217508Sache				CHadd(cs, c);
87317508Sache		break;
87417508Sache	case CGRAPH:
87517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
87617508Sache			if (isgraph((uch)c))
87717508Sache				CHadd(cs, c);
87817508Sache		break;
87917508Sache	case CLOWER:
88017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
88117508Sache			if (islower((uch)c))
88217508Sache				CHadd(cs, c);
88317508Sache		break;
88417508Sache	case CPRINT:
88517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
88617508Sache			if (isprint((uch)c))
88717508Sache				CHadd(cs, c);
88817508Sache		break;
88917508Sache	case CPUNCT:
89017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
89117508Sache			if (ispunct((uch)c))
89217508Sache				CHadd(cs, c);
89317508Sache		break;
89417508Sache	case CSPACE:
89517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
89617508Sache			if (isspace((uch)c))
89717508Sache				CHadd(cs, c);
89817508Sache		break;
89917508Sache	case CUPPER:
90017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
90117508Sache			if (isupper((uch)c))
90217508Sache				CHadd(cs, c);
90317508Sache		break;
90417508Sache	case CXDIGIT:
90517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
90617508Sache			if (isxdigit((uch)c))
90717508Sache				CHadd(cs, c);
90817508Sache		break;
90917508Sache	}
9101573Srgrimes}
9111573Srgrimes
9121573Srgrimes/*
9131573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
91492889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
9151573Srgrimes *
9161573Srgrimes * This implementation is incomplete. xxx
9171573Srgrimes */
9181573Srgrimesstatic void
9191573Srgrimesp_b_eclass(p, cs)
92092889Sobrienstruct parse *p;
92192889Sobriencset *cs;
9221573Srgrimes{
92392889Sobrien	char c;
9241573Srgrimes
9251573Srgrimes	c = p_b_coll_elem(p, '=');
9261573Srgrimes	CHadd(cs, c);
9271573Srgrimes}
9281573Srgrimes
9291573Srgrimes/*
9301573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
93192889Sobrien == static char p_b_symbol(struct parse *p);
9321573Srgrimes */
9331573Srgrimesstatic char			/* value of symbol */
9341573Srgrimesp_b_symbol(p)
93592889Sobrienstruct parse *p;
9361573Srgrimes{
93792889Sobrien	char value;
9381573Srgrimes
93917141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
9401573Srgrimes	if (!EATTWO('[', '.'))
9411573Srgrimes		return(GETNEXT());
9421573Srgrimes
9431573Srgrimes	/* collating symbol */
9441573Srgrimes	value = p_b_coll_elem(p, '.');
94517141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
9461573Srgrimes	return(value);
9471573Srgrimes}
9481573Srgrimes
9491573Srgrimes/*
9501573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
95192889Sobrien == static char p_b_coll_elem(struct parse *p, int endc);
9521573Srgrimes */
9531573Srgrimesstatic char			/* value of collating element */
9541573Srgrimesp_b_coll_elem(p, endc)
95592889Sobrienstruct parse *p;
9561573Srgrimesint endc;			/* name ended by endc,']' */
9571573Srgrimes{
95892889Sobrien	char *sp = p->next;
95992889Sobrien	struct cname *cp;
96092889Sobrien	int len;
9611573Srgrimes
9621573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
9631573Srgrimes		NEXT();
9641573Srgrimes	if (!MORE()) {
9651573Srgrimes		SETERROR(REG_EBRACK);
9661573Srgrimes		return(0);
9671573Srgrimes	}
9681573Srgrimes	len = p->next - sp;
9691573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
9701573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
9711573Srgrimes			return(cp->code);	/* known name */
9721573Srgrimes	if (len == 1)
9731573Srgrimes		return(*sp);	/* single character */
9741573Srgrimes	SETERROR(REG_ECOLLATE);			/* neither */
9751573Srgrimes	return(0);
9761573Srgrimes}
9771573Srgrimes
9781573Srgrimes/*
9791573Srgrimes - othercase - return the case counterpart of an alphabetic
9801573Srgrimes == static char othercase(int ch);
9811573Srgrimes */
9821573Srgrimesstatic char			/* if no counterpart, return ch */
9831573Srgrimesothercase(ch)
9841573Srgrimesint ch;
9851573Srgrimes{
98649094Sache	ch = (uch)ch;
9871573Srgrimes	assert(isalpha(ch));
9881573Srgrimes	if (isupper(ch))
9891573Srgrimes		return(tolower(ch));
9901573Srgrimes	else if (islower(ch))
9911573Srgrimes		return(toupper(ch));
9921573Srgrimes	else			/* peculiar, but could happen */
9931573Srgrimes		return(ch);
9941573Srgrimes}
9951573Srgrimes
9961573Srgrimes/*
9971573Srgrimes - bothcases - emit a dualcase version of a two-case character
99892889Sobrien == static void bothcases(struct parse *p, int ch);
9991573Srgrimes *
10001573Srgrimes * Boy, is this implementation ever a kludge...
10011573Srgrimes */
10021573Srgrimesstatic void
10031573Srgrimesbothcases(p, ch)
100492889Sobrienstruct parse *p;
10051573Srgrimesint ch;
10061573Srgrimes{
100792889Sobrien	char *oldnext = p->next;
100892889Sobrien	char *oldend = p->end;
10091573Srgrimes	char bracket[3];
10101573Srgrimes
101149094Sache	ch = (uch)ch;
10121573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
10131573Srgrimes	p->next = bracket;
10141573Srgrimes	p->end = bracket+2;
10151573Srgrimes	bracket[0] = ch;
10161573Srgrimes	bracket[1] = ']';
10171573Srgrimes	bracket[2] = '\0';
10181573Srgrimes	p_bracket(p);
10191573Srgrimes	assert(p->next == bracket+2);
10201573Srgrimes	p->next = oldnext;
10211573Srgrimes	p->end = oldend;
10221573Srgrimes}
10231573Srgrimes
10241573Srgrimes/*
10251573Srgrimes - ordinary - emit an ordinary character
102692889Sobrien == static void ordinary(struct parse *p, int ch);
10271573Srgrimes */
10281573Srgrimesstatic void
10291573Srgrimesordinary(p, ch)
103092889Sobrienstruct parse *p;
103192889Sobrienint ch;
10321573Srgrimes{
10331573Srgrimes
103449094Sache	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
10351573Srgrimes		bothcases(p, ch);
1036131973Stjr	else
103749094Sache		EMIT(OCHAR, (uch)ch);
10381573Srgrimes}
10391573Srgrimes
10401573Srgrimes/*
10411573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
104292889Sobrien == static void nonnewline(struct parse *p);
10431573Srgrimes *
10441573Srgrimes * Boy, is this implementation ever a kludge...
10451573Srgrimes */
10461573Srgrimesstatic void
10471573Srgrimesnonnewline(p)
104892889Sobrienstruct parse *p;
10491573Srgrimes{
105092889Sobrien	char *oldnext = p->next;
105192889Sobrien	char *oldend = p->end;
10521573Srgrimes	char bracket[4];
10531573Srgrimes
10541573Srgrimes	p->next = bracket;
10551573Srgrimes	p->end = bracket+3;
10561573Srgrimes	bracket[0] = '^';
10571573Srgrimes	bracket[1] = '\n';
10581573Srgrimes	bracket[2] = ']';
10591573Srgrimes	bracket[3] = '\0';
10601573Srgrimes	p_bracket(p);
10611573Srgrimes	assert(p->next == bracket+3);
10621573Srgrimes	p->next = oldnext;
10631573Srgrimes	p->end = oldend;
10641573Srgrimes}
10651573Srgrimes
10661573Srgrimes/*
10671573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
106892889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
10691573Srgrimes */
10701573Srgrimesstatic void
10711573Srgrimesrepeat(p, start, from, to)
107292889Sobrienstruct parse *p;
10731573Srgrimessopno start;			/* operand from here to end of strip */
10741573Srgrimesint from;			/* repeated from this number */
10751573Srgrimesint to;				/* to this number of times (maybe INFINITY) */
10761573Srgrimes{
107792889Sobrien	sopno finish = HERE();
10781573Srgrimes#	define	N	2
10791573Srgrimes#	define	INF	3
10801573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
10811573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
108292889Sobrien	sopno copy;
10831573Srgrimes
10841573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
10851573Srgrimes		return;
10861573Srgrimes
10871573Srgrimes	assert(from <= to);
10881573Srgrimes
10891573Srgrimes	switch (REP(MAP(from), MAP(to))) {
10901573Srgrimes	case REP(0, 0):			/* must be user doing this */
10911573Srgrimes		DROP(finish-start);	/* drop the operand */
10921573Srgrimes		break;
10931573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
10941573Srgrimes	case REP(0, N):			/* as x{1,n}? */
10951573Srgrimes	case REP(0, INF):		/* as x{1,}? */
10961573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10971573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
10981573Srgrimes		repeat(p, start+1, 1, to);
10991573Srgrimes		ASTERN(OOR1, start);
11001573Srgrimes		AHEAD(start);			/* ... fix it */
11011573Srgrimes		EMIT(OOR2, 0);
11021573Srgrimes		AHEAD(THERE());
11031573Srgrimes		ASTERN(O_CH, THERETHERE());
11041573Srgrimes		break;
11051573Srgrimes	case REP(1, 1):			/* trivial case */
11061573Srgrimes		/* done */
11071573Srgrimes		break;
11081573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
11091573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
11101573Srgrimes		INSERT(OCH_, start);
11111573Srgrimes		ASTERN(OOR1, start);
11121573Srgrimes		AHEAD(start);
11131573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
11141573Srgrimes		AHEAD(THERE());			/* ...so fix it */
11151573Srgrimes		ASTERN(O_CH, THERETHERE());
11161573Srgrimes		copy = dupl(p, start+1, finish+1);
11171573Srgrimes		assert(copy == finish+4);
11181573Srgrimes		repeat(p, copy, 1, to-1);
11191573Srgrimes		break;
11201573Srgrimes	case REP(1, INF):		/* as x+ */
11211573Srgrimes		INSERT(OPLUS_, start);
11221573Srgrimes		ASTERN(O_PLUS, start);
11231573Srgrimes		break;
11241573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
11251573Srgrimes		copy = dupl(p, start, finish);
11261573Srgrimes		repeat(p, copy, from-1, to-1);
11271573Srgrimes		break;
11281573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
11291573Srgrimes		copy = dupl(p, start, finish);
11301573Srgrimes		repeat(p, copy, from-1, to);
11311573Srgrimes		break;
11321573Srgrimes	default:			/* "can't happen" */
11331573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
11341573Srgrimes		break;
11351573Srgrimes	}
11361573Srgrimes}
11371573Srgrimes
11381573Srgrimes/*
11391573Srgrimes - seterr - set an error condition
114092889Sobrien == static int seterr(struct parse *p, int e);
11411573Srgrimes */
11421573Srgrimesstatic int			/* useless but makes type checking happy */
11431573Srgrimesseterr(p, e)
114492889Sobrienstruct parse *p;
11451573Srgrimesint e;
11461573Srgrimes{
11471573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
11481573Srgrimes		p->error = e;
11491573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
11501573Srgrimes	p->end = nuls;
11511573Srgrimes	return(0);		/* make the return value well-defined */
11521573Srgrimes}
11531573Srgrimes
11541573Srgrimes/*
11551573Srgrimes - allocset - allocate a set of characters for []
115692889Sobrien == static cset *allocset(struct parse *p);
11571573Srgrimes */
11581573Srgrimesstatic cset *
11591573Srgrimesallocset(p)
116092889Sobrienstruct parse *p;
11611573Srgrimes{
116292889Sobrien	int no = p->g->ncsets++;
116392889Sobrien	size_t nc;
116492889Sobrien	size_t nbytes;
116592889Sobrien	cset *cs;
116692889Sobrien	size_t css = (size_t)p->g->csetsize;
116792889Sobrien	int i;
11681573Srgrimes
11691573Srgrimes	if (no >= p->ncsalloc) {	/* need another column of space */
11701573Srgrimes		p->ncsalloc += CHAR_BIT;
11711573Srgrimes		nc = p->ncsalloc;
11721573Srgrimes		assert(nc % CHAR_BIT == 0);
11731573Srgrimes		nbytes = nc / CHAR_BIT * css;
11741573Srgrimes		if (p->g->sets == NULL)
11751573Srgrimes			p->g->sets = (cset *)malloc(nc * sizeof(cset));
11761573Srgrimes		else
117739327Simp			p->g->sets = (cset *)reallocf((char *)p->g->sets,
11781573Srgrimes							nc * sizeof(cset));
11791573Srgrimes		if (p->g->setbits == NULL)
11801573Srgrimes			p->g->setbits = (uch *)malloc(nbytes);
11811573Srgrimes		else {
118239327Simp			p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
11831573Srgrimes								nbytes);
11841573Srgrimes			/* xxx this isn't right if setbits is now NULL */
11851573Srgrimes			for (i = 0; i < no; i++)
11861573Srgrimes				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
11871573Srgrimes		}
11881573Srgrimes		if (p->g->sets != NULL && p->g->setbits != NULL)
11891573Srgrimes			(void) memset((char *)p->g->setbits + (nbytes - css),
11901573Srgrimes								0, css);
11911573Srgrimes		else {
11921573Srgrimes			no = 0;
11931573Srgrimes			SETERROR(REG_ESPACE);
11941573Srgrimes			/* caller's responsibility not to do set ops */
11951573Srgrimes		}
11961573Srgrimes	}
11971573Srgrimes
11981573Srgrimes	assert(p->g->sets != NULL);	/* xxx */
11991573Srgrimes	cs = &p->g->sets[no];
12001573Srgrimes	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
12011573Srgrimes	cs->mask = 1 << ((no) % CHAR_BIT);
12021573Srgrimes	cs->hash = 0;
12031573Srgrimes
12041573Srgrimes	return(cs);
12051573Srgrimes}
12061573Srgrimes
12071573Srgrimes/*
12081573Srgrimes - freeset - free a now-unused set
120992889Sobrien == static void freeset(struct parse *p, cset *cs);
12101573Srgrimes */
12111573Srgrimesstatic void
12121573Srgrimesfreeset(p, cs)
121392889Sobrienstruct parse *p;
121492889Sobriencset *cs;
12151573Srgrimes{
121692889Sobrien	int i;
121792889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
121892889Sobrien	size_t css = (size_t)p->g->csetsize;
12191573Srgrimes
12201573Srgrimes	for (i = 0; i < css; i++)
12211573Srgrimes		CHsub(cs, i);
12221573Srgrimes	if (cs == top-1)	/* recover only the easy case */
12231573Srgrimes		p->g->ncsets--;
12241573Srgrimes}
12251573Srgrimes
12261573Srgrimes/*
12271573Srgrimes - freezeset - final processing on a set of characters
122892889Sobrien == static int freezeset(struct parse *p, cset *cs);
12291573Srgrimes *
12301573Srgrimes * The main task here is merging identical sets.  This is usually a waste
12311573Srgrimes * of time (although the hash code minimizes the overhead), but can win
12321573Srgrimes * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
12331573Srgrimes * is done using addition rather than xor -- all ASCII [aA] sets xor to
12341573Srgrimes * the same value!
12351573Srgrimes */
12361573Srgrimesstatic int			/* set number */
12371573Srgrimesfreezeset(p, cs)
123892889Sobrienstruct parse *p;
123992889Sobriencset *cs;
12401573Srgrimes{
124192889Sobrien	short h = cs->hash;
124292889Sobrien	int i;
124392889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
124492889Sobrien	cset *cs2;
124592889Sobrien	size_t css = (size_t)p->g->csetsize;
12461573Srgrimes
12471573Srgrimes	/* look for an earlier one which is the same */
12481573Srgrimes	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
12491573Srgrimes		if (cs2->hash == h && cs2 != cs) {
12501573Srgrimes			/* maybe */
12511573Srgrimes			for (i = 0; i < css; i++)
12521573Srgrimes				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
12531573Srgrimes					break;		/* no */
12541573Srgrimes			if (i == css)
12551573Srgrimes				break;			/* yes */
12561573Srgrimes		}
12571573Srgrimes
12581573Srgrimes	if (cs2 < top) {	/* found one */
12591573Srgrimes		freeset(p, cs);
12601573Srgrimes		cs = cs2;
12611573Srgrimes	}
12621573Srgrimes
12631573Srgrimes	return((int)(cs - p->g->sets));
12641573Srgrimes}
12651573Srgrimes
12661573Srgrimes/*
12671573Srgrimes - firstch - return first character in a set (which must have at least one)
126892889Sobrien == static int firstch(struct parse *p, cset *cs);
12691573Srgrimes */
12701573Srgrimesstatic int			/* character; there is no "none" value */
12711573Srgrimesfirstch(p, cs)
127292889Sobrienstruct parse *p;
127392889Sobriencset *cs;
12741573Srgrimes{
127592889Sobrien	int i;
127692889Sobrien	size_t css = (size_t)p->g->csetsize;
12771573Srgrimes
12781573Srgrimes	for (i = 0; i < css; i++)
12791573Srgrimes		if (CHIN(cs, i))
128049094Sache			return((char)i);
12811573Srgrimes	assert(never);
12821573Srgrimes	return(0);		/* arbitrary */
12831573Srgrimes}
12841573Srgrimes
12851573Srgrimes/*
12861573Srgrimes - nch - number of characters in a set
128792889Sobrien == static int nch(struct parse *p, cset *cs);
12881573Srgrimes */
12891573Srgrimesstatic int
12901573Srgrimesnch(p, cs)
129192889Sobrienstruct parse *p;
129292889Sobriencset *cs;
12931573Srgrimes{
129492889Sobrien	int i;
129592889Sobrien	size_t css = (size_t)p->g->csetsize;
129692889Sobrien	int n = 0;
12971573Srgrimes
12981573Srgrimes	for (i = 0; i < css; i++)
12991573Srgrimes		if (CHIN(cs, i))
13001573Srgrimes			n++;
13011573Srgrimes	return(n);
13021573Srgrimes}
13031573Srgrimes
13041573Srgrimes/*
13051573Srgrimes - dupl - emit a duplicate of a bunch of sops
130692889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
13071573Srgrimes */
13081573Srgrimesstatic sopno			/* start of duplicate */
13091573Srgrimesdupl(p, start, finish)
131092889Sobrienstruct parse *p;
13111573Srgrimessopno start;			/* from here */
13121573Srgrimessopno finish;			/* to this less one */
13131573Srgrimes{
131492889Sobrien	sopno ret = HERE();
131592889Sobrien	sopno len = finish - start;
13161573Srgrimes
13171573Srgrimes	assert(finish >= start);
13181573Srgrimes	if (len == 0)
13191573Srgrimes		return(ret);
13201573Srgrimes	enlarge(p, p->ssize + len);	/* this many unexpected additions */
13211573Srgrimes	assert(p->ssize >= p->slen + len);
13221573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
13231573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
13241573Srgrimes	p->slen += len;
13251573Srgrimes	return(ret);
13261573Srgrimes}
13271573Srgrimes
13281573Srgrimes/*
13291573Srgrimes - doemit - emit a strip operator
133092889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
13311573Srgrimes *
13321573Srgrimes * It might seem better to implement this as a macro with a function as
13331573Srgrimes * hard-case backup, but it's just too big and messy unless there are
13341573Srgrimes * some changes to the data structures.  Maybe later.
13351573Srgrimes */
13361573Srgrimesstatic void
13371573Srgrimesdoemit(p, op, opnd)
133892889Sobrienstruct parse *p;
13391573Srgrimessop op;
13401573Srgrimessize_t opnd;
13411573Srgrimes{
13421573Srgrimes	/* avoid making error situations worse */
13431573Srgrimes	if (p->error != 0)
13441573Srgrimes		return;
13451573Srgrimes
13461573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
13471573Srgrimes	assert(opnd < 1<<OPSHIFT);
13481573Srgrimes
13491573Srgrimes	/* deal with undersized strip */
13501573Srgrimes	if (p->slen >= p->ssize)
13511573Srgrimes		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
13521573Srgrimes	assert(p->slen < p->ssize);
13531573Srgrimes
13541573Srgrimes	/* finally, it's all reduced to the easy case */
13551573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
13561573Srgrimes}
13571573Srgrimes
13581573Srgrimes/*
13591573Srgrimes - doinsert - insert a sop into the strip
136092889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
13611573Srgrimes */
13621573Srgrimesstatic void
13631573Srgrimesdoinsert(p, op, opnd, pos)
136492889Sobrienstruct parse *p;
13651573Srgrimessop op;
13661573Srgrimessize_t opnd;
13671573Srgrimessopno pos;
13681573Srgrimes{
136992889Sobrien	sopno sn;
137092889Sobrien	sop s;
137192889Sobrien	int i;
13721573Srgrimes
13731573Srgrimes	/* avoid making error situations worse */
13741573Srgrimes	if (p->error != 0)
13751573Srgrimes		return;
13761573Srgrimes
13771573Srgrimes	sn = HERE();
13781573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
13791573Srgrimes	assert(HERE() == sn+1);
13801573Srgrimes	s = p->strip[sn];
13811573Srgrimes
13821573Srgrimes	/* adjust paren pointers */
13831573Srgrimes	assert(pos > 0);
13841573Srgrimes	for (i = 1; i < NPAREN; i++) {
13851573Srgrimes		if (p->pbegin[i] >= pos) {
13861573Srgrimes			p->pbegin[i]++;
13871573Srgrimes		}
13881573Srgrimes		if (p->pend[i] >= pos) {
13891573Srgrimes			p->pend[i]++;
13901573Srgrimes		}
13911573Srgrimes	}
13921573Srgrimes
13931573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13941573Srgrimes						(HERE()-pos-1)*sizeof(sop));
13951573Srgrimes	p->strip[pos] = s;
13961573Srgrimes}
13971573Srgrimes
13981573Srgrimes/*
13991573Srgrimes - dofwd - complete a forward reference
140092889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
14011573Srgrimes */
14021573Srgrimesstatic void
14031573Srgrimesdofwd(p, pos, value)
140492889Sobrienstruct parse *p;
140592889Sobriensopno pos;
14061573Srgrimessop value;
14071573Srgrimes{
14081573Srgrimes	/* avoid making error situations worse */
14091573Srgrimes	if (p->error != 0)
14101573Srgrimes		return;
14111573Srgrimes
14121573Srgrimes	assert(value < 1<<OPSHIFT);
14131573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
14141573Srgrimes}
14151573Srgrimes
14161573Srgrimes/*
14171573Srgrimes - enlarge - enlarge the strip
141892889Sobrien == static void enlarge(struct parse *p, sopno size);
14191573Srgrimes */
14201573Srgrimesstatic void
14211573Srgrimesenlarge(p, size)
142292889Sobrienstruct parse *p;
142392889Sobriensopno size;
14241573Srgrimes{
142592889Sobrien	sop *sp;
14261573Srgrimes
14271573Srgrimes	if (p->ssize >= size)
14281573Srgrimes		return;
14291573Srgrimes
14301573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
14311573Srgrimes	if (sp == NULL) {
14321573Srgrimes		SETERROR(REG_ESPACE);
14331573Srgrimes		return;
14341573Srgrimes	}
14351573Srgrimes	p->strip = sp;
14361573Srgrimes	p->ssize = size;
14371573Srgrimes}
14381573Srgrimes
14391573Srgrimes/*
14401573Srgrimes - stripsnug - compact the strip
144192889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
14421573Srgrimes */
14431573Srgrimesstatic void
14441573Srgrimesstripsnug(p, g)
144592889Sobrienstruct parse *p;
144692889Sobrienstruct re_guts *g;
14471573Srgrimes{
14481573Srgrimes	g->nstates = p->slen;
14491573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
14501573Srgrimes	if (g->strip == NULL) {
14511573Srgrimes		SETERROR(REG_ESPACE);
14521573Srgrimes		g->strip = p->strip;
14531573Srgrimes	}
14541573Srgrimes}
14551573Srgrimes
14561573Srgrimes/*
14571573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
145892889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
14591573Srgrimes *
14601573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
14611573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
14621573Srgrimes * of the interesting cases.
14631573Srgrimes *
14641573Srgrimes * Note that must and mlen got initialized during setup.
14651573Srgrimes */
14661573Srgrimesstatic void
14671573Srgrimesfindmust(p, g)
14681573Srgrimesstruct parse *p;
146992889Sobrienstruct re_guts *g;
14701573Srgrimes{
147192889Sobrien	sop *scan;
14721573Srgrimes	sop *start;
147392889Sobrien	sop *newstart;
147492889Sobrien	sopno newlen;
147592889Sobrien	sop s;
147692889Sobrien	char *cp;
147792889Sobrien	sopno i;
147862391Sdcs	int offset;
14791573Srgrimes
14801573Srgrimes	/* avoid making error situations worse */
14811573Srgrimes	if (p->error != 0)
14821573Srgrimes		return;
14831573Srgrimes
14841573Srgrimes	/* find the longest OCHAR sequence in strip */
14851573Srgrimes	newlen = 0;
148662391Sdcs	offset = 0;
148762391Sdcs	g->moffset = 0;
14881573Srgrimes	scan = g->strip + 1;
14891573Srgrimes	do {
14901573Srgrimes		s = *scan++;
14911573Srgrimes		switch (OP(s)) {
14921573Srgrimes		case OCHAR:		/* sequence member */
14931573Srgrimes			if (newlen == 0)		/* new sequence */
14941573Srgrimes				newstart = scan - 1;
14951573Srgrimes			newlen++;
14961573Srgrimes			break;
14971573Srgrimes		case OPLUS_:		/* things that don't break one */
14981573Srgrimes		case OLPAREN:
14991573Srgrimes		case ORPAREN:
15001573Srgrimes			break;
15011573Srgrimes		case OQUEST_:		/* things that must be skipped */
15021573Srgrimes		case OCH_:
1503131973Stjr			offset = altoffset(scan, offset);
15041573Srgrimes			scan--;
15051573Srgrimes			do {
15061573Srgrimes				scan += OPND(s);
15071573Srgrimes				s = *scan;
15081573Srgrimes				/* assert() interferes w debug printouts */
15091573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
15101573Srgrimes							OP(s) != OOR2) {
15111573Srgrimes					g->iflags |= BAD;
15121573Srgrimes					return;
15131573Srgrimes				}
15141573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1515102411Scharnier			/* FALLTHROUGH */
151662391Sdcs		case OBOW:		/* things that break a sequence */
151762391Sdcs		case OEOW:
151862391Sdcs		case OBOL:
151962391Sdcs		case OEOL:
152062391Sdcs		case O_QUEST:
152162391Sdcs		case O_CH:
152262391Sdcs		case OEND:
15231573Srgrimes			if (newlen > g->mlen) {		/* ends one */
15241573Srgrimes				start = newstart;
15251573Srgrimes				g->mlen = newlen;
152662391Sdcs				if (offset > -1) {
152762391Sdcs					g->moffset += offset;
152862391Sdcs					offset = newlen;
152962391Sdcs				} else
153062391Sdcs					g->moffset = offset;
153162391Sdcs			} else {
153262391Sdcs				if (offset > -1)
153362391Sdcs					offset += newlen;
15341573Srgrimes			}
15351573Srgrimes			newlen = 0;
15361573Srgrimes			break;
153762391Sdcs		case OANY:
153862391Sdcs			if (newlen > g->mlen) {		/* ends one */
153962391Sdcs				start = newstart;
154062391Sdcs				g->mlen = newlen;
154162391Sdcs				if (offset > -1) {
154262391Sdcs					g->moffset += offset;
154362391Sdcs					offset = newlen;
154462391Sdcs				} else
154562391Sdcs					g->moffset = offset;
154662391Sdcs			} else {
154762391Sdcs				if (offset > -1)
154862391Sdcs					offset += newlen;
154962391Sdcs			}
155062391Sdcs			if (offset > -1)
155162391Sdcs				offset++;
155262391Sdcs			newlen = 0;
155362391Sdcs			break;
155462391Sdcs		case OANYOF:		/* may or may not invalidate offset */
155562391Sdcs			/* First, everything as OANY */
155662391Sdcs			if (newlen > g->mlen) {		/* ends one */
155762391Sdcs				start = newstart;
155862391Sdcs				g->mlen = newlen;
155962391Sdcs				if (offset > -1) {
156062391Sdcs					g->moffset += offset;
156162391Sdcs					offset = newlen;
156262391Sdcs				} else
156362391Sdcs					g->moffset = offset;
156462391Sdcs			} else {
156562391Sdcs				if (offset > -1)
156662391Sdcs					offset += newlen;
156762391Sdcs			}
156862391Sdcs			if (offset > -1)
156962391Sdcs				offset++;
157062391Sdcs			newlen = 0;
157162391Sdcs			break;
157262391Sdcs		default:
157362391Sdcs			/* Anything here makes it impossible or too hard
157462391Sdcs			 * to calculate the offset -- so we give up;
157562391Sdcs			 * save the last known good offset, in case the
157662391Sdcs			 * must sequence doesn't occur later.
157762391Sdcs			 */
157862391Sdcs			if (newlen > g->mlen) {		/* ends one */
157962391Sdcs				start = newstart;
158062391Sdcs				g->mlen = newlen;
158162391Sdcs				if (offset > -1)
158262391Sdcs					g->moffset += offset;
158362391Sdcs				else
158462391Sdcs					g->moffset = offset;
158562391Sdcs			}
158662391Sdcs			offset = -1;
158762391Sdcs			newlen = 0;
158862391Sdcs			break;
15891573Srgrimes		}
15901573Srgrimes	} while (OP(s) != OEND);
15911573Srgrimes
159262391Sdcs	if (g->mlen == 0) {		/* there isn't one */
159362391Sdcs		g->moffset = -1;
15941573Srgrimes		return;
159562391Sdcs	}
15961573Srgrimes
15971573Srgrimes	/* turn it into a character string */
15981573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
15991573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
16001573Srgrimes		g->mlen = 0;
160162391Sdcs		g->moffset = -1;
16021573Srgrimes		return;
16031573Srgrimes	}
16041573Srgrimes	cp = g->must;
16051573Srgrimes	scan = start;
16061573Srgrimes	for (i = g->mlen; i > 0; i--) {
16071573Srgrimes		while (OP(s = *scan++) != OCHAR)
16081573Srgrimes			continue;
16091573Srgrimes		assert(cp < g->must + g->mlen);
16101573Srgrimes		*cp++ = (char)OPND(s);
16111573Srgrimes	}
16121573Srgrimes	assert(cp == g->must + g->mlen);
16131573Srgrimes	*cp++ = '\0';		/* just on general principles */
16141573Srgrimes}
16151573Srgrimes
16161573Srgrimes/*
161762391Sdcs - altoffset - choose biggest offset among multiple choices
1618131973Stjr == static int altoffset(sop *scan, int offset);
161962391Sdcs *
162062391Sdcs * Compute, recursively if necessary, the largest offset among multiple
162162391Sdcs * re paths.
162262391Sdcs */
162362391Sdcsstatic int
1624131973Stjraltoffset(scan, offset)
162562391Sdcssop *scan;
162662391Sdcsint offset;
162762391Sdcs{
162862391Sdcs	int largest;
162962391Sdcs	int try;
163062391Sdcs	sop s;
163162391Sdcs
163262391Sdcs	/* If we gave up already on offsets, return */
163362391Sdcs	if (offset == -1)
163462391Sdcs		return -1;
163562391Sdcs
163662391Sdcs	largest = 0;
163762391Sdcs	try = 0;
163862391Sdcs	s = *scan++;
163962391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
164062391Sdcs		switch (OP(s)) {
164162391Sdcs		case OOR1:
164262391Sdcs			if (try > largest)
164362391Sdcs				largest = try;
164462391Sdcs			try = 0;
164562391Sdcs			break;
164662391Sdcs		case OQUEST_:
164762391Sdcs		case OCH_:
1648131973Stjr			try = altoffset(scan, try);
164962391Sdcs			if (try == -1)
165062391Sdcs				return -1;
165162391Sdcs			scan--;
165262391Sdcs			do {
165362391Sdcs				scan += OPND(s);
165462391Sdcs				s = *scan;
165562391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
165662391Sdcs							OP(s) != OOR2)
165762391Sdcs					return -1;
165862391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
165962855Sdcs			/* We must skip to the next position, or we'll
166062855Sdcs			 * leave altoffset() too early.
166162855Sdcs			 */
166262855Sdcs			scan++;
166362391Sdcs			break;
166462391Sdcs		case OANYOF:
166562391Sdcs		case OCHAR:
166662391Sdcs		case OANY:
166762391Sdcs			try++;
166862391Sdcs		case OBOW:
166962391Sdcs		case OEOW:
167062391Sdcs		case OLPAREN:
167162391Sdcs		case ORPAREN:
167262391Sdcs		case OOR2:
167362391Sdcs			break;
167462391Sdcs		default:
167562391Sdcs			try = -1;
167662391Sdcs			break;
167762391Sdcs		}
167862391Sdcs		if (try == -1)
167962391Sdcs			return -1;
168062391Sdcs		s = *scan++;
168162391Sdcs	}
168262391Sdcs
168362391Sdcs	if (try > largest)
168462391Sdcs		largest = try;
168562391Sdcs
168662391Sdcs	return largest+offset;
168762391Sdcs}
168862391Sdcs
168962391Sdcs/*
169062232Sdcs - computejumps - compute char jumps for BM scan
169192889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
169262232Sdcs *
169362232Sdcs * This algorithm assumes g->must exists and is has size greater than
169462232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
169562232Sdcs * Sara Baase.
169662232Sdcs *
169762232Sdcs * A char jump is the number of characters one needs to jump based on
169862232Sdcs * the value of the character from the text that was mismatched.
169962232Sdcs */
170062232Sdcsstatic void
170162232Sdcscomputejumps(p, g)
170262232Sdcsstruct parse *p;
170362232Sdcsstruct re_guts *g;
170462232Sdcs{
170562232Sdcs	int ch;
170662232Sdcs	int mindex;
170762232Sdcs
170862232Sdcs	/* Avoid making errors worse */
170962232Sdcs	if (p->error != 0)
171062232Sdcs		return;
171162232Sdcs
171262848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
171362232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
171462232Sdcs		return;
171562754Sdcs	/* Adjust for signed chars, if necessary */
171662754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
171762232Sdcs
171862232Sdcs	/* If the character does not exist in the pattern, the jump
171962232Sdcs	 * is equal to the number of characters in the pattern.
172062232Sdcs	 */
172162754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
172262232Sdcs		g->charjump[ch] = g->mlen;
172362232Sdcs
172462232Sdcs	/* If the character does exist, compute the jump that would
172562232Sdcs	 * take us to the last character in the pattern equal to it
172662232Sdcs	 * (notice that we match right to left, so that last character
172762232Sdcs	 * is the first one that would be matched).
172862232Sdcs	 */
172962232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
1730111010Snectar		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
173162232Sdcs}
173262232Sdcs
173362232Sdcs/*
173462232Sdcs - computematchjumps - compute match jumps for BM scan
173592889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
173662232Sdcs *
173762232Sdcs * This algorithm assumes g->must exists and is has size greater than
173862232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
173962232Sdcs * Sara Baase.
174062232Sdcs *
174162232Sdcs * A match jump is the number of characters one needs to advance based
174262232Sdcs * on the already-matched suffix.
174362232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
174462232Sdcs * the search algorithm works.
174562232Sdcs */
174662232Sdcsstatic void
174762232Sdcscomputematchjumps(p, g)
174862232Sdcsstruct parse *p;
174962232Sdcsstruct re_guts *g;
175062232Sdcs{
175162232Sdcs	int mindex;		/* General "must" iterator */
175262232Sdcs	int suffix;		/* Keeps track of matching suffix */
175362232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
175462232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
175562232Sdcs				 * such that i+1...mlen is a substring
175662232Sdcs				 * of k+1...k+mlen-i-1
175762232Sdcs				 */
175862232Sdcs
175962232Sdcs	/* Avoid making errors worse */
176062232Sdcs	if (p->error != 0)
176162232Sdcs		return;
176262232Sdcs
176362848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
176462232Sdcs	if (pmatches == NULL) {
176562232Sdcs		g->matchjump = NULL;
176662232Sdcs		return;
176762232Sdcs	}
176862232Sdcs
176962848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
177062232Sdcs	if (g->matchjump == NULL)	/* Not a fatal error */
177162232Sdcs		return;
177262232Sdcs
177362232Sdcs	/* Set maximum possible jump for each character in the pattern */
177462232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
177562232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
177662232Sdcs
177762232Sdcs	/* Compute pmatches[] */
177862232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
177962232Sdcs	    mindex--, suffix--) {
178062232Sdcs		pmatches[mindex] = suffix;
178162232Sdcs
178262232Sdcs		/* If a mismatch is found, interrupting the substring,
178362232Sdcs		 * compute the matchjump for that position. If no
178462232Sdcs		 * mismatch is found, then a text substring mismatched
178562232Sdcs		 * against the suffix will also mismatch against the
178662232Sdcs		 * substring.
178762232Sdcs		 */
178862232Sdcs		while (suffix < g->mlen
178962232Sdcs		    && g->must[mindex] != g->must[suffix]) {
179062232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
179162232Sdcs			    g->mlen - mindex - 1);
179262232Sdcs			suffix = pmatches[suffix];
179362232Sdcs		}
179462232Sdcs	}
179562232Sdcs
179662232Sdcs	/* Compute the matchjump up to the last substring found to jump
179762232Sdcs	 * to the beginning of the largest must pattern prefix matching
179862232Sdcs	 * it's own suffix.
179962232Sdcs	 */
180062232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
180162232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
180262232Sdcs		    g->mlen + suffix - mindex);
180362232Sdcs
180462232Sdcs        ssuffix = pmatches[suffix];
180562391Sdcs        while (suffix < g->mlen) {
180662673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
180762232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
180862232Sdcs			    g->mlen + ssuffix - suffix);
180962232Sdcs                        suffix++;
181062232Sdcs                }
181186208Sdcs		if (suffix < g->mlen)
181286208Sdcs                	ssuffix = pmatches[ssuffix];
181362232Sdcs        }
181462232Sdcs
181562232Sdcs	free(pmatches);
181662232Sdcs}
181762232Sdcs
181862232Sdcs/*
18191573Srgrimes - pluscount - count + nesting
182092889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
18211573Srgrimes */
18221573Srgrimesstatic sopno			/* nesting depth */
18231573Srgrimespluscount(p, g)
18241573Srgrimesstruct parse *p;
182592889Sobrienstruct re_guts *g;
18261573Srgrimes{
182792889Sobrien	sop *scan;
182892889Sobrien	sop s;
182992889Sobrien	sopno plusnest = 0;
183092889Sobrien	sopno maxnest = 0;
18311573Srgrimes
18321573Srgrimes	if (p->error != 0)
18331573Srgrimes		return(0);	/* there may not be an OEND */
18341573Srgrimes
18351573Srgrimes	scan = g->strip + 1;
18361573Srgrimes	do {
18371573Srgrimes		s = *scan++;
18381573Srgrimes		switch (OP(s)) {
18391573Srgrimes		case OPLUS_:
18401573Srgrimes			plusnest++;
18411573Srgrimes			break;
18421573Srgrimes		case O_PLUS:
18431573Srgrimes			if (plusnest > maxnest)
18441573Srgrimes				maxnest = plusnest;
18451573Srgrimes			plusnest--;
18461573Srgrimes			break;
18471573Srgrimes		}
18481573Srgrimes	} while (OP(s) != OEND);
18491573Srgrimes	if (plusnest != 0)
18501573Srgrimes		g->iflags |= BAD;
18511573Srgrimes	return(maxnest);
18521573Srgrimes}
1853