regcomp.c revision 92889
11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
61573Srgrimes * This code is derived from software contributed to Berkeley by
71573Srgrimes * Henry Spencer.
81573Srgrimes *
91573Srgrimes * Redistribution and use in source and binary forms, with or without
101573Srgrimes * modification, are permitted provided that the following conditions
111573Srgrimes * are met:
121573Srgrimes * 1. Redistributions of source code must retain the above copyright
131573Srgrimes *    notice, this list of conditions and the following disclaimer.
141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
151573Srgrimes *    notice, this list of conditions and the following disclaimer in the
161573Srgrimes *    documentation and/or other materials provided with the distribution.
171573Srgrimes * 3. All advertising materials mentioning features or use of this software
181573Srgrimes *    must display the following acknowledgement:
191573Srgrimes *	This product includes software developed by the University of
201573Srgrimes *	California, Berkeley and its contributors.
211573Srgrimes * 4. Neither the name of the University nor the names of its contributors
221573Srgrimes *    may be used to endorse or promote products derived from this software
231573Srgrimes *    without specific prior written permission.
241573Srgrimes *
251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
281573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
351573Srgrimes * SUCH DAMAGE.
361573Srgrimes *
371573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
3862232Sdcs *
3962232Sdcs * $FreeBSD: head/lib/libc/regex/regcomp.c 92889 2002-03-21 18:49:23Z obrien $
401573Srgrimes */
411573Srgrimes
421573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
431573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
441573Srgrimes#endif /* LIBC_SCCS and not lint */
451573Srgrimes
461573Srgrimes#include <sys/types.h>
471573Srgrimes#include <stdio.h>
481573Srgrimes#include <string.h>
491573Srgrimes#include <ctype.h>
501573Srgrimes#include <limits.h>
511573Srgrimes#include <stdlib.h>
521573Srgrimes#include <regex.h>
531573Srgrimes
5419277Sache#include "collate.h"
5519277Sache
561573Srgrimes#include "utils.h"
571573Srgrimes#include "regex2.h"
581573Srgrimes
591573Srgrimes#include "cclass.h"
601573Srgrimes#include "cname.h"
611573Srgrimes
621573Srgrimes/*
631573Srgrimes * parse structure, passed up and down to avoid global variables and
641573Srgrimes * other clumsinesses
651573Srgrimes */
661573Srgrimesstruct parse {
671573Srgrimes	char *next;		/* next character in RE */
681573Srgrimes	char *end;		/* end of string (-> NUL normally) */
691573Srgrimes	int error;		/* has an error been seen? */
701573Srgrimes	sop *strip;		/* malloced strip */
711573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
721573Srgrimes	sopno slen;		/* malloced strip length (used) */
731573Srgrimes	int ncsalloc;		/* number of csets allocated */
741573Srgrimes	struct re_guts *g;
751573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
761573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
771573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
781573Srgrimes};
791573Srgrimes
801573Srgrimes/* ========= begin header generated by ./mkh ========= */
811573Srgrimes#ifdef __cplusplus
821573Srgrimesextern "C" {
831573Srgrimes#endif
841573Srgrimes
851573Srgrimes/* === regcomp.c === */
861573Srgrimesstatic void p_ere __P((struct parse *p, int stop));
871573Srgrimesstatic void p_ere_exp __P((struct parse *p));
881573Srgrimesstatic void p_str __P((struct parse *p));
891573Srgrimesstatic void p_bre __P((struct parse *p, int end1, int end2));
901573Srgrimesstatic int p_simp_re __P((struct parse *p, int starordinary));
911573Srgrimesstatic int p_count __P((struct parse *p));
921573Srgrimesstatic void p_bracket __P((struct parse *p));
931573Srgrimesstatic void p_b_term __P((struct parse *p, cset *cs));
941573Srgrimesstatic void p_b_cclass __P((struct parse *p, cset *cs));
951573Srgrimesstatic void p_b_eclass __P((struct parse *p, cset *cs));
961573Srgrimesstatic char p_b_symbol __P((struct parse *p));
971573Srgrimesstatic char p_b_coll_elem __P((struct parse *p, int endc));
981573Srgrimesstatic char othercase __P((int ch));
991573Srgrimesstatic void bothcases __P((struct parse *p, int ch));
1001573Srgrimesstatic void ordinary __P((struct parse *p, int ch));
1011573Srgrimesstatic void nonnewline __P((struct parse *p));
1021573Srgrimesstatic void repeat __P((struct parse *p, sopno start, int from, int to));
1031573Srgrimesstatic int seterr __P((struct parse *p, int e));
1041573Srgrimesstatic cset *allocset __P((struct parse *p));
1051573Srgrimesstatic void freeset __P((struct parse *p, cset *cs));
1061573Srgrimesstatic int freezeset __P((struct parse *p, cset *cs));
1071573Srgrimesstatic int firstch __P((struct parse *p, cset *cs));
1081573Srgrimesstatic int nch __P((struct parse *p, cset *cs));
1091573Srgrimesstatic void mcadd __P((struct parse *p, cset *cs, char *cp));
11017141Sjkh#if used
1111573Srgrimesstatic void mcsub __P((cset *cs, char *cp));
1121573Srgrimesstatic int mcin __P((cset *cs, char *cp));
1131573Srgrimesstatic char *mcfind __P((cset *cs, char *cp));
11417141Sjkh#endif
1151573Srgrimesstatic void mcinvert __P((struct parse *p, cset *cs));
1161573Srgrimesstatic void mccase __P((struct parse *p, cset *cs));
1171573Srgrimesstatic int isinsets __P((struct re_guts *g, int c));
1181573Srgrimesstatic int samesets __P((struct re_guts *g, int c1, int c2));
1191573Srgrimesstatic void categorize __P((struct parse *p, struct re_guts *g));
1201573Srgrimesstatic sopno dupl __P((struct parse *p, sopno start, sopno finish));
1211573Srgrimesstatic void doemit __P((struct parse *p, sop op, size_t opnd));
1221573Srgrimesstatic void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
1231573Srgrimesstatic void dofwd __P((struct parse *p, sopno pos, sop value));
1241573Srgrimesstatic void enlarge __P((struct parse *p, sopno size));
1251573Srgrimesstatic void stripsnug __P((struct parse *p, struct re_guts *g));
1261573Srgrimesstatic void findmust __P((struct parse *p, struct re_guts *g));
12762391Sdcsstatic int altoffset __P((sop *scan, int offset, int mccs));
12862232Sdcsstatic void computejumps __P((struct parse *p, struct re_guts *g));
12962232Sdcsstatic void computematchjumps __P((struct parse *p, struct re_guts *g));
1301573Srgrimesstatic sopno pluscount __P((struct parse *p, struct re_guts *g));
1311573Srgrimes
1321573Srgrimes#ifdef __cplusplus
1331573Srgrimes}
1341573Srgrimes#endif
1351573Srgrimes/* ========= end header generated by ./mkh ========= */
1361573Srgrimes
1371573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1381573Srgrimes
1391573Srgrimes/*
1401573Srgrimes * macros for use with parse structure
1411573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1421573Srgrimes */
1431573Srgrimes#define	PEEK()	(*p->next)
1441573Srgrimes#define	PEEK2()	(*(p->next+1))
1451573Srgrimes#define	MORE()	(p->next < p->end)
1461573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1471573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1481573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1491573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1501573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1511573Srgrimes#define	NEXT()	(p->next++)
1521573Srgrimes#define	NEXT2()	(p->next += 2)
1531573Srgrimes#define	NEXTn(n)	(p->next += (n))
1541573Srgrimes#define	GETNEXT()	(*p->next++)
1551573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1561573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1571573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1581573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1591573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1601573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1611573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1621573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1631573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1641573Srgrimes#define	HERE()		(p->slen)
1651573Srgrimes#define	THERE()		(p->slen - 1)
1661573Srgrimes#define	THERETHERE()	(p->slen - 2)
1671573Srgrimes#define	DROP(n)	(p->slen -= (n))
1681573Srgrimes
1691573Srgrimes#ifndef NDEBUG
1701573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1711573Srgrimes#else
1721573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1731573Srgrimes#endif
1741573Srgrimes
17562232Sdcs/* Macro used by computejump()/computematchjump() */
17662232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
17762232Sdcs
1781573Srgrimes/*
1791573Srgrimes - regcomp - interface for parser and compilation
1801573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1811573Srgrimes = #define	REG_BASIC	0000
1821573Srgrimes = #define	REG_EXTENDED	0001
1831573Srgrimes = #define	REG_ICASE	0002
1841573Srgrimes = #define	REG_NOSUB	0004
1851573Srgrimes = #define	REG_NEWLINE	0010
1861573Srgrimes = #define	REG_NOSPEC	0020
1871573Srgrimes = #define	REG_PEND	0040
1881573Srgrimes = #define	REG_DUMP	0200
1891573Srgrimes */
1901573Srgrimesint				/* 0 success, otherwise REG_something */
1911573Srgrimesregcomp(preg, pattern, cflags)
1921573Srgrimesregex_t *preg;
1931573Srgrimesconst char *pattern;
1941573Srgrimesint cflags;
1951573Srgrimes{
1961573Srgrimes	struct parse pa;
19792889Sobrien	struct re_guts *g;
19892889Sobrien	struct parse *p = &pa;
19992889Sobrien	int i;
20092889Sobrien	size_t len;
2011573Srgrimes#ifdef REDEBUG
2021573Srgrimes#	define	GOODFLAGS(f)	(f)
2031573Srgrimes#else
2041573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
2051573Srgrimes#endif
2061573Srgrimes
2071573Srgrimes	cflags = GOODFLAGS(cflags);
2081573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
2091573Srgrimes		return(REG_INVARG);
2101573Srgrimes
2111573Srgrimes	if (cflags&REG_PEND) {
2121573Srgrimes		if (preg->re_endp < pattern)
2131573Srgrimes			return(REG_INVARG);
2141573Srgrimes		len = preg->re_endp - pattern;
2151573Srgrimes	} else
2161573Srgrimes		len = strlen((char *)pattern);
2171573Srgrimes
2181573Srgrimes	/* do the mallocs early so failure handling is easy */
2191573Srgrimes	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
2201573Srgrimes							(NC-1)*sizeof(cat_t));
2211573Srgrimes	if (g == NULL)
2221573Srgrimes		return(REG_ESPACE);
2231573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
2241573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2251573Srgrimes	p->slen = 0;
2261573Srgrimes	if (p->strip == NULL) {
2271573Srgrimes		free((char *)g);
2281573Srgrimes		return(REG_ESPACE);
2291573Srgrimes	}
2301573Srgrimes
2311573Srgrimes	/* set things up */
2321573Srgrimes	p->g = g;
2331573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2341573Srgrimes	p->end = p->next + len;
2351573Srgrimes	p->error = 0;
2361573Srgrimes	p->ncsalloc = 0;
2371573Srgrimes	for (i = 0; i < NPAREN; i++) {
2381573Srgrimes		p->pbegin[i] = 0;
2391573Srgrimes		p->pend[i] = 0;
2401573Srgrimes	}
2411573Srgrimes	g->csetsize = NC;
2421573Srgrimes	g->sets = NULL;
2431573Srgrimes	g->setbits = NULL;
2441573Srgrimes	g->ncsets = 0;
2451573Srgrimes	g->cflags = cflags;
2461573Srgrimes	g->iflags = 0;
2471573Srgrimes	g->nbol = 0;
2481573Srgrimes	g->neol = 0;
2491573Srgrimes	g->must = NULL;
25062391Sdcs	g->moffset = -1;
25162263Sdcs	g->charjump = NULL;
25262263Sdcs	g->matchjump = NULL;
2531573Srgrimes	g->mlen = 0;
2541573Srgrimes	g->nsub = 0;
2551573Srgrimes	g->ncategories = 1;	/* category 0 is "everything else" */
2561573Srgrimes	g->categories = &g->catspace[-(CHAR_MIN)];
2571573Srgrimes	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
2581573Srgrimes	g->backrefs = 0;
2591573Srgrimes
2601573Srgrimes	/* do it */
2611573Srgrimes	EMIT(OEND, 0);
2621573Srgrimes	g->firststate = THERE();
2631573Srgrimes	if (cflags&REG_EXTENDED)
2641573Srgrimes		p_ere(p, OUT);
2651573Srgrimes	else if (cflags&REG_NOSPEC)
2661573Srgrimes		p_str(p);
2671573Srgrimes	else
2681573Srgrimes		p_bre(p, OUT, OUT);
2691573Srgrimes	EMIT(OEND, 0);
2701573Srgrimes	g->laststate = THERE();
2711573Srgrimes
2721573Srgrimes	/* tidy up loose ends and fill things in */
2731573Srgrimes	categorize(p, g);
2741573Srgrimes	stripsnug(p, g);
2751573Srgrimes	findmust(p, g);
27662232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
27762232Sdcs	 * than three characters
27862232Sdcs	 */
27962232Sdcs	if(g->mlen > 3) {
28062232Sdcs		computejumps(p, g);
28162232Sdcs		computematchjumps(p, g);
28262755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
28362232Sdcs			free(g->charjump);
28462232Sdcs			g->charjump = NULL;
28562232Sdcs		}
28662232Sdcs	}
2871573Srgrimes	g->nplus = pluscount(p, g);
2881573Srgrimes	g->magic = MAGIC2;
2891573Srgrimes	preg->re_nsub = g->nsub;
2901573Srgrimes	preg->re_g = g;
2911573Srgrimes	preg->re_magic = MAGIC1;
2921573Srgrimes#ifndef REDEBUG
2931573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2941573Srgrimes	if (g->iflags&BAD)
2951573Srgrimes		SETERROR(REG_ASSERT);
2961573Srgrimes#endif
2971573Srgrimes
2981573Srgrimes	/* win or lose, we're done */
2991573Srgrimes	if (p->error != 0)	/* lose */
3001573Srgrimes		regfree(preg);
3011573Srgrimes	return(p->error);
3021573Srgrimes}
3031573Srgrimes
3041573Srgrimes/*
3051573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
30692889Sobrien == static void p_ere(struct parse *p, int stop);
3071573Srgrimes */
3081573Srgrimesstatic void
3091573Srgrimesp_ere(p, stop)
31092889Sobrienstruct parse *p;
3111573Srgrimesint stop;			/* character this ERE should end at */
3121573Srgrimes{
31392889Sobrien	char c;
31492889Sobrien	sopno prevback;
31592889Sobrien	sopno prevfwd;
31692889Sobrien	sopno conc;
31792889Sobrien	int first = 1;		/* is this the first alternative? */
3181573Srgrimes
3191573Srgrimes	for (;;) {
3201573Srgrimes		/* do a bunch of concatenated expressions */
3211573Srgrimes		conc = HERE();
3221573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3231573Srgrimes			p_ere_exp(p);
32417141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3251573Srgrimes
3261573Srgrimes		if (!EAT('|'))
3271573Srgrimes			break;		/* NOTE BREAK OUT */
3281573Srgrimes
3291573Srgrimes		if (first) {
3301573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3311573Srgrimes			prevfwd = conc;
3321573Srgrimes			prevback = conc;
3331573Srgrimes			first = 0;
3341573Srgrimes		}
3351573Srgrimes		ASTERN(OOR1, prevback);
3361573Srgrimes		prevback = THERE();
3371573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3381573Srgrimes		prevfwd = HERE();
3391573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3401573Srgrimes	}
3411573Srgrimes
3421573Srgrimes	if (!first) {		/* tail-end fixups */
3431573Srgrimes		AHEAD(prevfwd);
3441573Srgrimes		ASTERN(O_CH, prevback);
3451573Srgrimes	}
3461573Srgrimes
3471573Srgrimes	assert(!MORE() || SEE(stop));
3481573Srgrimes}
3491573Srgrimes
3501573Srgrimes/*
3511573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
35292889Sobrien == static void p_ere_exp(struct parse *p);
3531573Srgrimes */
3541573Srgrimesstatic void
3551573Srgrimesp_ere_exp(p)
35692889Sobrienstruct parse *p;
3571573Srgrimes{
35892889Sobrien	char c;
35992889Sobrien	sopno pos;
36092889Sobrien	int count;
36192889Sobrien	int count2;
36292889Sobrien	sopno subno;
3631573Srgrimes	int wascaret = 0;
3641573Srgrimes
3651573Srgrimes	assert(MORE());		/* caller should have ensured this */
3661573Srgrimes	c = GETNEXT();
3671573Srgrimes
3681573Srgrimes	pos = HERE();
3691573Srgrimes	switch (c) {
3701573Srgrimes	case '(':
37117141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3721573Srgrimes		p->g->nsub++;
3731573Srgrimes		subno = p->g->nsub;
3741573Srgrimes		if (subno < NPAREN)
3751573Srgrimes			p->pbegin[subno] = HERE();
3761573Srgrimes		EMIT(OLPAREN, subno);
3771573Srgrimes		if (!SEE(')'))
3781573Srgrimes			p_ere(p, ')');
3791573Srgrimes		if (subno < NPAREN) {
3801573Srgrimes			p->pend[subno] = HERE();
3811573Srgrimes			assert(p->pend[subno] != 0);
3821573Srgrimes		}
3831573Srgrimes		EMIT(ORPAREN, subno);
38417141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3851573Srgrimes		break;
3861573Srgrimes#ifndef POSIX_MISTAKE
3871573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3881573Srgrimes		/*
3891573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3901573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3911573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3921573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3931573Srgrimes		 * we can get it fixed.
3941573Srgrimes		 */
3951573Srgrimes		SETERROR(REG_EPAREN);
3961573Srgrimes		break;
3971573Srgrimes#endif
3981573Srgrimes	case '^':
3991573Srgrimes		EMIT(OBOL, 0);
4001573Srgrimes		p->g->iflags |= USEBOL;
4011573Srgrimes		p->g->nbol++;
4021573Srgrimes		wascaret = 1;
4031573Srgrimes		break;
4041573Srgrimes	case '$':
4051573Srgrimes		EMIT(OEOL, 0);
4061573Srgrimes		p->g->iflags |= USEEOL;
4071573Srgrimes		p->g->neol++;
4081573Srgrimes		break;
4091573Srgrimes	case '|':
4101573Srgrimes		SETERROR(REG_EMPTY);
4111573Srgrimes		break;
4121573Srgrimes	case '*':
4131573Srgrimes	case '+':
4141573Srgrimes	case '?':
4151573Srgrimes		SETERROR(REG_BADRPT);
4161573Srgrimes		break;
4171573Srgrimes	case '.':
4181573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4191573Srgrimes			nonnewline(p);
4201573Srgrimes		else
4211573Srgrimes			EMIT(OANY, 0);
4221573Srgrimes		break;
4231573Srgrimes	case '[':
4241573Srgrimes		p_bracket(p);
4251573Srgrimes		break;
4261573Srgrimes	case '\\':
42717141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
4281573Srgrimes		c = GETNEXT();
4291573Srgrimes		ordinary(p, c);
4301573Srgrimes		break;
4311573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
43249094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4331573Srgrimes		/* FALLTHROUGH */
4341573Srgrimes	default:
4351573Srgrimes		ordinary(p, c);
4361573Srgrimes		break;
4371573Srgrimes	}
4381573Srgrimes
4391573Srgrimes	if (!MORE())
4401573Srgrimes		return;
4411573Srgrimes	c = PEEK();
4421573Srgrimes	/* we call { a repetition if followed by a digit */
4431573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
44449094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4451573Srgrimes		return;		/* no repetition, we're done */
4461573Srgrimes	NEXT();
4471573Srgrimes
44817141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4491573Srgrimes	switch (c) {
4501573Srgrimes	case '*':	/* implemented as +? */
4511573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4521573Srgrimes		INSERT(OPLUS_, pos);
4531573Srgrimes		ASTERN(O_PLUS, pos);
4541573Srgrimes		INSERT(OQUEST_, pos);
4551573Srgrimes		ASTERN(O_QUEST, pos);
4561573Srgrimes		break;
4571573Srgrimes	case '+':
4581573Srgrimes		INSERT(OPLUS_, pos);
4591573Srgrimes		ASTERN(O_PLUS, pos);
4601573Srgrimes		break;
4611573Srgrimes	case '?':
4621573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4631573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4641573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4651573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4661573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4671573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4681573Srgrimes		ASTERN(O_CH, THERETHERE());
4691573Srgrimes		break;
4701573Srgrimes	case '{':
4711573Srgrimes		count = p_count(p);
4721573Srgrimes		if (EAT(',')) {
47349094Sache			if (isdigit((uch)PEEK())) {
4741573Srgrimes				count2 = p_count(p);
47517141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4761573Srgrimes			} else		/* single number with comma */
4771573Srgrimes				count2 = INFINITY;
4781573Srgrimes		} else		/* just a single number */
4791573Srgrimes			count2 = count;
4801573Srgrimes		repeat(p, pos, count, count2);
4811573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4821573Srgrimes			while (MORE() && PEEK() != '}')
4831573Srgrimes				NEXT();
48417141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
4851573Srgrimes			SETERROR(REG_BADBR);
4861573Srgrimes		}
4871573Srgrimes		break;
4881573Srgrimes	}
4891573Srgrimes
4901573Srgrimes	if (!MORE())
4911573Srgrimes		return;
4921573Srgrimes	c = PEEK();
4931573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
49449094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
4951573Srgrimes		return;
4961573Srgrimes	SETERROR(REG_BADRPT);
4971573Srgrimes}
4981573Srgrimes
4991573Srgrimes/*
5001573Srgrimes - p_str - string (no metacharacters) "parser"
50192889Sobrien == static void p_str(struct parse *p);
5021573Srgrimes */
5031573Srgrimesstatic void
5041573Srgrimesp_str(p)
50592889Sobrienstruct parse *p;
5061573Srgrimes{
50717141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
5081573Srgrimes	while (MORE())
5091573Srgrimes		ordinary(p, GETNEXT());
5101573Srgrimes}
5111573Srgrimes
5121573Srgrimes/*
5131573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
51492889Sobrien == static void p_bre(struct parse *p, int end1, \
51592889Sobrien ==	int end2);
5161573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
5171573Srgrimes *
5181573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
5191573Srgrimes * taken as an ordinary character and then revised to be an anchor.  The
5201573Srgrimes * only undesirable side effect is that '$' gets included as a character
5211573Srgrimes * category in such cases.  This is fairly harmless; not worth fixing.
5221573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5231573Srgrimes */
5241573Srgrimesstatic void
5251573Srgrimesp_bre(p, end1, end2)
52692889Sobrienstruct parse *p;
52792889Sobrienint end1;			/* first terminating character */
52892889Sobrienint end2;			/* second terminating character */
5291573Srgrimes{
53092889Sobrien	sopno start = HERE();
53192889Sobrien	int first = 1;			/* first subexpression? */
53292889Sobrien	int wasdollar = 0;
5331573Srgrimes
5341573Srgrimes	if (EAT('^')) {
5351573Srgrimes		EMIT(OBOL, 0);
5361573Srgrimes		p->g->iflags |= USEBOL;
5371573Srgrimes		p->g->nbol++;
5381573Srgrimes	}
5391573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5401573Srgrimes		wasdollar = p_simp_re(p, first);
5411573Srgrimes		first = 0;
5421573Srgrimes	}
5431573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5441573Srgrimes		DROP(1);
5451573Srgrimes		EMIT(OEOL, 0);
5461573Srgrimes		p->g->iflags |= USEEOL;
5471573Srgrimes		p->g->neol++;
5481573Srgrimes	}
5491573Srgrimes
55017141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5511573Srgrimes}
5521573Srgrimes
5531573Srgrimes/*
5541573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
55592889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5561573Srgrimes */
5571573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
5581573Srgrimesp_simp_re(p, starordinary)
55992889Sobrienstruct parse *p;
5601573Srgrimesint starordinary;		/* is a leading * an ordinary character? */
5611573Srgrimes{
56292889Sobrien	int c;
56392889Sobrien	int count;
56492889Sobrien	int count2;
56592889Sobrien	sopno pos;
56692889Sobrien	int i;
56792889Sobrien	sopno subno;
5681573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5691573Srgrimes
5701573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5711573Srgrimes
5721573Srgrimes	assert(MORE());		/* caller should have ensured this */
5731573Srgrimes	c = GETNEXT();
5741573Srgrimes	if (c == '\\') {
57517141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
57649094Sache		c = BACKSL | GETNEXT();
5771573Srgrimes	}
5781573Srgrimes	switch (c) {
5791573Srgrimes	case '.':
5801573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5811573Srgrimes			nonnewline(p);
5821573Srgrimes		else
5831573Srgrimes			EMIT(OANY, 0);
5841573Srgrimes		break;
5851573Srgrimes	case '[':
5861573Srgrimes		p_bracket(p);
5871573Srgrimes		break;
5881573Srgrimes	case BACKSL|'{':
5891573Srgrimes		SETERROR(REG_BADRPT);
5901573Srgrimes		break;
5911573Srgrimes	case BACKSL|'(':
5921573Srgrimes		p->g->nsub++;
5931573Srgrimes		subno = p->g->nsub;
5941573Srgrimes		if (subno < NPAREN)
5951573Srgrimes			p->pbegin[subno] = HERE();
5961573Srgrimes		EMIT(OLPAREN, subno);
5971573Srgrimes		/* the MORE here is an error heuristic */
5981573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
5991573Srgrimes			p_bre(p, '\\', ')');
6001573Srgrimes		if (subno < NPAREN) {
6011573Srgrimes			p->pend[subno] = HERE();
6021573Srgrimes			assert(p->pend[subno] != 0);
6031573Srgrimes		}
6041573Srgrimes		EMIT(ORPAREN, subno);
60517141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
6061573Srgrimes		break;
6071573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
6081573Srgrimes	case BACKSL|'}':
6091573Srgrimes		SETERROR(REG_EPAREN);
6101573Srgrimes		break;
6111573Srgrimes	case BACKSL|'1':
6121573Srgrimes	case BACKSL|'2':
6131573Srgrimes	case BACKSL|'3':
6141573Srgrimes	case BACKSL|'4':
6151573Srgrimes	case BACKSL|'5':
6161573Srgrimes	case BACKSL|'6':
6171573Srgrimes	case BACKSL|'7':
6181573Srgrimes	case BACKSL|'8':
6191573Srgrimes	case BACKSL|'9':
6201573Srgrimes		i = (c&~BACKSL) - '0';
6211573Srgrimes		assert(i < NPAREN);
6221573Srgrimes		if (p->pend[i] != 0) {
6231573Srgrimes			assert(i <= p->g->nsub);
6241573Srgrimes			EMIT(OBACK_, i);
6251573Srgrimes			assert(p->pbegin[i] != 0);
6261573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6271573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6281573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6291573Srgrimes			EMIT(O_BACK, i);
6301573Srgrimes		} else
6311573Srgrimes			SETERROR(REG_ESUBREG);
6321573Srgrimes		p->g->backrefs = 1;
6331573Srgrimes		break;
6341573Srgrimes	case '*':
63517141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6361573Srgrimes		/* FALLTHROUGH */
6371573Srgrimes	default:
63849094Sache		ordinary(p, (char)c);
6391573Srgrimes		break;
6401573Srgrimes	}
6411573Srgrimes
6421573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6431573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6441573Srgrimes		INSERT(OPLUS_, pos);
6451573Srgrimes		ASTERN(O_PLUS, pos);
6461573Srgrimes		INSERT(OQUEST_, pos);
6471573Srgrimes		ASTERN(O_QUEST, pos);
6481573Srgrimes	} else if (EATTWO('\\', '{')) {
6491573Srgrimes		count = p_count(p);
6501573Srgrimes		if (EAT(',')) {
65149094Sache			if (MORE() && isdigit((uch)PEEK())) {
6521573Srgrimes				count2 = p_count(p);
65317141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6541573Srgrimes			} else		/* single number with comma */
6551573Srgrimes				count2 = INFINITY;
6561573Srgrimes		} else		/* just a single number */
6571573Srgrimes			count2 = count;
6581573Srgrimes		repeat(p, pos, count, count2);
6591573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6601573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6611573Srgrimes				NEXT();
66217141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6631573Srgrimes			SETERROR(REG_BADBR);
6641573Srgrimes		}
66549094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6661573Srgrimes		return(1);
6671573Srgrimes
6681573Srgrimes	return(0);
6691573Srgrimes}
6701573Srgrimes
6711573Srgrimes/*
6721573Srgrimes - p_count - parse a repetition count
67392889Sobrien == static int p_count(struct parse *p);
6741573Srgrimes */
6751573Srgrimesstatic int			/* the value */
6761573Srgrimesp_count(p)
67792889Sobrienstruct parse *p;
6781573Srgrimes{
67992889Sobrien	int count = 0;
68092889Sobrien	int ndigits = 0;
6811573Srgrimes
68249094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6831573Srgrimes		count = count*10 + (GETNEXT() - '0');
6841573Srgrimes		ndigits++;
6851573Srgrimes	}
6861573Srgrimes
68717141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6881573Srgrimes	return(count);
6891573Srgrimes}
6901573Srgrimes
6911573Srgrimes/*
6921573Srgrimes - p_bracket - parse a bracketed character list
69392889Sobrien == static void p_bracket(struct parse *p);
6941573Srgrimes *
6951573Srgrimes * Note a significant property of this code:  if the allocset() did SETERROR,
6961573Srgrimes * no set operations are done.
6971573Srgrimes */
6981573Srgrimesstatic void
6991573Srgrimesp_bracket(p)
70092889Sobrienstruct parse *p;
7011573Srgrimes{
70292889Sobrien	cset *cs = allocset(p);
70392889Sobrien	int invert = 0;
7041573Srgrimes
7051573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
7061573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
7071573Srgrimes		EMIT(OBOW, 0);
7081573Srgrimes		NEXTn(6);
7091573Srgrimes		return;
7101573Srgrimes	}
7111573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
7121573Srgrimes		EMIT(OEOW, 0);
7131573Srgrimes		NEXTn(6);
7141573Srgrimes		return;
7151573Srgrimes	}
7161573Srgrimes
7171573Srgrimes	if (EAT('^'))
7181573Srgrimes		invert++;	/* make note to invert set at end */
7191573Srgrimes	if (EAT(']'))
7201573Srgrimes		CHadd(cs, ']');
7211573Srgrimes	else if (EAT('-'))
7221573Srgrimes		CHadd(cs, '-');
7231573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7241573Srgrimes		p_b_term(p, cs);
7251573Srgrimes	if (EAT('-'))
7261573Srgrimes		CHadd(cs, '-');
72717141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7281573Srgrimes
7291573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7301573Srgrimes		return;
7311573Srgrimes
7321573Srgrimes	if (p->g->cflags&REG_ICASE) {
73392889Sobrien		int i;
73492889Sobrien		int ci;
7351573Srgrimes
7361573Srgrimes		for (i = p->g->csetsize - 1; i >= 0; i--)
7371573Srgrimes			if (CHIN(cs, i) && isalpha(i)) {
7381573Srgrimes				ci = othercase(i);
7391573Srgrimes				if (ci != i)
7401573Srgrimes					CHadd(cs, ci);
7411573Srgrimes			}
7421573Srgrimes		if (cs->multis != NULL)
7431573Srgrimes			mccase(p, cs);
7441573Srgrimes	}
7451573Srgrimes	if (invert) {
74692889Sobrien		int i;
7471573Srgrimes
7481573Srgrimes		for (i = p->g->csetsize - 1; i >= 0; i--)
7491573Srgrimes			if (CHIN(cs, i))
7501573Srgrimes				CHsub(cs, i);
7511573Srgrimes			else
7521573Srgrimes				CHadd(cs, i);
7531573Srgrimes		if (p->g->cflags&REG_NEWLINE)
7541573Srgrimes			CHsub(cs, '\n');
7551573Srgrimes		if (cs->multis != NULL)
7561573Srgrimes			mcinvert(p, cs);
7571573Srgrimes	}
7581573Srgrimes
7591573Srgrimes	assert(cs->multis == NULL);		/* xxx */
7601573Srgrimes
7611573Srgrimes	if (nch(p, cs) == 1) {		/* optimize singleton sets */
7621573Srgrimes		ordinary(p, firstch(p, cs));
7631573Srgrimes		freeset(p, cs);
7641573Srgrimes	} else
7651573Srgrimes		EMIT(OANYOF, freezeset(p, cs));
7661573Srgrimes}
7671573Srgrimes
7681573Srgrimes/*
7691573Srgrimes - p_b_term - parse one term of a bracketed character list
77092889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7711573Srgrimes */
7721573Srgrimesstatic void
7731573Srgrimesp_b_term(p, cs)
77492889Sobrienstruct parse *p;
77592889Sobriencset *cs;
7761573Srgrimes{
77792889Sobrien	char c;
77892889Sobrien	char start, finish;
77992889Sobrien	int i;
7801573Srgrimes
7811573Srgrimes	/* classify what we've got */
7821573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7831573Srgrimes	case '[':
7841573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7851573Srgrimes		break;
7861573Srgrimes	case '-':
7871573Srgrimes		SETERROR(REG_ERANGE);
7881573Srgrimes		return;			/* NOTE RETURN */
7891573Srgrimes		break;
7901573Srgrimes	default:
7911573Srgrimes		c = '\0';
7921573Srgrimes		break;
7931573Srgrimes	}
7941573Srgrimes
7951573Srgrimes	switch (c) {
7961573Srgrimes	case ':':		/* character class */
7971573Srgrimes		NEXT2();
79817141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7991573Srgrimes		c = PEEK();
80017141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
8011573Srgrimes		p_b_cclass(p, cs);
80217141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
80317141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
8041573Srgrimes		break;
8051573Srgrimes	case '=':		/* equivalence class */
8061573Srgrimes		NEXT2();
80717141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
8081573Srgrimes		c = PEEK();
80917141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
8101573Srgrimes		p_b_eclass(p, cs);
81117141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
81217141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
8131573Srgrimes		break;
8141573Srgrimes	default:		/* symbol, ordinary character, or range */
8151573Srgrimes/* xxx revision needed for multichar stuff */
8161573Srgrimes		start = p_b_symbol(p);
8171573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
8181573Srgrimes			/* range */
8191573Srgrimes			NEXT();
8201573Srgrimes			if (EAT('-'))
8211573Srgrimes				finish = '-';
8221573Srgrimes			else
8231573Srgrimes				finish = p_b_symbol(p);
8241573Srgrimes		} else
8251573Srgrimes			finish = start;
82617514Sache		if (start == finish)
82717514Sache			CHadd(cs, start);
82817514Sache		else {
82924637Sache			if (__collate_load_error) {
83024637Sache				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
83124637Sache				for (i = (uch)start; i <= (uch)finish; i++)
83217514Sache					CHadd(cs, i);
83324637Sache			} else {
83424637Sache				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
83524637Sache				for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
83624637Sache					if (   __collate_range_cmp(start, i) <= 0
83724637Sache					    && __collate_range_cmp(i, finish) <= 0
83824637Sache					   )
83924637Sache						CHadd(cs, i);
84024637Sache				}
84117514Sache			}
84217514Sache		}
8431573Srgrimes		break;
8441573Srgrimes	}
8451573Srgrimes}
8461573Srgrimes
8471573Srgrimes/*
8481573Srgrimes - p_b_cclass - parse a character-class name and deal with it
84992889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8501573Srgrimes */
8511573Srgrimesstatic void
8521573Srgrimesp_b_cclass(p, cs)
85392889Sobrienstruct parse *p;
85492889Sobriencset *cs;
8551573Srgrimes{
85692889Sobrien	int c;
85792889Sobrien	char *sp = p->next;
85892889Sobrien	struct cclass *cp;
85992889Sobrien	size_t len;
8601573Srgrimes
86117508Sache	while (MORE() && isalpha((uch)PEEK()))
8621573Srgrimes		NEXT();
8631573Srgrimes	len = p->next - sp;
8641573Srgrimes	for (cp = cclasses; cp->name != NULL; cp++)
8651573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
8661573Srgrimes			break;
8671573Srgrimes	if (cp->name == NULL) {
8681573Srgrimes		/* oops, didn't find it */
8691573Srgrimes		SETERROR(REG_ECTYPE);
8701573Srgrimes		return;
8711573Srgrimes	}
8721573Srgrimes
87317508Sache	switch (cp->fidx) {
87417508Sache	case CALNUM:
87517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
87617508Sache			if (isalnum((uch)c))
87717508Sache				CHadd(cs, c);
87817508Sache		break;
87917508Sache	case CALPHA:
88017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
88117508Sache			if (isalpha((uch)c))
88217508Sache				CHadd(cs, c);
88317508Sache		break;
88417508Sache	case CBLANK:
88517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
88617508Sache			if (isblank((uch)c))
88717508Sache				CHadd(cs, c);
88817508Sache		break;
88917508Sache	case CCNTRL:
89017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
89117508Sache			if (iscntrl((uch)c))
89217508Sache				CHadd(cs, c);
89317508Sache		break;
89417508Sache	case CDIGIT:
89517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
89617508Sache			if (isdigit((uch)c))
89717508Sache				CHadd(cs, c);
89817508Sache		break;
89917508Sache	case CGRAPH:
90017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
90117508Sache			if (isgraph((uch)c))
90217508Sache				CHadd(cs, c);
90317508Sache		break;
90417508Sache	case CLOWER:
90517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
90617508Sache			if (islower((uch)c))
90717508Sache				CHadd(cs, c);
90817508Sache		break;
90917508Sache	case CPRINT:
91017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
91117508Sache			if (isprint((uch)c))
91217508Sache				CHadd(cs, c);
91317508Sache		break;
91417508Sache	case CPUNCT:
91517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
91617508Sache			if (ispunct((uch)c))
91717508Sache				CHadd(cs, c);
91817508Sache		break;
91917508Sache	case CSPACE:
92017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
92117508Sache			if (isspace((uch)c))
92217508Sache				CHadd(cs, c);
92317508Sache		break;
92417508Sache	case CUPPER:
92517508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
92617508Sache			if (isupper((uch)c))
92717508Sache				CHadd(cs, c);
92817508Sache		break;
92917508Sache	case CXDIGIT:
93017508Sache		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
93117508Sache			if (isxdigit((uch)c))
93217508Sache				CHadd(cs, c);
93317508Sache		break;
93417508Sache	}
93517508Sache#if 0
9361573Srgrimes	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
9371573Srgrimes		MCadd(p, cs, u);
93817508Sache#endif
9391573Srgrimes}
9401573Srgrimes
9411573Srgrimes/*
9421573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
94392889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
9441573Srgrimes *
9451573Srgrimes * This implementation is incomplete. xxx
9461573Srgrimes */
9471573Srgrimesstatic void
9481573Srgrimesp_b_eclass(p, cs)
94992889Sobrienstruct parse *p;
95092889Sobriencset *cs;
9511573Srgrimes{
95292889Sobrien	char c;
9531573Srgrimes
9541573Srgrimes	c = p_b_coll_elem(p, '=');
9551573Srgrimes	CHadd(cs, c);
9561573Srgrimes}
9571573Srgrimes
9581573Srgrimes/*
9591573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
96092889Sobrien == static char p_b_symbol(struct parse *p);
9611573Srgrimes */
9621573Srgrimesstatic char			/* value of symbol */
9631573Srgrimesp_b_symbol(p)
96492889Sobrienstruct parse *p;
9651573Srgrimes{
96692889Sobrien	char value;
9671573Srgrimes
96817141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
9691573Srgrimes	if (!EATTWO('[', '.'))
9701573Srgrimes		return(GETNEXT());
9711573Srgrimes
9721573Srgrimes	/* collating symbol */
9731573Srgrimes	value = p_b_coll_elem(p, '.');
97417141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
9751573Srgrimes	return(value);
9761573Srgrimes}
9771573Srgrimes
9781573Srgrimes/*
9791573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
98092889Sobrien == static char p_b_coll_elem(struct parse *p, int endc);
9811573Srgrimes */
9821573Srgrimesstatic char			/* value of collating element */
9831573Srgrimesp_b_coll_elem(p, endc)
98492889Sobrienstruct parse *p;
9851573Srgrimesint endc;			/* name ended by endc,']' */
9861573Srgrimes{
98792889Sobrien	char *sp = p->next;
98892889Sobrien	struct cname *cp;
98992889Sobrien	int len;
9901573Srgrimes
9911573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
9921573Srgrimes		NEXT();
9931573Srgrimes	if (!MORE()) {
9941573Srgrimes		SETERROR(REG_EBRACK);
9951573Srgrimes		return(0);
9961573Srgrimes	}
9971573Srgrimes	len = p->next - sp;
9981573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
9991573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
10001573Srgrimes			return(cp->code);	/* known name */
10011573Srgrimes	if (len == 1)
10021573Srgrimes		return(*sp);	/* single character */
10031573Srgrimes	SETERROR(REG_ECOLLATE);			/* neither */
10041573Srgrimes	return(0);
10051573Srgrimes}
10061573Srgrimes
10071573Srgrimes/*
10081573Srgrimes - othercase - return the case counterpart of an alphabetic
10091573Srgrimes == static char othercase(int ch);
10101573Srgrimes */
10111573Srgrimesstatic char			/* if no counterpart, return ch */
10121573Srgrimesothercase(ch)
10131573Srgrimesint ch;
10141573Srgrimes{
101549094Sache	ch = (uch)ch;
10161573Srgrimes	assert(isalpha(ch));
10171573Srgrimes	if (isupper(ch))
10181573Srgrimes		return(tolower(ch));
10191573Srgrimes	else if (islower(ch))
10201573Srgrimes		return(toupper(ch));
10211573Srgrimes	else			/* peculiar, but could happen */
10221573Srgrimes		return(ch);
10231573Srgrimes}
10241573Srgrimes
10251573Srgrimes/*
10261573Srgrimes - bothcases - emit a dualcase version of a two-case character
102792889Sobrien == static void bothcases(struct parse *p, int ch);
10281573Srgrimes *
10291573Srgrimes * Boy, is this implementation ever a kludge...
10301573Srgrimes */
10311573Srgrimesstatic void
10321573Srgrimesbothcases(p, ch)
103392889Sobrienstruct parse *p;
10341573Srgrimesint ch;
10351573Srgrimes{
103692889Sobrien	char *oldnext = p->next;
103792889Sobrien	char *oldend = p->end;
10381573Srgrimes	char bracket[3];
10391573Srgrimes
104049094Sache	ch = (uch)ch;
10411573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
10421573Srgrimes	p->next = bracket;
10431573Srgrimes	p->end = bracket+2;
10441573Srgrimes	bracket[0] = ch;
10451573Srgrimes	bracket[1] = ']';
10461573Srgrimes	bracket[2] = '\0';
10471573Srgrimes	p_bracket(p);
10481573Srgrimes	assert(p->next == bracket+2);
10491573Srgrimes	p->next = oldnext;
10501573Srgrimes	p->end = oldend;
10511573Srgrimes}
10521573Srgrimes
10531573Srgrimes/*
10541573Srgrimes - ordinary - emit an ordinary character
105592889Sobrien == static void ordinary(struct parse *p, int ch);
10561573Srgrimes */
10571573Srgrimesstatic void
10581573Srgrimesordinary(p, ch)
105992889Sobrienstruct parse *p;
106092889Sobrienint ch;
10611573Srgrimes{
106292889Sobrien	cat_t *cap = p->g->categories;
10631573Srgrimes
106449094Sache	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
10651573Srgrimes		bothcases(p, ch);
10661573Srgrimes	else {
106749094Sache		EMIT(OCHAR, (uch)ch);
10681573Srgrimes		if (cap[ch] == 0)
10691573Srgrimes			cap[ch] = p->g->ncategories++;
10701573Srgrimes	}
10711573Srgrimes}
10721573Srgrimes
10731573Srgrimes/*
10741573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
107592889Sobrien == static void nonnewline(struct parse *p);
10761573Srgrimes *
10771573Srgrimes * Boy, is this implementation ever a kludge...
10781573Srgrimes */
10791573Srgrimesstatic void
10801573Srgrimesnonnewline(p)
108192889Sobrienstruct parse *p;
10821573Srgrimes{
108392889Sobrien	char *oldnext = p->next;
108492889Sobrien	char *oldend = p->end;
10851573Srgrimes	char bracket[4];
10861573Srgrimes
10871573Srgrimes	p->next = bracket;
10881573Srgrimes	p->end = bracket+3;
10891573Srgrimes	bracket[0] = '^';
10901573Srgrimes	bracket[1] = '\n';
10911573Srgrimes	bracket[2] = ']';
10921573Srgrimes	bracket[3] = '\0';
10931573Srgrimes	p_bracket(p);
10941573Srgrimes	assert(p->next == bracket+3);
10951573Srgrimes	p->next = oldnext;
10961573Srgrimes	p->end = oldend;
10971573Srgrimes}
10981573Srgrimes
10991573Srgrimes/*
11001573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
110192889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
11021573Srgrimes */
11031573Srgrimesstatic void
11041573Srgrimesrepeat(p, start, from, to)
110592889Sobrienstruct parse *p;
11061573Srgrimessopno start;			/* operand from here to end of strip */
11071573Srgrimesint from;			/* repeated from this number */
11081573Srgrimesint to;				/* to this number of times (maybe INFINITY) */
11091573Srgrimes{
111092889Sobrien	sopno finish = HERE();
11111573Srgrimes#	define	N	2
11121573Srgrimes#	define	INF	3
11131573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
11141573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
111592889Sobrien	sopno copy;
11161573Srgrimes
11171573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
11181573Srgrimes		return;
11191573Srgrimes
11201573Srgrimes	assert(from <= to);
11211573Srgrimes
11221573Srgrimes	switch (REP(MAP(from), MAP(to))) {
11231573Srgrimes	case REP(0, 0):			/* must be user doing this */
11241573Srgrimes		DROP(finish-start);	/* drop the operand */
11251573Srgrimes		break;
11261573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
11271573Srgrimes	case REP(0, N):			/* as x{1,n}? */
11281573Srgrimes	case REP(0, INF):		/* as x{1,}? */
11291573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
11301573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
11311573Srgrimes		repeat(p, start+1, 1, to);
11321573Srgrimes		ASTERN(OOR1, start);
11331573Srgrimes		AHEAD(start);			/* ... fix it */
11341573Srgrimes		EMIT(OOR2, 0);
11351573Srgrimes		AHEAD(THERE());
11361573Srgrimes		ASTERN(O_CH, THERETHERE());
11371573Srgrimes		break;
11381573Srgrimes	case REP(1, 1):			/* trivial case */
11391573Srgrimes		/* done */
11401573Srgrimes		break;
11411573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
11421573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
11431573Srgrimes		INSERT(OCH_, start);
11441573Srgrimes		ASTERN(OOR1, start);
11451573Srgrimes		AHEAD(start);
11461573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
11471573Srgrimes		AHEAD(THERE());			/* ...so fix it */
11481573Srgrimes		ASTERN(O_CH, THERETHERE());
11491573Srgrimes		copy = dupl(p, start+1, finish+1);
11501573Srgrimes		assert(copy == finish+4);
11511573Srgrimes		repeat(p, copy, 1, to-1);
11521573Srgrimes		break;
11531573Srgrimes	case REP(1, INF):		/* as x+ */
11541573Srgrimes		INSERT(OPLUS_, start);
11551573Srgrimes		ASTERN(O_PLUS, start);
11561573Srgrimes		break;
11571573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
11581573Srgrimes		copy = dupl(p, start, finish);
11591573Srgrimes		repeat(p, copy, from-1, to-1);
11601573Srgrimes		break;
11611573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
11621573Srgrimes		copy = dupl(p, start, finish);
11631573Srgrimes		repeat(p, copy, from-1, to);
11641573Srgrimes		break;
11651573Srgrimes	default:			/* "can't happen" */
11661573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
11671573Srgrimes		break;
11681573Srgrimes	}
11691573Srgrimes}
11701573Srgrimes
11711573Srgrimes/*
11721573Srgrimes - seterr - set an error condition
117392889Sobrien == static int seterr(struct parse *p, int e);
11741573Srgrimes */
11751573Srgrimesstatic int			/* useless but makes type checking happy */
11761573Srgrimesseterr(p, e)
117792889Sobrienstruct parse *p;
11781573Srgrimesint e;
11791573Srgrimes{
11801573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
11811573Srgrimes		p->error = e;
11821573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
11831573Srgrimes	p->end = nuls;
11841573Srgrimes	return(0);		/* make the return value well-defined */
11851573Srgrimes}
11861573Srgrimes
11871573Srgrimes/*
11881573Srgrimes - allocset - allocate a set of characters for []
118992889Sobrien == static cset *allocset(struct parse *p);
11901573Srgrimes */
11911573Srgrimesstatic cset *
11921573Srgrimesallocset(p)
119392889Sobrienstruct parse *p;
11941573Srgrimes{
119592889Sobrien	int no = p->g->ncsets++;
119692889Sobrien	size_t nc;
119792889Sobrien	size_t nbytes;
119892889Sobrien	cset *cs;
119992889Sobrien	size_t css = (size_t)p->g->csetsize;
120092889Sobrien	int i;
12011573Srgrimes
12021573Srgrimes	if (no >= p->ncsalloc) {	/* need another column of space */
12031573Srgrimes		p->ncsalloc += CHAR_BIT;
12041573Srgrimes		nc = p->ncsalloc;
12051573Srgrimes		assert(nc % CHAR_BIT == 0);
12061573Srgrimes		nbytes = nc / CHAR_BIT * css;
12071573Srgrimes		if (p->g->sets == NULL)
12081573Srgrimes			p->g->sets = (cset *)malloc(nc * sizeof(cset));
12091573Srgrimes		else
121039327Simp			p->g->sets = (cset *)reallocf((char *)p->g->sets,
12111573Srgrimes							nc * sizeof(cset));
12121573Srgrimes		if (p->g->setbits == NULL)
12131573Srgrimes			p->g->setbits = (uch *)malloc(nbytes);
12141573Srgrimes		else {
121539327Simp			p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
12161573Srgrimes								nbytes);
12171573Srgrimes			/* xxx this isn't right if setbits is now NULL */
12181573Srgrimes			for (i = 0; i < no; i++)
12191573Srgrimes				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
12201573Srgrimes		}
12211573Srgrimes		if (p->g->sets != NULL && p->g->setbits != NULL)
12221573Srgrimes			(void) memset((char *)p->g->setbits + (nbytes - css),
12231573Srgrimes								0, css);
12241573Srgrimes		else {
12251573Srgrimes			no = 0;
12261573Srgrimes			SETERROR(REG_ESPACE);
12271573Srgrimes			/* caller's responsibility not to do set ops */
12281573Srgrimes		}
12291573Srgrimes	}
12301573Srgrimes
12311573Srgrimes	assert(p->g->sets != NULL);	/* xxx */
12321573Srgrimes	cs = &p->g->sets[no];
12331573Srgrimes	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
12341573Srgrimes	cs->mask = 1 << ((no) % CHAR_BIT);
12351573Srgrimes	cs->hash = 0;
12361573Srgrimes	cs->smultis = 0;
12371573Srgrimes	cs->multis = NULL;
12381573Srgrimes
12391573Srgrimes	return(cs);
12401573Srgrimes}
12411573Srgrimes
12421573Srgrimes/*
12431573Srgrimes - freeset - free a now-unused set
124492889Sobrien == static void freeset(struct parse *p, cset *cs);
12451573Srgrimes */
12461573Srgrimesstatic void
12471573Srgrimesfreeset(p, cs)
124892889Sobrienstruct parse *p;
124992889Sobriencset *cs;
12501573Srgrimes{
125192889Sobrien	int i;
125292889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
125392889Sobrien	size_t css = (size_t)p->g->csetsize;
12541573Srgrimes
12551573Srgrimes	for (i = 0; i < css; i++)
12561573Srgrimes		CHsub(cs, i);
12571573Srgrimes	if (cs == top-1)	/* recover only the easy case */
12581573Srgrimes		p->g->ncsets--;
12591573Srgrimes}
12601573Srgrimes
12611573Srgrimes/*
12621573Srgrimes - freezeset - final processing on a set of characters
126392889Sobrien == static int freezeset(struct parse *p, cset *cs);
12641573Srgrimes *
12651573Srgrimes * The main task here is merging identical sets.  This is usually a waste
12661573Srgrimes * of time (although the hash code minimizes the overhead), but can win
12671573Srgrimes * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
12681573Srgrimes * is done using addition rather than xor -- all ASCII [aA] sets xor to
12691573Srgrimes * the same value!
12701573Srgrimes */
12711573Srgrimesstatic int			/* set number */
12721573Srgrimesfreezeset(p, cs)
127392889Sobrienstruct parse *p;
127492889Sobriencset *cs;
12751573Srgrimes{
127692889Sobrien	short h = cs->hash;
127792889Sobrien	int i;
127892889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
127992889Sobrien	cset *cs2;
128092889Sobrien	size_t css = (size_t)p->g->csetsize;
12811573Srgrimes
12821573Srgrimes	/* look for an earlier one which is the same */
12831573Srgrimes	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
12841573Srgrimes		if (cs2->hash == h && cs2 != cs) {
12851573Srgrimes			/* maybe */
12861573Srgrimes			for (i = 0; i < css; i++)
12871573Srgrimes				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
12881573Srgrimes					break;		/* no */
12891573Srgrimes			if (i == css)
12901573Srgrimes				break;			/* yes */
12911573Srgrimes		}
12921573Srgrimes
12931573Srgrimes	if (cs2 < top) {	/* found one */
12941573Srgrimes		freeset(p, cs);
12951573Srgrimes		cs = cs2;
12961573Srgrimes	}
12971573Srgrimes
12981573Srgrimes	return((int)(cs - p->g->sets));
12991573Srgrimes}
13001573Srgrimes
13011573Srgrimes/*
13021573Srgrimes - firstch - return first character in a set (which must have at least one)
130392889Sobrien == static int firstch(struct parse *p, cset *cs);
13041573Srgrimes */
13051573Srgrimesstatic int			/* character; there is no "none" value */
13061573Srgrimesfirstch(p, cs)
130792889Sobrienstruct parse *p;
130892889Sobriencset *cs;
13091573Srgrimes{
131092889Sobrien	int i;
131192889Sobrien	size_t css = (size_t)p->g->csetsize;
13121573Srgrimes
13131573Srgrimes	for (i = 0; i < css; i++)
13141573Srgrimes		if (CHIN(cs, i))
131549094Sache			return((char)i);
13161573Srgrimes	assert(never);
13171573Srgrimes	return(0);		/* arbitrary */
13181573Srgrimes}
13191573Srgrimes
13201573Srgrimes/*
13211573Srgrimes - nch - number of characters in a set
132292889Sobrien == static int nch(struct parse *p, cset *cs);
13231573Srgrimes */
13241573Srgrimesstatic int
13251573Srgrimesnch(p, cs)
132692889Sobrienstruct parse *p;
132792889Sobriencset *cs;
13281573Srgrimes{
132992889Sobrien	int i;
133092889Sobrien	size_t css = (size_t)p->g->csetsize;
133192889Sobrien	int n = 0;
13321573Srgrimes
13331573Srgrimes	for (i = 0; i < css; i++)
13341573Srgrimes		if (CHIN(cs, i))
13351573Srgrimes			n++;
13361573Srgrimes	return(n);
13371573Srgrimes}
13381573Srgrimes
13391573Srgrimes/*
13401573Srgrimes - mcadd - add a collating element to a cset
134192889Sobrien == static void mcadd(struct parse *p, cset *cs, \
134292889Sobrien ==	char *cp);
13431573Srgrimes */
13441573Srgrimesstatic void
13451573Srgrimesmcadd(p, cs, cp)
134692889Sobrienstruct parse *p;
134792889Sobriencset *cs;
134892889Sobrienchar *cp;
13491573Srgrimes{
135092889Sobrien	size_t oldend = cs->smultis;
13511573Srgrimes
13521573Srgrimes	cs->smultis += strlen(cp) + 1;
13531573Srgrimes	if (cs->multis == NULL)
13541573Srgrimes		cs->multis = malloc(cs->smultis);
13551573Srgrimes	else
135639327Simp		cs->multis = reallocf(cs->multis, cs->smultis);
13571573Srgrimes	if (cs->multis == NULL) {
13581573Srgrimes		SETERROR(REG_ESPACE);
13591573Srgrimes		return;
13601573Srgrimes	}
13611573Srgrimes
13621573Srgrimes	(void) strcpy(cs->multis + oldend - 1, cp);
13631573Srgrimes	cs->multis[cs->smultis - 1] = '\0';
13641573Srgrimes}
13651573Srgrimes
136617141Sjkh#if used
13671573Srgrimes/*
13681573Srgrimes - mcsub - subtract a collating element from a cset
136992889Sobrien == static void mcsub(cset *cs, char *cp);
13701573Srgrimes */
13711573Srgrimesstatic void
13721573Srgrimesmcsub(cs, cp)
137392889Sobriencset *cs;
137492889Sobrienchar *cp;
13751573Srgrimes{
137692889Sobrien	char *fp = mcfind(cs, cp);
137792889Sobrien	size_t len = strlen(fp);
13781573Srgrimes
13791573Srgrimes	assert(fp != NULL);
13801573Srgrimes	(void) memmove(fp, fp + len + 1,
13811573Srgrimes				cs->smultis - (fp + len + 1 - cs->multis));
13821573Srgrimes	cs->smultis -= len;
13831573Srgrimes
13841573Srgrimes	if (cs->smultis == 0) {
13851573Srgrimes		free(cs->multis);
13861573Srgrimes		cs->multis = NULL;
13871573Srgrimes		return;
13881573Srgrimes	}
13891573Srgrimes
139039327Simp	cs->multis = reallocf(cs->multis, cs->smultis);
13911573Srgrimes	assert(cs->multis != NULL);
13921573Srgrimes}
13931573Srgrimes
13941573Srgrimes/*
13951573Srgrimes - mcin - is a collating element in a cset?
139692889Sobrien == static int mcin(cset *cs, char *cp);
13971573Srgrimes */
13981573Srgrimesstatic int
13991573Srgrimesmcin(cs, cp)
140092889Sobriencset *cs;
140192889Sobrienchar *cp;
14021573Srgrimes{
14031573Srgrimes	return(mcfind(cs, cp) != NULL);
14041573Srgrimes}
14051573Srgrimes
14061573Srgrimes/*
14071573Srgrimes - mcfind - find a collating element in a cset
140892889Sobrien == static char *mcfind(cset *cs, char *cp);
14091573Srgrimes */
14101573Srgrimesstatic char *
14111573Srgrimesmcfind(cs, cp)
141292889Sobriencset *cs;
141392889Sobrienchar *cp;
14141573Srgrimes{
141592889Sobrien	char *p;
14161573Srgrimes
14171573Srgrimes	if (cs->multis == NULL)
14181573Srgrimes		return(NULL);
14191573Srgrimes	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
14201573Srgrimes		if (strcmp(cp, p) == 0)
14211573Srgrimes			return(p);
14221573Srgrimes	return(NULL);
14231573Srgrimes}
142417141Sjkh#endif
14251573Srgrimes
14261573Srgrimes/*
14271573Srgrimes - mcinvert - invert the list of collating elements in a cset
142892889Sobrien == static void mcinvert(struct parse *p, cset *cs);
14291573Srgrimes *
14301573Srgrimes * This would have to know the set of possibilities.  Implementation
14311573Srgrimes * is deferred.
14321573Srgrimes */
14331573Srgrimesstatic void
14341573Srgrimesmcinvert(p, cs)
143592889Sobrienstruct parse *p;
143692889Sobriencset *cs;
14371573Srgrimes{
14381573Srgrimes	assert(cs->multis == NULL);	/* xxx */
14391573Srgrimes}
14401573Srgrimes
14411573Srgrimes/*
14421573Srgrimes - mccase - add case counterparts of the list of collating elements in a cset
144392889Sobrien == static void mccase(struct parse *p, cset *cs);
14441573Srgrimes *
14451573Srgrimes * This would have to know the set of possibilities.  Implementation
14461573Srgrimes * is deferred.
14471573Srgrimes */
14481573Srgrimesstatic void
14491573Srgrimesmccase(p, cs)
145092889Sobrienstruct parse *p;
145192889Sobriencset *cs;
14521573Srgrimes{
14531573Srgrimes	assert(cs->multis == NULL);	/* xxx */
14541573Srgrimes}
14551573Srgrimes
14561573Srgrimes/*
14571573Srgrimes - isinsets - is this character in any sets?
145892889Sobrien == static int isinsets(struct re_guts *g, int c);
14591573Srgrimes */
14601573Srgrimesstatic int			/* predicate */
14611573Srgrimesisinsets(g, c)
146292889Sobrienstruct re_guts *g;
14631573Srgrimesint c;
14641573Srgrimes{
146592889Sobrien	uch *col;
146692889Sobrien	int i;
146792889Sobrien	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
146892889Sobrien	unsigned uc = (uch)c;
14691573Srgrimes
14701573Srgrimes	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
14711573Srgrimes		if (col[uc] != 0)
14721573Srgrimes			return(1);
14731573Srgrimes	return(0);
14741573Srgrimes}
14751573Srgrimes
14761573Srgrimes/*
14771573Srgrimes - samesets - are these two characters in exactly the same sets?
147892889Sobrien == static int samesets(struct re_guts *g, int c1, int c2);
14791573Srgrimes */
14801573Srgrimesstatic int			/* predicate */
14811573Srgrimessamesets(g, c1, c2)
148292889Sobrienstruct re_guts *g;
14831573Srgrimesint c1;
14841573Srgrimesint c2;
14851573Srgrimes{
148692889Sobrien	uch *col;
148792889Sobrien	int i;
148892889Sobrien	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
148992889Sobrien	unsigned uc1 = (uch)c1;
149092889Sobrien	unsigned uc2 = (uch)c2;
14911573Srgrimes
14921573Srgrimes	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
14931573Srgrimes		if (col[uc1] != col[uc2])
14941573Srgrimes			return(0);
14951573Srgrimes	return(1);
14961573Srgrimes}
14971573Srgrimes
14981573Srgrimes/*
14991573Srgrimes - categorize - sort out character categories
150092889Sobrien == static void categorize(struct parse *p, struct re_guts *g);
15011573Srgrimes */
15021573Srgrimesstatic void
15031573Srgrimescategorize(p, g)
15041573Srgrimesstruct parse *p;
150592889Sobrienstruct re_guts *g;
15061573Srgrimes{
150792889Sobrien	cat_t *cats = g->categories;
150892889Sobrien	int c;
150992889Sobrien	int c2;
151092889Sobrien	cat_t cat;
15111573Srgrimes
15121573Srgrimes	/* avoid making error situations worse */
15131573Srgrimes	if (p->error != 0)
15141573Srgrimes		return;
15151573Srgrimes
15161573Srgrimes	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
15171573Srgrimes		if (cats[c] == 0 && isinsets(g, c)) {
15181573Srgrimes			cat = g->ncategories++;
15191573Srgrimes			cats[c] = cat;
15201573Srgrimes			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
15211573Srgrimes				if (cats[c2] == 0 && samesets(g, c, c2))
15221573Srgrimes					cats[c2] = cat;
15231573Srgrimes		}
15241573Srgrimes}
15251573Srgrimes
15261573Srgrimes/*
15271573Srgrimes - dupl - emit a duplicate of a bunch of sops
152892889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
15291573Srgrimes */
15301573Srgrimesstatic sopno			/* start of duplicate */
15311573Srgrimesdupl(p, start, finish)
153292889Sobrienstruct parse *p;
15331573Srgrimessopno start;			/* from here */
15341573Srgrimessopno finish;			/* to this less one */
15351573Srgrimes{
153692889Sobrien	sopno ret = HERE();
153792889Sobrien	sopno len = finish - start;
15381573Srgrimes
15391573Srgrimes	assert(finish >= start);
15401573Srgrimes	if (len == 0)
15411573Srgrimes		return(ret);
15421573Srgrimes	enlarge(p, p->ssize + len);	/* this many unexpected additions */
15431573Srgrimes	assert(p->ssize >= p->slen + len);
15441573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
15451573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
15461573Srgrimes	p->slen += len;
15471573Srgrimes	return(ret);
15481573Srgrimes}
15491573Srgrimes
15501573Srgrimes/*
15511573Srgrimes - doemit - emit a strip operator
155292889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
15531573Srgrimes *
15541573Srgrimes * It might seem better to implement this as a macro with a function as
15551573Srgrimes * hard-case backup, but it's just too big and messy unless there are
15561573Srgrimes * some changes to the data structures.  Maybe later.
15571573Srgrimes */
15581573Srgrimesstatic void
15591573Srgrimesdoemit(p, op, opnd)
156092889Sobrienstruct parse *p;
15611573Srgrimessop op;
15621573Srgrimessize_t opnd;
15631573Srgrimes{
15641573Srgrimes	/* avoid making error situations worse */
15651573Srgrimes	if (p->error != 0)
15661573Srgrimes		return;
15671573Srgrimes
15681573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
15691573Srgrimes	assert(opnd < 1<<OPSHIFT);
15701573Srgrimes
15711573Srgrimes	/* deal with undersized strip */
15721573Srgrimes	if (p->slen >= p->ssize)
15731573Srgrimes		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
15741573Srgrimes	assert(p->slen < p->ssize);
15751573Srgrimes
15761573Srgrimes	/* finally, it's all reduced to the easy case */
15771573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
15781573Srgrimes}
15791573Srgrimes
15801573Srgrimes/*
15811573Srgrimes - doinsert - insert a sop into the strip
158292889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
15831573Srgrimes */
15841573Srgrimesstatic void
15851573Srgrimesdoinsert(p, op, opnd, pos)
158692889Sobrienstruct parse *p;
15871573Srgrimessop op;
15881573Srgrimessize_t opnd;
15891573Srgrimessopno pos;
15901573Srgrimes{
159192889Sobrien	sopno sn;
159292889Sobrien	sop s;
159392889Sobrien	int i;
15941573Srgrimes
15951573Srgrimes	/* avoid making error situations worse */
15961573Srgrimes	if (p->error != 0)
15971573Srgrimes		return;
15981573Srgrimes
15991573Srgrimes	sn = HERE();
16001573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
16011573Srgrimes	assert(HERE() == sn+1);
16021573Srgrimes	s = p->strip[sn];
16031573Srgrimes
16041573Srgrimes	/* adjust paren pointers */
16051573Srgrimes	assert(pos > 0);
16061573Srgrimes	for (i = 1; i < NPAREN; i++) {
16071573Srgrimes		if (p->pbegin[i] >= pos) {
16081573Srgrimes			p->pbegin[i]++;
16091573Srgrimes		}
16101573Srgrimes		if (p->pend[i] >= pos) {
16111573Srgrimes			p->pend[i]++;
16121573Srgrimes		}
16131573Srgrimes	}
16141573Srgrimes
16151573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
16161573Srgrimes						(HERE()-pos-1)*sizeof(sop));
16171573Srgrimes	p->strip[pos] = s;
16181573Srgrimes}
16191573Srgrimes
16201573Srgrimes/*
16211573Srgrimes - dofwd - complete a forward reference
162292889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
16231573Srgrimes */
16241573Srgrimesstatic void
16251573Srgrimesdofwd(p, pos, value)
162692889Sobrienstruct parse *p;
162792889Sobriensopno pos;
16281573Srgrimessop value;
16291573Srgrimes{
16301573Srgrimes	/* avoid making error situations worse */
16311573Srgrimes	if (p->error != 0)
16321573Srgrimes		return;
16331573Srgrimes
16341573Srgrimes	assert(value < 1<<OPSHIFT);
16351573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
16361573Srgrimes}
16371573Srgrimes
16381573Srgrimes/*
16391573Srgrimes - enlarge - enlarge the strip
164092889Sobrien == static void enlarge(struct parse *p, sopno size);
16411573Srgrimes */
16421573Srgrimesstatic void
16431573Srgrimesenlarge(p, size)
164492889Sobrienstruct parse *p;
164592889Sobriensopno size;
16461573Srgrimes{
164792889Sobrien	sop *sp;
16481573Srgrimes
16491573Srgrimes	if (p->ssize >= size)
16501573Srgrimes		return;
16511573Srgrimes
16521573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
16531573Srgrimes	if (sp == NULL) {
16541573Srgrimes		SETERROR(REG_ESPACE);
16551573Srgrimes		return;
16561573Srgrimes	}
16571573Srgrimes	p->strip = sp;
16581573Srgrimes	p->ssize = size;
16591573Srgrimes}
16601573Srgrimes
16611573Srgrimes/*
16621573Srgrimes - stripsnug - compact the strip
166392889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
16641573Srgrimes */
16651573Srgrimesstatic void
16661573Srgrimesstripsnug(p, g)
166792889Sobrienstruct parse *p;
166892889Sobrienstruct re_guts *g;
16691573Srgrimes{
16701573Srgrimes	g->nstates = p->slen;
16711573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
16721573Srgrimes	if (g->strip == NULL) {
16731573Srgrimes		SETERROR(REG_ESPACE);
16741573Srgrimes		g->strip = p->strip;
16751573Srgrimes	}
16761573Srgrimes}
16771573Srgrimes
16781573Srgrimes/*
16791573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
168092889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
16811573Srgrimes *
16821573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
16831573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
16841573Srgrimes * of the interesting cases.
16851573Srgrimes *
16861573Srgrimes * Note that must and mlen got initialized during setup.
16871573Srgrimes */
16881573Srgrimesstatic void
16891573Srgrimesfindmust(p, g)
16901573Srgrimesstruct parse *p;
169192889Sobrienstruct re_guts *g;
16921573Srgrimes{
169392889Sobrien	sop *scan;
16941573Srgrimes	sop *start;
169592889Sobrien	sop *newstart;
169692889Sobrien	sopno newlen;
169792889Sobrien	sop s;
169892889Sobrien	char *cp;
169992889Sobrien	sopno i;
170062391Sdcs	int offset;
170162391Sdcs	int cs, mccs;
17021573Srgrimes
17031573Srgrimes	/* avoid making error situations worse */
17041573Srgrimes	if (p->error != 0)
17051573Srgrimes		return;
17061573Srgrimes
170762391Sdcs	/* Find out if we can handle OANYOF or not */
170862391Sdcs	mccs = 0;
170962391Sdcs	for (cs = 0; cs < g->ncsets; cs++)
171062391Sdcs		if (g->sets[cs].multis != NULL)
171162391Sdcs			mccs = 1;
171262391Sdcs
17131573Srgrimes	/* find the longest OCHAR sequence in strip */
17141573Srgrimes	newlen = 0;
171562391Sdcs	offset = 0;
171662391Sdcs	g->moffset = 0;
17171573Srgrimes	scan = g->strip + 1;
17181573Srgrimes	do {
17191573Srgrimes		s = *scan++;
17201573Srgrimes		switch (OP(s)) {
17211573Srgrimes		case OCHAR:		/* sequence member */
17221573Srgrimes			if (newlen == 0)		/* new sequence */
17231573Srgrimes				newstart = scan - 1;
17241573Srgrimes			newlen++;
17251573Srgrimes			break;
17261573Srgrimes		case OPLUS_:		/* things that don't break one */
17271573Srgrimes		case OLPAREN:
17281573Srgrimes		case ORPAREN:
17291573Srgrimes			break;
17301573Srgrimes		case OQUEST_:		/* things that must be skipped */
17311573Srgrimes		case OCH_:
173262391Sdcs			offset = altoffset(scan, offset, mccs);
17331573Srgrimes			scan--;
17341573Srgrimes			do {
17351573Srgrimes				scan += OPND(s);
17361573Srgrimes				s = *scan;
17371573Srgrimes				/* assert() interferes w debug printouts */
17381573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
17391573Srgrimes							OP(s) != OOR2) {
17401573Srgrimes					g->iflags |= BAD;
17411573Srgrimes					return;
17421573Srgrimes				}
17431573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
17441573Srgrimes			/* fallthrough */
174562391Sdcs		case OBOW:		/* things that break a sequence */
174662391Sdcs		case OEOW:
174762391Sdcs		case OBOL:
174862391Sdcs		case OEOL:
174962391Sdcs		case O_QUEST:
175062391Sdcs		case O_CH:
175162391Sdcs		case OEND:
17521573Srgrimes			if (newlen > g->mlen) {		/* ends one */
17531573Srgrimes				start = newstart;
17541573Srgrimes				g->mlen = newlen;
175562391Sdcs				if (offset > -1) {
175662391Sdcs					g->moffset += offset;
175762391Sdcs					offset = newlen;
175862391Sdcs				} else
175962391Sdcs					g->moffset = offset;
176062391Sdcs			} else {
176162391Sdcs				if (offset > -1)
176262391Sdcs					offset += newlen;
17631573Srgrimes			}
17641573Srgrimes			newlen = 0;
17651573Srgrimes			break;
176662391Sdcs		case OANY:
176762391Sdcs			if (newlen > g->mlen) {		/* ends one */
176862391Sdcs				start = newstart;
176962391Sdcs				g->mlen = newlen;
177062391Sdcs				if (offset > -1) {
177162391Sdcs					g->moffset += offset;
177262391Sdcs					offset = newlen;
177362391Sdcs				} else
177462391Sdcs					g->moffset = offset;
177562391Sdcs			} else {
177662391Sdcs				if (offset > -1)
177762391Sdcs					offset += newlen;
177862391Sdcs			}
177962391Sdcs			if (offset > -1)
178062391Sdcs				offset++;
178162391Sdcs			newlen = 0;
178262391Sdcs			break;
178362391Sdcs		case OANYOF:		/* may or may not invalidate offset */
178462391Sdcs			/* First, everything as OANY */
178562391Sdcs			if (newlen > g->mlen) {		/* ends one */
178662391Sdcs				start = newstart;
178762391Sdcs				g->mlen = newlen;
178862391Sdcs				if (offset > -1) {
178962391Sdcs					g->moffset += offset;
179062391Sdcs					offset = newlen;
179162391Sdcs				} else
179262391Sdcs					g->moffset = offset;
179362391Sdcs			} else {
179462391Sdcs				if (offset > -1)
179562391Sdcs					offset += newlen;
179662391Sdcs			}
179762391Sdcs			if (offset > -1)
179862391Sdcs				offset++;
179962391Sdcs			newlen = 0;
180062391Sdcs			/* And, now, if we found out we can't deal with
180162391Sdcs			 * it, make offset = -1.
180262391Sdcs			 */
180362391Sdcs			if (mccs)
180462391Sdcs				offset = -1;
180562391Sdcs			break;
180662391Sdcs		default:
180762391Sdcs			/* Anything here makes it impossible or too hard
180862391Sdcs			 * to calculate the offset -- so we give up;
180962391Sdcs			 * save the last known good offset, in case the
181062391Sdcs			 * must sequence doesn't occur later.
181162391Sdcs			 */
181262391Sdcs			if (newlen > g->mlen) {		/* ends one */
181362391Sdcs				start = newstart;
181462391Sdcs				g->mlen = newlen;
181562391Sdcs				if (offset > -1)
181662391Sdcs					g->moffset += offset;
181762391Sdcs				else
181862391Sdcs					g->moffset = offset;
181962391Sdcs			}
182062391Sdcs			offset = -1;
182162391Sdcs			newlen = 0;
182262391Sdcs			break;
18231573Srgrimes		}
18241573Srgrimes	} while (OP(s) != OEND);
18251573Srgrimes
182662391Sdcs	if (g->mlen == 0) {		/* there isn't one */
182762391Sdcs		g->moffset = -1;
18281573Srgrimes		return;
182962391Sdcs	}
18301573Srgrimes
18311573Srgrimes	/* turn it into a character string */
18321573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
18331573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
18341573Srgrimes		g->mlen = 0;
183562391Sdcs		g->moffset = -1;
18361573Srgrimes		return;
18371573Srgrimes	}
18381573Srgrimes	cp = g->must;
18391573Srgrimes	scan = start;
18401573Srgrimes	for (i = g->mlen; i > 0; i--) {
18411573Srgrimes		while (OP(s = *scan++) != OCHAR)
18421573Srgrimes			continue;
18431573Srgrimes		assert(cp < g->must + g->mlen);
18441573Srgrimes		*cp++ = (char)OPND(s);
18451573Srgrimes	}
18461573Srgrimes	assert(cp == g->must + g->mlen);
18471573Srgrimes	*cp++ = '\0';		/* just on general principles */
18481573Srgrimes}
18491573Srgrimes
18501573Srgrimes/*
185162391Sdcs - altoffset - choose biggest offset among multiple choices
185262673Sdcs == static int altoffset(sop *scan, int offset, int mccs);
185362391Sdcs *
185462391Sdcs * Compute, recursively if necessary, the largest offset among multiple
185562391Sdcs * re paths.
185662391Sdcs */
185762391Sdcsstatic int
185862391Sdcsaltoffset(scan, offset, mccs)
185962391Sdcssop *scan;
186062391Sdcsint offset;
186162391Sdcsint mccs;
186262391Sdcs{
186362391Sdcs	int largest;
186462391Sdcs	int try;
186562391Sdcs	sop s;
186662391Sdcs
186762391Sdcs	/* If we gave up already on offsets, return */
186862391Sdcs	if (offset == -1)
186962391Sdcs		return -1;
187062391Sdcs
187162391Sdcs	largest = 0;
187262391Sdcs	try = 0;
187362391Sdcs	s = *scan++;
187462391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
187562391Sdcs		switch (OP(s)) {
187662391Sdcs		case OOR1:
187762391Sdcs			if (try > largest)
187862391Sdcs				largest = try;
187962391Sdcs			try = 0;
188062391Sdcs			break;
188162391Sdcs		case OQUEST_:
188262391Sdcs		case OCH_:
188362391Sdcs			try = altoffset(scan, try, mccs);
188462391Sdcs			if (try == -1)
188562391Sdcs				return -1;
188662391Sdcs			scan--;
188762391Sdcs			do {
188862391Sdcs				scan += OPND(s);
188962391Sdcs				s = *scan;
189062391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
189162391Sdcs							OP(s) != OOR2)
189262391Sdcs					return -1;
189362391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
189462855Sdcs			/* We must skip to the next position, or we'll
189562855Sdcs			 * leave altoffset() too early.
189662855Sdcs			 */
189762855Sdcs			scan++;
189862391Sdcs			break;
189962391Sdcs		case OANYOF:
190062391Sdcs			if (mccs)
190162391Sdcs				return -1;
190262391Sdcs		case OCHAR:
190362391Sdcs		case OANY:
190462391Sdcs			try++;
190562391Sdcs		case OBOW:
190662391Sdcs		case OEOW:
190762391Sdcs		case OLPAREN:
190862391Sdcs		case ORPAREN:
190962391Sdcs		case OOR2:
191062391Sdcs			break;
191162391Sdcs		default:
191262391Sdcs			try = -1;
191362391Sdcs			break;
191462391Sdcs		}
191562391Sdcs		if (try == -1)
191662391Sdcs			return -1;
191762391Sdcs		s = *scan++;
191862391Sdcs	}
191962391Sdcs
192062391Sdcs	if (try > largest)
192162391Sdcs		largest = try;
192262391Sdcs
192362391Sdcs	return largest+offset;
192462391Sdcs}
192562391Sdcs
192662391Sdcs/*
192762232Sdcs - computejumps - compute char jumps for BM scan
192892889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
192962232Sdcs *
193062232Sdcs * This algorithm assumes g->must exists and is has size greater than
193162232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
193262232Sdcs * Sara Baase.
193362232Sdcs *
193462232Sdcs * A char jump is the number of characters one needs to jump based on
193562232Sdcs * the value of the character from the text that was mismatched.
193662232Sdcs */
193762232Sdcsstatic void
193862232Sdcscomputejumps(p, g)
193962232Sdcsstruct parse *p;
194062232Sdcsstruct re_guts *g;
194162232Sdcs{
194262232Sdcs	int ch;
194362232Sdcs	int mindex;
194462232Sdcs
194562232Sdcs	/* Avoid making errors worse */
194662232Sdcs	if (p->error != 0)
194762232Sdcs		return;
194862232Sdcs
194962848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
195062232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
195162232Sdcs		return;
195262754Sdcs	/* Adjust for signed chars, if necessary */
195362754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
195462232Sdcs
195562232Sdcs	/* If the character does not exist in the pattern, the jump
195662232Sdcs	 * is equal to the number of characters in the pattern.
195762232Sdcs	 */
195862754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
195962232Sdcs		g->charjump[ch] = g->mlen;
196062232Sdcs
196162232Sdcs	/* If the character does exist, compute the jump that would
196262232Sdcs	 * take us to the last character in the pattern equal to it
196362232Sdcs	 * (notice that we match right to left, so that last character
196462232Sdcs	 * is the first one that would be matched).
196562232Sdcs	 */
196662232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
196762754Sdcs		g->charjump[g->must[mindex]] = g->mlen - mindex - 1;
196862232Sdcs}
196962232Sdcs
197062232Sdcs/*
197162232Sdcs - computematchjumps - compute match jumps for BM scan
197292889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
197362232Sdcs *
197462232Sdcs * This algorithm assumes g->must exists and is has size greater than
197562232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
197662232Sdcs * Sara Baase.
197762232Sdcs *
197862232Sdcs * A match jump is the number of characters one needs to advance based
197962232Sdcs * on the already-matched suffix.
198062232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
198162232Sdcs * the search algorithm works.
198262232Sdcs */
198362232Sdcsstatic void
198462232Sdcscomputematchjumps(p, g)
198562232Sdcsstruct parse *p;
198662232Sdcsstruct re_guts *g;
198762232Sdcs{
198862232Sdcs	int mindex;		/* General "must" iterator */
198962232Sdcs	int suffix;		/* Keeps track of matching suffix */
199062232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
199162232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
199262232Sdcs				 * such that i+1...mlen is a substring
199362232Sdcs				 * of k+1...k+mlen-i-1
199462232Sdcs				 */
199562232Sdcs
199662232Sdcs	/* Avoid making errors worse */
199762232Sdcs	if (p->error != 0)
199862232Sdcs		return;
199962232Sdcs
200062848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
200162232Sdcs	if (pmatches == NULL) {
200262232Sdcs		g->matchjump = NULL;
200362232Sdcs		return;
200462232Sdcs	}
200562232Sdcs
200662848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
200762232Sdcs	if (g->matchjump == NULL)	/* Not a fatal error */
200862232Sdcs		return;
200962232Sdcs
201062232Sdcs	/* Set maximum possible jump for each character in the pattern */
201162232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
201262232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
201362232Sdcs
201462232Sdcs	/* Compute pmatches[] */
201562232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
201662232Sdcs	    mindex--, suffix--) {
201762232Sdcs		pmatches[mindex] = suffix;
201862232Sdcs
201962232Sdcs		/* If a mismatch is found, interrupting the substring,
202062232Sdcs		 * compute the matchjump for that position. If no
202162232Sdcs		 * mismatch is found, then a text substring mismatched
202262232Sdcs		 * against the suffix will also mismatch against the
202362232Sdcs		 * substring.
202462232Sdcs		 */
202562232Sdcs		while (suffix < g->mlen
202662232Sdcs		    && g->must[mindex] != g->must[suffix]) {
202762232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
202862232Sdcs			    g->mlen - mindex - 1);
202962232Sdcs			suffix = pmatches[suffix];
203062232Sdcs		}
203162232Sdcs	}
203262232Sdcs
203362232Sdcs	/* Compute the matchjump up to the last substring found to jump
203462232Sdcs	 * to the beginning of the largest must pattern prefix matching
203562232Sdcs	 * it's own suffix.
203662232Sdcs	 */
203762232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
203862232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
203962232Sdcs		    g->mlen + suffix - mindex);
204062232Sdcs
204162232Sdcs        ssuffix = pmatches[suffix];
204262391Sdcs        while (suffix < g->mlen) {
204362673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
204462232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
204562232Sdcs			    g->mlen + ssuffix - suffix);
204662232Sdcs                        suffix++;
204762232Sdcs                }
204886208Sdcs		if (suffix < g->mlen)
204986208Sdcs                	ssuffix = pmatches[ssuffix];
205062232Sdcs        }
205162232Sdcs
205262232Sdcs	free(pmatches);
205362232Sdcs}
205462232Sdcs
205562232Sdcs/*
20561573Srgrimes - pluscount - count + nesting
205792889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
20581573Srgrimes */
20591573Srgrimesstatic sopno			/* nesting depth */
20601573Srgrimespluscount(p, g)
20611573Srgrimesstruct parse *p;
206292889Sobrienstruct re_guts *g;
20631573Srgrimes{
206492889Sobrien	sop *scan;
206592889Sobrien	sop s;
206692889Sobrien	sopno plusnest = 0;
206792889Sobrien	sopno maxnest = 0;
20681573Srgrimes
20691573Srgrimes	if (p->error != 0)
20701573Srgrimes		return(0);	/* there may not be an OEND */
20711573Srgrimes
20721573Srgrimes	scan = g->strip + 1;
20731573Srgrimes	do {
20741573Srgrimes		s = *scan++;
20751573Srgrimes		switch (OP(s)) {
20761573Srgrimes		case OPLUS_:
20771573Srgrimes			plusnest++;
20781573Srgrimes			break;
20791573Srgrimes		case O_PLUS:
20801573Srgrimes			if (plusnest > maxnest)
20811573Srgrimes				maxnest = plusnest;
20821573Srgrimes			plusnest--;
20831573Srgrimes			break;
20841573Srgrimes		}
20851573Srgrimes	} while (OP(s) != OEND);
20861573Srgrimes	if (plusnest != 0)
20871573Srgrimes		g->iflags |= BAD;
20881573Srgrimes	return(maxnest);
20891573Srgrimes}
2090