regcomp.c revision 136091
11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
61573Srgrimes * This code is derived from software contributed to Berkeley by
71573Srgrimes * Henry Spencer.
81573Srgrimes *
91573Srgrimes * Redistribution and use in source and binary forms, with or without
101573Srgrimes * modification, are permitted provided that the following conditions
111573Srgrimes * are met:
121573Srgrimes * 1. Redistributions of source code must retain the above copyright
131573Srgrimes *    notice, this list of conditions and the following disclaimer.
141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
151573Srgrimes *    notice, this list of conditions and the following disclaimer in the
161573Srgrimes *    documentation and/or other materials provided with the distribution.
171573Srgrimes * 3. All advertising materials mentioning features or use of this software
181573Srgrimes *    must display the following acknowledgement:
191573Srgrimes *	This product includes software developed by the University of
201573Srgrimes *	California, Berkeley and its contributors.
211573Srgrimes * 4. Neither the name of the University nor the names of its contributors
221573Srgrimes *    may be used to endorse or promote products derived from this software
231573Srgrimes *    without specific prior written permission.
241573Srgrimes *
251573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
261573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
271573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
281573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
291573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
301573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
311573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
321573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
331573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
341573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
351573Srgrimes * SUCH DAMAGE.
361573Srgrimes *
371573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
381573Srgrimes */
391573Srgrimes
401573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
411573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
421573Srgrimes#endif /* LIBC_SCCS and not lint */
4392986Sobrien#include <sys/cdefs.h>
4492986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 136091 2004-10-03 15:42:59Z stefanf $");
451573Srgrimes
461573Srgrimes#include <sys/types.h>
471573Srgrimes#include <stdio.h>
481573Srgrimes#include <string.h>
491573Srgrimes#include <ctype.h>
501573Srgrimes#include <limits.h>
511573Srgrimes#include <stdlib.h>
521573Srgrimes#include <regex.h>
53136091Sstefanf#include <runetype.h>
54132019Stjr#include <wchar.h>
55132019Stjr#include <wctype.h>
561573Srgrimes
5719277Sache#include "collate.h"
5819277Sache
591573Srgrimes#include "utils.h"
601573Srgrimes#include "regex2.h"
611573Srgrimes
621573Srgrimes#include "cname.h"
631573Srgrimes
641573Srgrimes/*
651573Srgrimes * parse structure, passed up and down to avoid global variables and
661573Srgrimes * other clumsinesses
671573Srgrimes */
681573Srgrimesstruct parse {
691573Srgrimes	char *next;		/* next character in RE */
701573Srgrimes	char *end;		/* end of string (-> NUL normally) */
711573Srgrimes	int error;		/* has an error been seen? */
721573Srgrimes	sop *strip;		/* malloced strip */
731573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
741573Srgrimes	sopno slen;		/* malloced strip length (used) */
751573Srgrimes	int ncsalloc;		/* number of csets allocated */
761573Srgrimes	struct re_guts *g;
771573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
781573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
791573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
801573Srgrimes};
811573Srgrimes
821573Srgrimes/* ========= begin header generated by ./mkh ========= */
831573Srgrimes#ifdef __cplusplus
841573Srgrimesextern "C" {
851573Srgrimes#endif
861573Srgrimes
871573Srgrimes/* === regcomp.c === */
88132019Stjrstatic void p_ere(struct parse *p, wint_t stop);
8992905Sobrienstatic void p_ere_exp(struct parse *p);
9092905Sobrienstatic void p_str(struct parse *p);
91132019Stjrstatic void p_bre(struct parse *p, wint_t end1, wint_t end2);
9292905Sobrienstatic int p_simp_re(struct parse *p, int starordinary);
9392905Sobrienstatic int p_count(struct parse *p);
9492905Sobrienstatic void p_bracket(struct parse *p);
9592905Sobrienstatic void p_b_term(struct parse *p, cset *cs);
9692905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs);
9792905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs);
98132019Stjrstatic wint_t p_b_symbol(struct parse *p);
99132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc);
100132019Stjrstatic wint_t othercase(wint_t ch);
101132019Stjrstatic void bothcases(struct parse *p, wint_t ch);
102132019Stjrstatic void ordinary(struct parse *p, wint_t ch);
10392905Sobrienstatic void nonnewline(struct parse *p);
10492905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to);
10592905Sobrienstatic int seterr(struct parse *p, int e);
10692905Sobrienstatic cset *allocset(struct parse *p);
10792905Sobrienstatic void freeset(struct parse *p, cset *cs);
108132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch);
109132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
110132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
111132019Stjrstatic wint_t singleton(cset *cs);
11292905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish);
11392905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd);
11492905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
11592905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value);
11692905Sobrienstatic void enlarge(struct parse *p, sopno size);
11792905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g);
11892905Sobrienstatic void findmust(struct parse *p, struct re_guts *g);
119131973Stjrstatic int altoffset(sop *scan, int offset);
12092905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g);
12192905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g);
12292905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g);
123132019Stjrstatic wint_t wgetnext(struct parse *p);
1241573Srgrimes
1251573Srgrimes#ifdef __cplusplus
1261573Srgrimes}
1271573Srgrimes#endif
1281573Srgrimes/* ========= end header generated by ./mkh ========= */
1291573Srgrimes
1301573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1311573Srgrimes
1321573Srgrimes/*
1331573Srgrimes * macros for use with parse structure
1341573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1351573Srgrimes */
1361573Srgrimes#define	PEEK()	(*p->next)
1371573Srgrimes#define	PEEK2()	(*(p->next+1))
1381573Srgrimes#define	MORE()	(p->next < p->end)
1391573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1401573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1411573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1421573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1431573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1441573Srgrimes#define	NEXT()	(p->next++)
1451573Srgrimes#define	NEXT2()	(p->next += 2)
1461573Srgrimes#define	NEXTn(n)	(p->next += (n))
1471573Srgrimes#define	GETNEXT()	(*p->next++)
148132019Stjr#define	WGETNEXT()	wgetnext(p)
1491573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1501573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1511573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1521573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1531573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1541573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1551573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1561573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1571573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1581573Srgrimes#define	HERE()		(p->slen)
1591573Srgrimes#define	THERE()		(p->slen - 1)
1601573Srgrimes#define	THERETHERE()	(p->slen - 2)
1611573Srgrimes#define	DROP(n)	(p->slen -= (n))
1621573Srgrimes
1631573Srgrimes#ifndef NDEBUG
1641573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1651573Srgrimes#else
1661573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1671573Srgrimes#endif
1681573Srgrimes
16962232Sdcs/* Macro used by computejump()/computematchjump() */
17062232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
17162232Sdcs
1721573Srgrimes/*
1731573Srgrimes - regcomp - interface for parser and compilation
1741573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1751573Srgrimes = #define	REG_BASIC	0000
1761573Srgrimes = #define	REG_EXTENDED	0001
1771573Srgrimes = #define	REG_ICASE	0002
1781573Srgrimes = #define	REG_NOSUB	0004
1791573Srgrimes = #define	REG_NEWLINE	0010
1801573Srgrimes = #define	REG_NOSPEC	0020
1811573Srgrimes = #define	REG_PEND	0040
1821573Srgrimes = #define	REG_DUMP	0200
1831573Srgrimes */
1841573Srgrimesint				/* 0 success, otherwise REG_something */
1851573Srgrimesregcomp(preg, pattern, cflags)
186104358Smikeregex_t * __restrict preg;
187104358Smikeconst char * __restrict pattern;
1881573Srgrimesint cflags;
1891573Srgrimes{
1901573Srgrimes	struct parse pa;
19192889Sobrien	struct re_guts *g;
19292889Sobrien	struct parse *p = &pa;
19392889Sobrien	int i;
19492889Sobrien	size_t len;
1951573Srgrimes#ifdef REDEBUG
1961573Srgrimes#	define	GOODFLAGS(f)	(f)
1971573Srgrimes#else
1981573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
1991573Srgrimes#endif
2001573Srgrimes
2011573Srgrimes	cflags = GOODFLAGS(cflags);
2021573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
2031573Srgrimes		return(REG_INVARG);
2041573Srgrimes
2051573Srgrimes	if (cflags&REG_PEND) {
2061573Srgrimes		if (preg->re_endp < pattern)
2071573Srgrimes			return(REG_INVARG);
2081573Srgrimes		len = preg->re_endp - pattern;
2091573Srgrimes	} else
2101573Srgrimes		len = strlen((char *)pattern);
2111573Srgrimes
2121573Srgrimes	/* do the mallocs early so failure handling is easy */
213131973Stjr	g = (struct re_guts *)malloc(sizeof(struct re_guts));
2141573Srgrimes	if (g == NULL)
2151573Srgrimes		return(REG_ESPACE);
2161573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
2171573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2181573Srgrimes	p->slen = 0;
2191573Srgrimes	if (p->strip == NULL) {
2201573Srgrimes		free((char *)g);
2211573Srgrimes		return(REG_ESPACE);
2221573Srgrimes	}
2231573Srgrimes
2241573Srgrimes	/* set things up */
2251573Srgrimes	p->g = g;
2261573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2271573Srgrimes	p->end = p->next + len;
2281573Srgrimes	p->error = 0;
2291573Srgrimes	p->ncsalloc = 0;
2301573Srgrimes	for (i = 0; i < NPAREN; i++) {
2311573Srgrimes		p->pbegin[i] = 0;
2321573Srgrimes		p->pend[i] = 0;
2331573Srgrimes	}
2341573Srgrimes	g->sets = NULL;
2351573Srgrimes	g->ncsets = 0;
2361573Srgrimes	g->cflags = cflags;
2371573Srgrimes	g->iflags = 0;
2381573Srgrimes	g->nbol = 0;
2391573Srgrimes	g->neol = 0;
2401573Srgrimes	g->must = NULL;
24162391Sdcs	g->moffset = -1;
24262263Sdcs	g->charjump = NULL;
24362263Sdcs	g->matchjump = NULL;
2441573Srgrimes	g->mlen = 0;
2451573Srgrimes	g->nsub = 0;
2461573Srgrimes	g->backrefs = 0;
2471573Srgrimes
2481573Srgrimes	/* do it */
2491573Srgrimes	EMIT(OEND, 0);
2501573Srgrimes	g->firststate = THERE();
2511573Srgrimes	if (cflags&REG_EXTENDED)
2521573Srgrimes		p_ere(p, OUT);
2531573Srgrimes	else if (cflags&REG_NOSPEC)
2541573Srgrimes		p_str(p);
2551573Srgrimes	else
2561573Srgrimes		p_bre(p, OUT, OUT);
2571573Srgrimes	EMIT(OEND, 0);
2581573Srgrimes	g->laststate = THERE();
2591573Srgrimes
2601573Srgrimes	/* tidy up loose ends and fill things in */
2611573Srgrimes	stripsnug(p, g);
2621573Srgrimes	findmust(p, g);
26362232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
26462232Sdcs	 * than three characters
26562232Sdcs	 */
26662232Sdcs	if(g->mlen > 3) {
26762232Sdcs		computejumps(p, g);
26862232Sdcs		computematchjumps(p, g);
26962755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
27062232Sdcs			free(g->charjump);
27162232Sdcs			g->charjump = NULL;
27262232Sdcs		}
27362232Sdcs	}
2741573Srgrimes	g->nplus = pluscount(p, g);
2751573Srgrimes	g->magic = MAGIC2;
2761573Srgrimes	preg->re_nsub = g->nsub;
2771573Srgrimes	preg->re_g = g;
2781573Srgrimes	preg->re_magic = MAGIC1;
2791573Srgrimes#ifndef REDEBUG
2801573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2811573Srgrimes	if (g->iflags&BAD)
2821573Srgrimes		SETERROR(REG_ASSERT);
2831573Srgrimes#endif
2841573Srgrimes
2851573Srgrimes	/* win or lose, we're done */
2861573Srgrimes	if (p->error != 0)	/* lose */
2871573Srgrimes		regfree(preg);
2881573Srgrimes	return(p->error);
2891573Srgrimes}
2901573Srgrimes
2911573Srgrimes/*
2921573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
29392889Sobrien == static void p_ere(struct parse *p, int stop);
2941573Srgrimes */
2951573Srgrimesstatic void
2961573Srgrimesp_ere(p, stop)
29792889Sobrienstruct parse *p;
2981573Srgrimesint stop;			/* character this ERE should end at */
2991573Srgrimes{
30092889Sobrien	char c;
30192889Sobrien	sopno prevback;
30292889Sobrien	sopno prevfwd;
30392889Sobrien	sopno conc;
30492889Sobrien	int first = 1;		/* is this the first alternative? */
3051573Srgrimes
3061573Srgrimes	for (;;) {
3071573Srgrimes		/* do a bunch of concatenated expressions */
3081573Srgrimes		conc = HERE();
3091573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3101573Srgrimes			p_ere_exp(p);
31117141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3121573Srgrimes
3131573Srgrimes		if (!EAT('|'))
3141573Srgrimes			break;		/* NOTE BREAK OUT */
3151573Srgrimes
3161573Srgrimes		if (first) {
3171573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3181573Srgrimes			prevfwd = conc;
3191573Srgrimes			prevback = conc;
3201573Srgrimes			first = 0;
3211573Srgrimes		}
3221573Srgrimes		ASTERN(OOR1, prevback);
3231573Srgrimes		prevback = THERE();
3241573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3251573Srgrimes		prevfwd = HERE();
3261573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3271573Srgrimes	}
3281573Srgrimes
3291573Srgrimes	if (!first) {		/* tail-end fixups */
3301573Srgrimes		AHEAD(prevfwd);
3311573Srgrimes		ASTERN(O_CH, prevback);
3321573Srgrimes	}
3331573Srgrimes
3341573Srgrimes	assert(!MORE() || SEE(stop));
3351573Srgrimes}
3361573Srgrimes
3371573Srgrimes/*
3381573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
33992889Sobrien == static void p_ere_exp(struct parse *p);
3401573Srgrimes */
3411573Srgrimesstatic void
3421573Srgrimesp_ere_exp(p)
34392889Sobrienstruct parse *p;
3441573Srgrimes{
34592889Sobrien	char c;
346132019Stjr	wint_t wc;
34792889Sobrien	sopno pos;
34892889Sobrien	int count;
34992889Sobrien	int count2;
35092889Sobrien	sopno subno;
3511573Srgrimes	int wascaret = 0;
3521573Srgrimes
3531573Srgrimes	assert(MORE());		/* caller should have ensured this */
3541573Srgrimes	c = GETNEXT();
3551573Srgrimes
3561573Srgrimes	pos = HERE();
3571573Srgrimes	switch (c) {
3581573Srgrimes	case '(':
35917141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3601573Srgrimes		p->g->nsub++;
3611573Srgrimes		subno = p->g->nsub;
3621573Srgrimes		if (subno < NPAREN)
3631573Srgrimes			p->pbegin[subno] = HERE();
3641573Srgrimes		EMIT(OLPAREN, subno);
3651573Srgrimes		if (!SEE(')'))
3661573Srgrimes			p_ere(p, ')');
3671573Srgrimes		if (subno < NPAREN) {
3681573Srgrimes			p->pend[subno] = HERE();
3691573Srgrimes			assert(p->pend[subno] != 0);
3701573Srgrimes		}
3711573Srgrimes		EMIT(ORPAREN, subno);
37217141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3731573Srgrimes		break;
3741573Srgrimes#ifndef POSIX_MISTAKE
3751573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3761573Srgrimes		/*
3771573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3781573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3791573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3801573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3811573Srgrimes		 * we can get it fixed.
3821573Srgrimes		 */
3831573Srgrimes		SETERROR(REG_EPAREN);
3841573Srgrimes		break;
3851573Srgrimes#endif
3861573Srgrimes	case '^':
3871573Srgrimes		EMIT(OBOL, 0);
3881573Srgrimes		p->g->iflags |= USEBOL;
3891573Srgrimes		p->g->nbol++;
3901573Srgrimes		wascaret = 1;
3911573Srgrimes		break;
3921573Srgrimes	case '$':
3931573Srgrimes		EMIT(OEOL, 0);
3941573Srgrimes		p->g->iflags |= USEEOL;
3951573Srgrimes		p->g->neol++;
3961573Srgrimes		break;
3971573Srgrimes	case '|':
3981573Srgrimes		SETERROR(REG_EMPTY);
3991573Srgrimes		break;
4001573Srgrimes	case '*':
4011573Srgrimes	case '+':
4021573Srgrimes	case '?':
4031573Srgrimes		SETERROR(REG_BADRPT);
4041573Srgrimes		break;
4051573Srgrimes	case '.':
4061573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4071573Srgrimes			nonnewline(p);
4081573Srgrimes		else
4091573Srgrimes			EMIT(OANY, 0);
4101573Srgrimes		break;
4111573Srgrimes	case '[':
4121573Srgrimes		p_bracket(p);
4131573Srgrimes		break;
4141573Srgrimes	case '\\':
41517141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
416132019Stjr		wc = WGETNEXT();
417132019Stjr		ordinary(p, wc);
4181573Srgrimes		break;
4191573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
42049094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4211573Srgrimes		/* FALLTHROUGH */
4221573Srgrimes	default:
423132019Stjr		p->next--;
424132019Stjr		wc = WGETNEXT();
425132019Stjr		ordinary(p, wc);
4261573Srgrimes		break;
4271573Srgrimes	}
4281573Srgrimes
4291573Srgrimes	if (!MORE())
4301573Srgrimes		return;
4311573Srgrimes	c = PEEK();
4321573Srgrimes	/* we call { a repetition if followed by a digit */
4331573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
43449094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4351573Srgrimes		return;		/* no repetition, we're done */
4361573Srgrimes	NEXT();
4371573Srgrimes
43817141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4391573Srgrimes	switch (c) {
4401573Srgrimes	case '*':	/* implemented as +? */
4411573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4421573Srgrimes		INSERT(OPLUS_, pos);
4431573Srgrimes		ASTERN(O_PLUS, pos);
4441573Srgrimes		INSERT(OQUEST_, pos);
4451573Srgrimes		ASTERN(O_QUEST, pos);
4461573Srgrimes		break;
4471573Srgrimes	case '+':
4481573Srgrimes		INSERT(OPLUS_, pos);
4491573Srgrimes		ASTERN(O_PLUS, pos);
4501573Srgrimes		break;
4511573Srgrimes	case '?':
4521573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4531573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4541573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4551573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4561573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4571573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4581573Srgrimes		ASTERN(O_CH, THERETHERE());
4591573Srgrimes		break;
4601573Srgrimes	case '{':
4611573Srgrimes		count = p_count(p);
4621573Srgrimes		if (EAT(',')) {
46349094Sache			if (isdigit((uch)PEEK())) {
4641573Srgrimes				count2 = p_count(p);
46517141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4661573Srgrimes			} else		/* single number with comma */
4671573Srgrimes				count2 = INFINITY;
4681573Srgrimes		} else		/* just a single number */
4691573Srgrimes			count2 = count;
4701573Srgrimes		repeat(p, pos, count, count2);
4711573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4721573Srgrimes			while (MORE() && PEEK() != '}')
4731573Srgrimes				NEXT();
47417141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
4751573Srgrimes			SETERROR(REG_BADBR);
4761573Srgrimes		}
4771573Srgrimes		break;
4781573Srgrimes	}
4791573Srgrimes
4801573Srgrimes	if (!MORE())
4811573Srgrimes		return;
4821573Srgrimes	c = PEEK();
4831573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
48449094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
4851573Srgrimes		return;
4861573Srgrimes	SETERROR(REG_BADRPT);
4871573Srgrimes}
4881573Srgrimes
4891573Srgrimes/*
4901573Srgrimes - p_str - string (no metacharacters) "parser"
49192889Sobrien == static void p_str(struct parse *p);
4921573Srgrimes */
4931573Srgrimesstatic void
4941573Srgrimesp_str(p)
49592889Sobrienstruct parse *p;
4961573Srgrimes{
49717141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
4981573Srgrimes	while (MORE())
499132019Stjr		ordinary(p, WGETNEXT());
5001573Srgrimes}
5011573Srgrimes
5021573Srgrimes/*
5031573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
50492889Sobrien == static void p_bre(struct parse *p, int end1, \
50592889Sobrien ==	int end2);
5061573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
5071573Srgrimes *
5081573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
509131973Stjr * taken as an ordinary character and then revised to be an anchor.
5101573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5111573Srgrimes */
5121573Srgrimesstatic void
5131573Srgrimesp_bre(p, end1, end2)
51492889Sobrienstruct parse *p;
51592889Sobrienint end1;			/* first terminating character */
51692889Sobrienint end2;			/* second terminating character */
5171573Srgrimes{
51892889Sobrien	sopno start = HERE();
51992889Sobrien	int first = 1;			/* first subexpression? */
52092889Sobrien	int wasdollar = 0;
5211573Srgrimes
5221573Srgrimes	if (EAT('^')) {
5231573Srgrimes		EMIT(OBOL, 0);
5241573Srgrimes		p->g->iflags |= USEBOL;
5251573Srgrimes		p->g->nbol++;
5261573Srgrimes	}
5271573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5281573Srgrimes		wasdollar = p_simp_re(p, first);
5291573Srgrimes		first = 0;
5301573Srgrimes	}
5311573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5321573Srgrimes		DROP(1);
5331573Srgrimes		EMIT(OEOL, 0);
5341573Srgrimes		p->g->iflags |= USEEOL;
5351573Srgrimes		p->g->neol++;
5361573Srgrimes	}
5371573Srgrimes
53817141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5391573Srgrimes}
5401573Srgrimes
5411573Srgrimes/*
5421573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
54392889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5441573Srgrimes */
5451573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
5461573Srgrimesp_simp_re(p, starordinary)
54792889Sobrienstruct parse *p;
5481573Srgrimesint starordinary;		/* is a leading * an ordinary character? */
5491573Srgrimes{
55092889Sobrien	int c;
55192889Sobrien	int count;
55292889Sobrien	int count2;
55392889Sobrien	sopno pos;
55492889Sobrien	int i;
555132019Stjr	wint_t wc;
55692889Sobrien	sopno subno;
5571573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5581573Srgrimes
5591573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5601573Srgrimes
5611573Srgrimes	assert(MORE());		/* caller should have ensured this */
5621573Srgrimes	c = GETNEXT();
5631573Srgrimes	if (c == '\\') {
56417141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
56549094Sache		c = BACKSL | GETNEXT();
5661573Srgrimes	}
5671573Srgrimes	switch (c) {
5681573Srgrimes	case '.':
5691573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5701573Srgrimes			nonnewline(p);
5711573Srgrimes		else
5721573Srgrimes			EMIT(OANY, 0);
5731573Srgrimes		break;
5741573Srgrimes	case '[':
5751573Srgrimes		p_bracket(p);
5761573Srgrimes		break;
5771573Srgrimes	case BACKSL|'{':
5781573Srgrimes		SETERROR(REG_BADRPT);
5791573Srgrimes		break;
5801573Srgrimes	case BACKSL|'(':
5811573Srgrimes		p->g->nsub++;
5821573Srgrimes		subno = p->g->nsub;
5831573Srgrimes		if (subno < NPAREN)
5841573Srgrimes			p->pbegin[subno] = HERE();
5851573Srgrimes		EMIT(OLPAREN, subno);
5861573Srgrimes		/* the MORE here is an error heuristic */
5871573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
5881573Srgrimes			p_bre(p, '\\', ')');
5891573Srgrimes		if (subno < NPAREN) {
5901573Srgrimes			p->pend[subno] = HERE();
5911573Srgrimes			assert(p->pend[subno] != 0);
5921573Srgrimes		}
5931573Srgrimes		EMIT(ORPAREN, subno);
59417141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
5951573Srgrimes		break;
5961573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
5971573Srgrimes	case BACKSL|'}':
5981573Srgrimes		SETERROR(REG_EPAREN);
5991573Srgrimes		break;
6001573Srgrimes	case BACKSL|'1':
6011573Srgrimes	case BACKSL|'2':
6021573Srgrimes	case BACKSL|'3':
6031573Srgrimes	case BACKSL|'4':
6041573Srgrimes	case BACKSL|'5':
6051573Srgrimes	case BACKSL|'6':
6061573Srgrimes	case BACKSL|'7':
6071573Srgrimes	case BACKSL|'8':
6081573Srgrimes	case BACKSL|'9':
6091573Srgrimes		i = (c&~BACKSL) - '0';
6101573Srgrimes		assert(i < NPAREN);
6111573Srgrimes		if (p->pend[i] != 0) {
6121573Srgrimes			assert(i <= p->g->nsub);
6131573Srgrimes			EMIT(OBACK_, i);
6141573Srgrimes			assert(p->pbegin[i] != 0);
6151573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6161573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6171573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6181573Srgrimes			EMIT(O_BACK, i);
6191573Srgrimes		} else
6201573Srgrimes			SETERROR(REG_ESUBREG);
6211573Srgrimes		p->g->backrefs = 1;
6221573Srgrimes		break;
6231573Srgrimes	case '*':
62417141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6251573Srgrimes		/* FALLTHROUGH */
6261573Srgrimes	default:
627132019Stjr		p->next--;
628132019Stjr		wc = WGETNEXT();
629132019Stjr		ordinary(p, wc);
6301573Srgrimes		break;
6311573Srgrimes	}
6321573Srgrimes
6331573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6341573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6351573Srgrimes		INSERT(OPLUS_, pos);
6361573Srgrimes		ASTERN(O_PLUS, pos);
6371573Srgrimes		INSERT(OQUEST_, pos);
6381573Srgrimes		ASTERN(O_QUEST, pos);
6391573Srgrimes	} else if (EATTWO('\\', '{')) {
6401573Srgrimes		count = p_count(p);
6411573Srgrimes		if (EAT(',')) {
64249094Sache			if (MORE() && isdigit((uch)PEEK())) {
6431573Srgrimes				count2 = p_count(p);
64417141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6451573Srgrimes			} else		/* single number with comma */
6461573Srgrimes				count2 = INFINITY;
6471573Srgrimes		} else		/* just a single number */
6481573Srgrimes			count2 = count;
6491573Srgrimes		repeat(p, pos, count, count2);
6501573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6511573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6521573Srgrimes				NEXT();
65317141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6541573Srgrimes			SETERROR(REG_BADBR);
6551573Srgrimes		}
65649094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6571573Srgrimes		return(1);
6581573Srgrimes
6591573Srgrimes	return(0);
6601573Srgrimes}
6611573Srgrimes
6621573Srgrimes/*
6631573Srgrimes - p_count - parse a repetition count
66492889Sobrien == static int p_count(struct parse *p);
6651573Srgrimes */
6661573Srgrimesstatic int			/* the value */
6671573Srgrimesp_count(p)
66892889Sobrienstruct parse *p;
6691573Srgrimes{
67092889Sobrien	int count = 0;
67192889Sobrien	int ndigits = 0;
6721573Srgrimes
67349094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6741573Srgrimes		count = count*10 + (GETNEXT() - '0');
6751573Srgrimes		ndigits++;
6761573Srgrimes	}
6771573Srgrimes
67817141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6791573Srgrimes	return(count);
6801573Srgrimes}
6811573Srgrimes
6821573Srgrimes/*
6831573Srgrimes - p_bracket - parse a bracketed character list
68492889Sobrien == static void p_bracket(struct parse *p);
6851573Srgrimes */
6861573Srgrimesstatic void
6871573Srgrimesp_bracket(p)
68892889Sobrienstruct parse *p;
6891573Srgrimes{
690132019Stjr	cset *cs;
691132019Stjr	wint_t ch;
6921573Srgrimes
6931573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
6941573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
6951573Srgrimes		EMIT(OBOW, 0);
6961573Srgrimes		NEXTn(6);
6971573Srgrimes		return;
6981573Srgrimes	}
6991573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
7001573Srgrimes		EMIT(OEOW, 0);
7011573Srgrimes		NEXTn(6);
7021573Srgrimes		return;
7031573Srgrimes	}
7041573Srgrimes
705132019Stjr	if ((cs = allocset(p)) == NULL)
706132019Stjr		return;
707132019Stjr
708132019Stjr	if (p->g->cflags&REG_ICASE)
709132019Stjr		cs->icase = 1;
7101573Srgrimes	if (EAT('^'))
711132019Stjr		cs->invert = 1;
7121573Srgrimes	if (EAT(']'))
713132019Stjr		CHadd(p, cs, ']');
7141573Srgrimes	else if (EAT('-'))
715132019Stjr		CHadd(p, cs, '-');
7161573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7171573Srgrimes		p_b_term(p, cs);
7181573Srgrimes	if (EAT('-'))
719132019Stjr		CHadd(p, cs, '-');
72017141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7211573Srgrimes
7221573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7231573Srgrimes		return;
7241573Srgrimes
725132019Stjr	if (cs->invert && p->g->cflags&REG_NEWLINE)
726132019Stjr		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
7271573Srgrimes
728132019Stjr	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
729132019Stjr		ordinary(p, ch);
7301573Srgrimes		freeset(p, cs);
7311573Srgrimes	} else
732132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
7331573Srgrimes}
7341573Srgrimes
7351573Srgrimes/*
7361573Srgrimes - p_b_term - parse one term of a bracketed character list
73792889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7381573Srgrimes */
7391573Srgrimesstatic void
7401573Srgrimesp_b_term(p, cs)
74192889Sobrienstruct parse *p;
74292889Sobriencset *cs;
7431573Srgrimes{
74492889Sobrien	char c;
745132019Stjr	wint_t start, finish;
746132019Stjr	wint_t i;
7471573Srgrimes
7481573Srgrimes	/* classify what we've got */
7491573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7501573Srgrimes	case '[':
7511573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7521573Srgrimes		break;
7531573Srgrimes	case '-':
7541573Srgrimes		SETERROR(REG_ERANGE);
7551573Srgrimes		return;			/* NOTE RETURN */
7561573Srgrimes		break;
7571573Srgrimes	default:
7581573Srgrimes		c = '\0';
7591573Srgrimes		break;
7601573Srgrimes	}
7611573Srgrimes
7621573Srgrimes	switch (c) {
7631573Srgrimes	case ':':		/* character class */
7641573Srgrimes		NEXT2();
76517141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7661573Srgrimes		c = PEEK();
76717141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7681573Srgrimes		p_b_cclass(p, cs);
76917141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
77017141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7711573Srgrimes		break;
7721573Srgrimes	case '=':		/* equivalence class */
7731573Srgrimes		NEXT2();
77417141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7751573Srgrimes		c = PEEK();
77617141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
7771573Srgrimes		p_b_eclass(p, cs);
77817141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
77917141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
7801573Srgrimes		break;
7811573Srgrimes	default:		/* symbol, ordinary character, or range */
7821573Srgrimes		start = p_b_symbol(p);
7831573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
7841573Srgrimes			/* range */
7851573Srgrimes			NEXT();
7861573Srgrimes			if (EAT('-'))
7871573Srgrimes				finish = '-';
7881573Srgrimes			else
7891573Srgrimes				finish = p_b_symbol(p);
7901573Srgrimes		} else
7911573Srgrimes			finish = start;
79217514Sache		if (start == finish)
793132019Stjr			CHadd(p, cs, start);
79417514Sache		else {
79524637Sache			if (__collate_load_error) {
79624637Sache				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
797132019Stjr				CHaddrange(p, cs, start, finish);
79824637Sache			} else {
79924637Sache				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
800132019Stjr				for (i = 0; i <= UCHAR_MAX; i++) {
80124637Sache					if (   __collate_range_cmp(start, i) <= 0
80224637Sache					    && __collate_range_cmp(i, finish) <= 0
80324637Sache					   )
804132019Stjr						CHadd(p, cs, i);
80524637Sache				}
80617514Sache			}
80717514Sache		}
8081573Srgrimes		break;
8091573Srgrimes	}
8101573Srgrimes}
8111573Srgrimes
8121573Srgrimes/*
8131573Srgrimes - p_b_cclass - parse a character-class name and deal with it
81492889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8151573Srgrimes */
8161573Srgrimesstatic void
8171573Srgrimesp_b_cclass(p, cs)
81892889Sobrienstruct parse *p;
81992889Sobriencset *cs;
8201573Srgrimes{
82192889Sobrien	char *sp = p->next;
82292889Sobrien	size_t len;
823132019Stjr	wctype_t wct;
824132019Stjr	char clname[16];
8251573Srgrimes
82617508Sache	while (MORE() && isalpha((uch)PEEK()))
8271573Srgrimes		NEXT();
8281573Srgrimes	len = p->next - sp;
829132019Stjr	if (len >= sizeof(clname) - 1) {
8301573Srgrimes		SETERROR(REG_ECTYPE);
8311573Srgrimes		return;
8321573Srgrimes	}
833132019Stjr	memcpy(clname, sp, len);
834132019Stjr	clname[len] = '\0';
835132019Stjr	if ((wct = wctype(clname)) == 0) {
836132019Stjr		SETERROR(REG_ECTYPE);
837132019Stjr		return;
83817508Sache	}
839132019Stjr	CHaddtype(p, cs, wct);
8401573Srgrimes}
8411573Srgrimes
8421573Srgrimes/*
8431573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
84492889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
8451573Srgrimes *
8461573Srgrimes * This implementation is incomplete. xxx
8471573Srgrimes */
8481573Srgrimesstatic void
8491573Srgrimesp_b_eclass(p, cs)
85092889Sobrienstruct parse *p;
85192889Sobriencset *cs;
8521573Srgrimes{
853132019Stjr	wint_t c;
8541573Srgrimes
8551573Srgrimes	c = p_b_coll_elem(p, '=');
856132019Stjr	CHadd(p, cs, c);
8571573Srgrimes}
8581573Srgrimes
8591573Srgrimes/*
8601573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
86192889Sobrien == static char p_b_symbol(struct parse *p);
8621573Srgrimes */
863132019Stjrstatic wint_t			/* value of symbol */
8641573Srgrimesp_b_symbol(p)
86592889Sobrienstruct parse *p;
8661573Srgrimes{
867132019Stjr	wint_t value;
8681573Srgrimes
86917141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
8701573Srgrimes	if (!EATTWO('[', '.'))
871132019Stjr		return(WGETNEXT());
8721573Srgrimes
8731573Srgrimes	/* collating symbol */
8741573Srgrimes	value = p_b_coll_elem(p, '.');
87517141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
8761573Srgrimes	return(value);
8771573Srgrimes}
8781573Srgrimes
8791573Srgrimes/*
8801573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
88192889Sobrien == static char p_b_coll_elem(struct parse *p, int endc);
8821573Srgrimes */
883132019Stjrstatic wint_t			/* value of collating element */
8841573Srgrimesp_b_coll_elem(p, endc)
88592889Sobrienstruct parse *p;
886132019Stjrwint_t endc;			/* name ended by endc,']' */
8871573Srgrimes{
88892889Sobrien	char *sp = p->next;
88992889Sobrien	struct cname *cp;
89092889Sobrien	int len;
891132019Stjr	mbstate_t mbs;
892132019Stjr	wchar_t wc;
893132019Stjr	size_t clen;
8941573Srgrimes
8951573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
8961573Srgrimes		NEXT();
8971573Srgrimes	if (!MORE()) {
8981573Srgrimes		SETERROR(REG_EBRACK);
8991573Srgrimes		return(0);
9001573Srgrimes	}
9011573Srgrimes	len = p->next - sp;
9021573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
9031573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
9041573Srgrimes			return(cp->code);	/* known name */
905132019Stjr	memset(&mbs, 0, sizeof(mbs));
906132019Stjr	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
907132019Stjr		return (wc);			/* single character */
908132019Stjr	else if (clen == (size_t)-1 || clen == (size_t)-2)
909132019Stjr		SETERROR(REG_ILLSEQ);
910132019Stjr	else
911132019Stjr		SETERROR(REG_ECOLLATE);		/* neither */
9121573Srgrimes	return(0);
9131573Srgrimes}
9141573Srgrimes
9151573Srgrimes/*
9161573Srgrimes - othercase - return the case counterpart of an alphabetic
9171573Srgrimes == static char othercase(int ch);
9181573Srgrimes */
919132019Stjrstatic wint_t			/* if no counterpart, return ch */
9201573Srgrimesothercase(ch)
921132019Stjrwint_t ch;
9221573Srgrimes{
923132019Stjr	assert(iswalpha(ch));
924132019Stjr	if (iswupper(ch))
925132019Stjr		return(towlower(ch));
926132019Stjr	else if (iswlower(ch))
927132019Stjr		return(towupper(ch));
9281573Srgrimes	else			/* peculiar, but could happen */
9291573Srgrimes		return(ch);
9301573Srgrimes}
9311573Srgrimes
9321573Srgrimes/*
9331573Srgrimes - bothcases - emit a dualcase version of a two-case character
93492889Sobrien == static void bothcases(struct parse *p, int ch);
9351573Srgrimes *
9361573Srgrimes * Boy, is this implementation ever a kludge...
9371573Srgrimes */
9381573Srgrimesstatic void
9391573Srgrimesbothcases(p, ch)
94092889Sobrienstruct parse *p;
941132019Stjrwint_t ch;
9421573Srgrimes{
94392889Sobrien	char *oldnext = p->next;
94492889Sobrien	char *oldend = p->end;
945132019Stjr	char bracket[3 + MB_LEN_MAX];
946132019Stjr	size_t n;
947132019Stjr	mbstate_t mbs;
9481573Srgrimes
9491573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
9501573Srgrimes	p->next = bracket;
951132019Stjr	memset(&mbs, 0, sizeof(mbs));
952132019Stjr	n = wcrtomb(bracket, ch, &mbs);
953132019Stjr	assert(n != (size_t)-1);
954132019Stjr	bracket[n] = ']';
955132019Stjr	bracket[n + 1] = '\0';
956132019Stjr	p->end = bracket+n+1;
9571573Srgrimes	p_bracket(p);
958132019Stjr	assert(p->next == p->end);
9591573Srgrimes	p->next = oldnext;
9601573Srgrimes	p->end = oldend;
9611573Srgrimes}
9621573Srgrimes
9631573Srgrimes/*
9641573Srgrimes - ordinary - emit an ordinary character
96592889Sobrien == static void ordinary(struct parse *p, int ch);
9661573Srgrimes */
9671573Srgrimesstatic void
9681573Srgrimesordinary(p, ch)
96992889Sobrienstruct parse *p;
970132019Stjrwint_t ch;
9711573Srgrimes{
972132019Stjr	cset *cs;
9731573Srgrimes
974132019Stjr	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
9751573Srgrimes		bothcases(p, ch);
976132019Stjr	else if ((ch & OPDMASK) == ch)
977132019Stjr		EMIT(OCHAR, ch);
978132019Stjr	else {
979132019Stjr		/*
980132019Stjr		 * Kludge: character is too big to fit into an OCHAR operand.
981132019Stjr		 * Emit a singleton set.
982132019Stjr		 */
983132019Stjr		if ((cs = allocset(p)) == NULL)
984132019Stjr			return;
985132019Stjr		CHadd(p, cs, ch);
986132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
987132019Stjr	}
9881573Srgrimes}
9891573Srgrimes
9901573Srgrimes/*
9911573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
99292889Sobrien == static void nonnewline(struct parse *p);
9931573Srgrimes *
9941573Srgrimes * Boy, is this implementation ever a kludge...
9951573Srgrimes */
9961573Srgrimesstatic void
9971573Srgrimesnonnewline(p)
99892889Sobrienstruct parse *p;
9991573Srgrimes{
100092889Sobrien	char *oldnext = p->next;
100192889Sobrien	char *oldend = p->end;
10021573Srgrimes	char bracket[4];
10031573Srgrimes
10041573Srgrimes	p->next = bracket;
10051573Srgrimes	p->end = bracket+3;
10061573Srgrimes	bracket[0] = '^';
10071573Srgrimes	bracket[1] = '\n';
10081573Srgrimes	bracket[2] = ']';
10091573Srgrimes	bracket[3] = '\0';
10101573Srgrimes	p_bracket(p);
10111573Srgrimes	assert(p->next == bracket+3);
10121573Srgrimes	p->next = oldnext;
10131573Srgrimes	p->end = oldend;
10141573Srgrimes}
10151573Srgrimes
10161573Srgrimes/*
10171573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
101892889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
10191573Srgrimes */
10201573Srgrimesstatic void
10211573Srgrimesrepeat(p, start, from, to)
102292889Sobrienstruct parse *p;
10231573Srgrimessopno start;			/* operand from here to end of strip */
10241573Srgrimesint from;			/* repeated from this number */
10251573Srgrimesint to;				/* to this number of times (maybe INFINITY) */
10261573Srgrimes{
102792889Sobrien	sopno finish = HERE();
10281573Srgrimes#	define	N	2
10291573Srgrimes#	define	INF	3
10301573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
10311573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
103292889Sobrien	sopno copy;
10331573Srgrimes
10341573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
10351573Srgrimes		return;
10361573Srgrimes
10371573Srgrimes	assert(from <= to);
10381573Srgrimes
10391573Srgrimes	switch (REP(MAP(from), MAP(to))) {
10401573Srgrimes	case REP(0, 0):			/* must be user doing this */
10411573Srgrimes		DROP(finish-start);	/* drop the operand */
10421573Srgrimes		break;
10431573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
10441573Srgrimes	case REP(0, N):			/* as x{1,n}? */
10451573Srgrimes	case REP(0, INF):		/* as x{1,}? */
10461573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10471573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
10481573Srgrimes		repeat(p, start+1, 1, to);
10491573Srgrimes		ASTERN(OOR1, start);
10501573Srgrimes		AHEAD(start);			/* ... fix it */
10511573Srgrimes		EMIT(OOR2, 0);
10521573Srgrimes		AHEAD(THERE());
10531573Srgrimes		ASTERN(O_CH, THERETHERE());
10541573Srgrimes		break;
10551573Srgrimes	case REP(1, 1):			/* trivial case */
10561573Srgrimes		/* done */
10571573Srgrimes		break;
10581573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
10591573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10601573Srgrimes		INSERT(OCH_, start);
10611573Srgrimes		ASTERN(OOR1, start);
10621573Srgrimes		AHEAD(start);
10631573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
10641573Srgrimes		AHEAD(THERE());			/* ...so fix it */
10651573Srgrimes		ASTERN(O_CH, THERETHERE());
10661573Srgrimes		copy = dupl(p, start+1, finish+1);
10671573Srgrimes		assert(copy == finish+4);
10681573Srgrimes		repeat(p, copy, 1, to-1);
10691573Srgrimes		break;
10701573Srgrimes	case REP(1, INF):		/* as x+ */
10711573Srgrimes		INSERT(OPLUS_, start);
10721573Srgrimes		ASTERN(O_PLUS, start);
10731573Srgrimes		break;
10741573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
10751573Srgrimes		copy = dupl(p, start, finish);
10761573Srgrimes		repeat(p, copy, from-1, to-1);
10771573Srgrimes		break;
10781573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
10791573Srgrimes		copy = dupl(p, start, finish);
10801573Srgrimes		repeat(p, copy, from-1, to);
10811573Srgrimes		break;
10821573Srgrimes	default:			/* "can't happen" */
10831573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
10841573Srgrimes		break;
10851573Srgrimes	}
10861573Srgrimes}
10871573Srgrimes
10881573Srgrimes/*
1089132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1090132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the
1091132019Stjr - character can't be converted. Returns the number of bytes consumed.
1092132019Stjr */
1093132019Stjrstatic wint_t
1094132019Stjrwgetnext(p)
1095132019Stjrstruct parse *p;
1096132019Stjr{
1097132019Stjr	mbstate_t mbs;
1098132019Stjr	wchar_t wc;
1099132019Stjr	size_t n;
1100132019Stjr
1101132019Stjr	memset(&mbs, 0, sizeof(mbs));
1102132019Stjr	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1103132019Stjr	if (n == (size_t)-1 || n == (size_t)-2) {
1104132019Stjr		SETERROR(REG_ILLSEQ);
1105132019Stjr		return (0);
1106132019Stjr	}
1107132019Stjr	if (n == 0)
1108132019Stjr		n = 1;
1109132019Stjr	p->next += n;
1110132019Stjr	return (wc);
1111132019Stjr}
1112132019Stjr
1113132019Stjr/*
11141573Srgrimes - seterr - set an error condition
111592889Sobrien == static int seterr(struct parse *p, int e);
11161573Srgrimes */
11171573Srgrimesstatic int			/* useless but makes type checking happy */
11181573Srgrimesseterr(p, e)
111992889Sobrienstruct parse *p;
11201573Srgrimesint e;
11211573Srgrimes{
11221573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
11231573Srgrimes		p->error = e;
11241573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
11251573Srgrimes	p->end = nuls;
11261573Srgrimes	return(0);		/* make the return value well-defined */
11271573Srgrimes}
11281573Srgrimes
11291573Srgrimes/*
11301573Srgrimes - allocset - allocate a set of characters for []
113192889Sobrien == static cset *allocset(struct parse *p);
11321573Srgrimes */
11331573Srgrimesstatic cset *
11341573Srgrimesallocset(p)
113592889Sobrienstruct parse *p;
11361573Srgrimes{
1137132019Stjr	cset *cs, *ncs;
11381573Srgrimes
1139132019Stjr	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1140132019Stjr	if (ncs == NULL) {
1141132019Stjr		SETERROR(REG_ESPACE);
1142132019Stjr		return (NULL);
11431573Srgrimes	}
1144132019Stjr	p->g->sets = ncs;
1145132019Stjr	cs = &p->g->sets[p->g->ncsets++];
1146132019Stjr	memset(cs, 0, sizeof(*cs));
11471573Srgrimes
11481573Srgrimes	return(cs);
11491573Srgrimes}
11501573Srgrimes
11511573Srgrimes/*
11521573Srgrimes - freeset - free a now-unused set
115392889Sobrien == static void freeset(struct parse *p, cset *cs);
11541573Srgrimes */
11551573Srgrimesstatic void
11561573Srgrimesfreeset(p, cs)
115792889Sobrienstruct parse *p;
115892889Sobriencset *cs;
11591573Srgrimes{
116092889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
11611573Srgrimes
1162132019Stjr	free(cs->wides);
1163132019Stjr	free(cs->ranges);
1164132019Stjr	free(cs->types);
1165132019Stjr	memset(cs, 0, sizeof(*cs));
11661573Srgrimes	if (cs == top-1)	/* recover only the easy case */
11671573Srgrimes		p->g->ncsets--;
11681573Srgrimes}
11691573Srgrimes
11701573Srgrimes/*
1171132019Stjr - singleton - Determine whether a set contains only one character,
1172132019Stjr - returning it if so, otherwise returning OUT.
11731573Srgrimes */
1174132019Stjrstatic wint_t
1175132019Stjrsingleton(cs)
117692889Sobriencset *cs;
11771573Srgrimes{
1178132019Stjr	wint_t i, s, n;
11791573Srgrimes
1180132019Stjr	for (i = n = 0; i < NC; i++)
1181132019Stjr		if (CHIN(cs, i)) {
1182132019Stjr			n++;
1183132019Stjr			s = i;
11841573Srgrimes		}
1185132019Stjr	if (n == 1)
1186132019Stjr		return (s);
1187132019Stjr	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1188132019Stjr	    cs->icase == 0)
1189132019Stjr		return (cs->wides[0]);
1190132019Stjr	/* Don't bother handling the other cases. */
1191132019Stjr	return (OUT);
1192132019Stjr}
11931573Srgrimes
1194132019Stjr/*
1195132019Stjr - CHadd - add character to character set.
1196132019Stjr */
1197132019Stjrstatic void
1198132019StjrCHadd(p, cs, ch)
1199132019Stjrstruct parse *p;
1200132019Stjrcset *cs;
1201132019Stjrwint_t ch;
1202132019Stjr{
1203134802Stjr	wint_t nch, *newwides;
1204132019Stjr	assert(ch >= 0);
1205134802Stjr	if (ch < NC)
1206132019Stjr		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1207134802Stjr	else {
1208132019Stjr		newwides = realloc(cs->wides, (cs->nwides + 1) *
1209132019Stjr		    sizeof(*cs->wides));
1210132019Stjr		if (newwides == NULL) {
1211132019Stjr			SETERROR(REG_ESPACE);
1212132019Stjr			return;
1213132019Stjr		}
1214132019Stjr		cs->wides = newwides;
1215132019Stjr		cs->wides[cs->nwides++] = ch;
12161573Srgrimes	}
1217134802Stjr	if (cs->icase) {
1218134802Stjr		if ((nch = towlower(ch)) < NC)
1219134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1220134802Stjr		if ((nch = towupper(ch)) < NC)
1221134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1222134802Stjr	}
12231573Srgrimes}
12241573Srgrimes
12251573Srgrimes/*
1226132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set.
12271573Srgrimes */
1228132019Stjrstatic void
1229132019StjrCHaddrange(p, cs, min, max)
123092889Sobrienstruct parse *p;
123192889Sobriencset *cs;
1232132019Stjrwint_t min, max;
12331573Srgrimes{
1234132019Stjr	crange *newranges;
12351573Srgrimes
1236132019Stjr	for (; min < NC && min <= max; min++)
1237132019Stjr		CHadd(p, cs, min);
1238132019Stjr	if (min >= max)
1239132019Stjr		return;
1240132019Stjr	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1241132019Stjr	    sizeof(*cs->ranges));
1242132019Stjr	if (newranges == NULL) {
1243132019Stjr		SETERROR(REG_ESPACE);
1244132019Stjr		return;
1245132019Stjr	}
1246132019Stjr	cs->ranges = newranges;
1247132019Stjr	cs->ranges[cs->nranges].min = min;
1248132019Stjr	cs->ranges[cs->nranges].min = max;
1249132019Stjr	cs->nranges++;
12501573Srgrimes}
12511573Srgrimes
12521573Srgrimes/*
1253132019Stjr - CHaddtype - add all characters of a certain type to a character set.
12541573Srgrimes */
1255132019Stjrstatic void
1256132019StjrCHaddtype(p, cs, wct)
125792889Sobrienstruct parse *p;
125892889Sobriencset *cs;
1259132019Stjrwctype_t wct;
12601573Srgrimes{
1261132019Stjr	wint_t i;
1262132019Stjr	wctype_t *newtypes;
12631573Srgrimes
1264134802Stjr	for (i = 0; i < NC; i++)
1265132019Stjr		if (iswctype(i, wct))
1266132019Stjr			CHadd(p, cs, i);
1267132019Stjr	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1268132019Stjr	    sizeof(*cs->types));
1269132019Stjr	if (newtypes == NULL) {
1270132019Stjr		SETERROR(REG_ESPACE);
1271132019Stjr		return;
1272132019Stjr	}
1273132019Stjr	cs->types = newtypes;
1274132019Stjr	cs->types[cs->ntypes++] = wct;
12751573Srgrimes}
12761573Srgrimes
12771573Srgrimes/*
12781573Srgrimes - dupl - emit a duplicate of a bunch of sops
127992889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
12801573Srgrimes */
12811573Srgrimesstatic sopno			/* start of duplicate */
12821573Srgrimesdupl(p, start, finish)
128392889Sobrienstruct parse *p;
12841573Srgrimessopno start;			/* from here */
12851573Srgrimessopno finish;			/* to this less one */
12861573Srgrimes{
128792889Sobrien	sopno ret = HERE();
128892889Sobrien	sopno len = finish - start;
12891573Srgrimes
12901573Srgrimes	assert(finish >= start);
12911573Srgrimes	if (len == 0)
12921573Srgrimes		return(ret);
12931573Srgrimes	enlarge(p, p->ssize + len);	/* this many unexpected additions */
12941573Srgrimes	assert(p->ssize >= p->slen + len);
12951573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
12961573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
12971573Srgrimes	p->slen += len;
12981573Srgrimes	return(ret);
12991573Srgrimes}
13001573Srgrimes
13011573Srgrimes/*
13021573Srgrimes - doemit - emit a strip operator
130392889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
13041573Srgrimes *
13051573Srgrimes * It might seem better to implement this as a macro with a function as
13061573Srgrimes * hard-case backup, but it's just too big and messy unless there are
13071573Srgrimes * some changes to the data structures.  Maybe later.
13081573Srgrimes */
13091573Srgrimesstatic void
13101573Srgrimesdoemit(p, op, opnd)
131192889Sobrienstruct parse *p;
13121573Srgrimessop op;
13131573Srgrimessize_t opnd;
13141573Srgrimes{
13151573Srgrimes	/* avoid making error situations worse */
13161573Srgrimes	if (p->error != 0)
13171573Srgrimes		return;
13181573Srgrimes
13191573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
13201573Srgrimes	assert(opnd < 1<<OPSHIFT);
13211573Srgrimes
13221573Srgrimes	/* deal with undersized strip */
13231573Srgrimes	if (p->slen >= p->ssize)
13241573Srgrimes		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
13251573Srgrimes	assert(p->slen < p->ssize);
13261573Srgrimes
13271573Srgrimes	/* finally, it's all reduced to the easy case */
13281573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
13291573Srgrimes}
13301573Srgrimes
13311573Srgrimes/*
13321573Srgrimes - doinsert - insert a sop into the strip
133392889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
13341573Srgrimes */
13351573Srgrimesstatic void
13361573Srgrimesdoinsert(p, op, opnd, pos)
133792889Sobrienstruct parse *p;
13381573Srgrimessop op;
13391573Srgrimessize_t opnd;
13401573Srgrimessopno pos;
13411573Srgrimes{
134292889Sobrien	sopno sn;
134392889Sobrien	sop s;
134492889Sobrien	int i;
13451573Srgrimes
13461573Srgrimes	/* avoid making error situations worse */
13471573Srgrimes	if (p->error != 0)
13481573Srgrimes		return;
13491573Srgrimes
13501573Srgrimes	sn = HERE();
13511573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
13521573Srgrimes	assert(HERE() == sn+1);
13531573Srgrimes	s = p->strip[sn];
13541573Srgrimes
13551573Srgrimes	/* adjust paren pointers */
13561573Srgrimes	assert(pos > 0);
13571573Srgrimes	for (i = 1; i < NPAREN; i++) {
13581573Srgrimes		if (p->pbegin[i] >= pos) {
13591573Srgrimes			p->pbegin[i]++;
13601573Srgrimes		}
13611573Srgrimes		if (p->pend[i] >= pos) {
13621573Srgrimes			p->pend[i]++;
13631573Srgrimes		}
13641573Srgrimes	}
13651573Srgrimes
13661573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13671573Srgrimes						(HERE()-pos-1)*sizeof(sop));
13681573Srgrimes	p->strip[pos] = s;
13691573Srgrimes}
13701573Srgrimes
13711573Srgrimes/*
13721573Srgrimes - dofwd - complete a forward reference
137392889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
13741573Srgrimes */
13751573Srgrimesstatic void
13761573Srgrimesdofwd(p, pos, value)
137792889Sobrienstruct parse *p;
137892889Sobriensopno pos;
13791573Srgrimessop value;
13801573Srgrimes{
13811573Srgrimes	/* avoid making error situations worse */
13821573Srgrimes	if (p->error != 0)
13831573Srgrimes		return;
13841573Srgrimes
13851573Srgrimes	assert(value < 1<<OPSHIFT);
13861573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
13871573Srgrimes}
13881573Srgrimes
13891573Srgrimes/*
13901573Srgrimes - enlarge - enlarge the strip
139192889Sobrien == static void enlarge(struct parse *p, sopno size);
13921573Srgrimes */
13931573Srgrimesstatic void
13941573Srgrimesenlarge(p, size)
139592889Sobrienstruct parse *p;
139692889Sobriensopno size;
13971573Srgrimes{
139892889Sobrien	sop *sp;
13991573Srgrimes
14001573Srgrimes	if (p->ssize >= size)
14011573Srgrimes		return;
14021573Srgrimes
14031573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
14041573Srgrimes	if (sp == NULL) {
14051573Srgrimes		SETERROR(REG_ESPACE);
14061573Srgrimes		return;
14071573Srgrimes	}
14081573Srgrimes	p->strip = sp;
14091573Srgrimes	p->ssize = size;
14101573Srgrimes}
14111573Srgrimes
14121573Srgrimes/*
14131573Srgrimes - stripsnug - compact the strip
141492889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
14151573Srgrimes */
14161573Srgrimesstatic void
14171573Srgrimesstripsnug(p, g)
141892889Sobrienstruct parse *p;
141992889Sobrienstruct re_guts *g;
14201573Srgrimes{
14211573Srgrimes	g->nstates = p->slen;
14221573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
14231573Srgrimes	if (g->strip == NULL) {
14241573Srgrimes		SETERROR(REG_ESPACE);
14251573Srgrimes		g->strip = p->strip;
14261573Srgrimes	}
14271573Srgrimes}
14281573Srgrimes
14291573Srgrimes/*
14301573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
143192889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
14321573Srgrimes *
14331573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
14341573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
14351573Srgrimes * of the interesting cases.
14361573Srgrimes *
14371573Srgrimes * Note that must and mlen got initialized during setup.
14381573Srgrimes */
14391573Srgrimesstatic void
14401573Srgrimesfindmust(p, g)
14411573Srgrimesstruct parse *p;
144292889Sobrienstruct re_guts *g;
14431573Srgrimes{
144492889Sobrien	sop *scan;
14451573Srgrimes	sop *start;
144692889Sobrien	sop *newstart;
144792889Sobrien	sopno newlen;
144892889Sobrien	sop s;
144992889Sobrien	char *cp;
145062391Sdcs	int offset;
1451132019Stjr	char buf[MB_LEN_MAX];
1452132019Stjr	size_t clen;
1453132019Stjr	mbstate_t mbs;
14541573Srgrimes
14551573Srgrimes	/* avoid making error situations worse */
14561573Srgrimes	if (p->error != 0)
14571573Srgrimes		return;
14581573Srgrimes
1459132019Stjr	/*
1460132019Stjr	 * It's not generally safe to do a ``char'' substring search on
1461132019Stjr	 * multibyte character strings, but it's safe for at least
1462132019Stjr	 * UTF-8 (see RFC 3629).
1463132019Stjr	 */
1464132019Stjr	if (MB_CUR_MAX > 1 &&
1465132019Stjr	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1466132019Stjr		return;
1467132019Stjr
14681573Srgrimes	/* find the longest OCHAR sequence in strip */
14691573Srgrimes	newlen = 0;
147062391Sdcs	offset = 0;
147162391Sdcs	g->moffset = 0;
14721573Srgrimes	scan = g->strip + 1;
14731573Srgrimes	do {
14741573Srgrimes		s = *scan++;
14751573Srgrimes		switch (OP(s)) {
14761573Srgrimes		case OCHAR:		/* sequence member */
1477132019Stjr			if (newlen == 0) {		/* new sequence */
1478132019Stjr				memset(&mbs, 0, sizeof(mbs));
14791573Srgrimes				newstart = scan - 1;
1480132019Stjr			}
1481132019Stjr			clen = wcrtomb(buf, OPND(s), &mbs);
1482132019Stjr			if (clen == (size_t)-1)
1483132019Stjr				goto toohard;
1484132019Stjr			newlen += clen;
14851573Srgrimes			break;
14861573Srgrimes		case OPLUS_:		/* things that don't break one */
14871573Srgrimes		case OLPAREN:
14881573Srgrimes		case ORPAREN:
14891573Srgrimes			break;
14901573Srgrimes		case OQUEST_:		/* things that must be skipped */
14911573Srgrimes		case OCH_:
1492131973Stjr			offset = altoffset(scan, offset);
14931573Srgrimes			scan--;
14941573Srgrimes			do {
14951573Srgrimes				scan += OPND(s);
14961573Srgrimes				s = *scan;
14971573Srgrimes				/* assert() interferes w debug printouts */
14981573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
14991573Srgrimes							OP(s) != OOR2) {
15001573Srgrimes					g->iflags |= BAD;
15011573Srgrimes					return;
15021573Srgrimes				}
15031573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1504102411Scharnier			/* FALLTHROUGH */
150562391Sdcs		case OBOW:		/* things that break a sequence */
150662391Sdcs		case OEOW:
150762391Sdcs		case OBOL:
150862391Sdcs		case OEOL:
150962391Sdcs		case O_QUEST:
151062391Sdcs		case O_CH:
151162391Sdcs		case OEND:
15121573Srgrimes			if (newlen > g->mlen) {		/* ends one */
15131573Srgrimes				start = newstart;
15141573Srgrimes				g->mlen = newlen;
151562391Sdcs				if (offset > -1) {
151662391Sdcs					g->moffset += offset;
151762391Sdcs					offset = newlen;
151862391Sdcs				} else
151962391Sdcs					g->moffset = offset;
152062391Sdcs			} else {
152162391Sdcs				if (offset > -1)
152262391Sdcs					offset += newlen;
15231573Srgrimes			}
15241573Srgrimes			newlen = 0;
15251573Srgrimes			break;
152662391Sdcs		case OANY:
152762391Sdcs			if (newlen > g->mlen) {		/* ends one */
152862391Sdcs				start = newstart;
152962391Sdcs				g->mlen = newlen;
153062391Sdcs				if (offset > -1) {
153162391Sdcs					g->moffset += offset;
153262391Sdcs					offset = newlen;
153362391Sdcs				} else
153462391Sdcs					g->moffset = offset;
153562391Sdcs			} else {
153662391Sdcs				if (offset > -1)
153762391Sdcs					offset += newlen;
153862391Sdcs			}
153962391Sdcs			if (offset > -1)
154062391Sdcs				offset++;
154162391Sdcs			newlen = 0;
154262391Sdcs			break;
154362391Sdcs		case OANYOF:		/* may or may not invalidate offset */
154462391Sdcs			/* First, everything as OANY */
154562391Sdcs			if (newlen > g->mlen) {		/* ends one */
154662391Sdcs				start = newstart;
154762391Sdcs				g->mlen = newlen;
154862391Sdcs				if (offset > -1) {
154962391Sdcs					g->moffset += offset;
155062391Sdcs					offset = newlen;
155162391Sdcs				} else
155262391Sdcs					g->moffset = offset;
155362391Sdcs			} else {
155462391Sdcs				if (offset > -1)
155562391Sdcs					offset += newlen;
155662391Sdcs			}
155762391Sdcs			if (offset > -1)
155862391Sdcs				offset++;
155962391Sdcs			newlen = 0;
156062391Sdcs			break;
1561132019Stjr		toohard:
156262391Sdcs		default:
156362391Sdcs			/* Anything here makes it impossible or too hard
156462391Sdcs			 * to calculate the offset -- so we give up;
156562391Sdcs			 * save the last known good offset, in case the
156662391Sdcs			 * must sequence doesn't occur later.
156762391Sdcs			 */
156862391Sdcs			if (newlen > g->mlen) {		/* ends one */
156962391Sdcs				start = newstart;
157062391Sdcs				g->mlen = newlen;
157162391Sdcs				if (offset > -1)
157262391Sdcs					g->moffset += offset;
157362391Sdcs				else
157462391Sdcs					g->moffset = offset;
157562391Sdcs			}
157662391Sdcs			offset = -1;
157762391Sdcs			newlen = 0;
157862391Sdcs			break;
15791573Srgrimes		}
15801573Srgrimes	} while (OP(s) != OEND);
15811573Srgrimes
158262391Sdcs	if (g->mlen == 0) {		/* there isn't one */
158362391Sdcs		g->moffset = -1;
15841573Srgrimes		return;
158562391Sdcs	}
15861573Srgrimes
15871573Srgrimes	/* turn it into a character string */
15881573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
15891573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
15901573Srgrimes		g->mlen = 0;
159162391Sdcs		g->moffset = -1;
15921573Srgrimes		return;
15931573Srgrimes	}
15941573Srgrimes	cp = g->must;
15951573Srgrimes	scan = start;
1596132019Stjr	memset(&mbs, 0, sizeof(mbs));
1597132019Stjr	while (cp < g->must + g->mlen) {
15981573Srgrimes		while (OP(s = *scan++) != OCHAR)
15991573Srgrimes			continue;
1600132019Stjr		clen = wcrtomb(cp, OPND(s), &mbs);
1601132019Stjr		assert(clen != (size_t)-1);
1602132019Stjr		cp += clen;
16031573Srgrimes	}
16041573Srgrimes	assert(cp == g->must + g->mlen);
16051573Srgrimes	*cp++ = '\0';		/* just on general principles */
16061573Srgrimes}
16071573Srgrimes
16081573Srgrimes/*
160962391Sdcs - altoffset - choose biggest offset among multiple choices
1610131973Stjr == static int altoffset(sop *scan, int offset);
161162391Sdcs *
161262391Sdcs * Compute, recursively if necessary, the largest offset among multiple
161362391Sdcs * re paths.
161462391Sdcs */
161562391Sdcsstatic int
1616131973Stjraltoffset(scan, offset)
161762391Sdcssop *scan;
161862391Sdcsint offset;
161962391Sdcs{
162062391Sdcs	int largest;
162162391Sdcs	int try;
162262391Sdcs	sop s;
162362391Sdcs
162462391Sdcs	/* If we gave up already on offsets, return */
162562391Sdcs	if (offset == -1)
162662391Sdcs		return -1;
162762391Sdcs
162862391Sdcs	largest = 0;
162962391Sdcs	try = 0;
163062391Sdcs	s = *scan++;
163162391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
163262391Sdcs		switch (OP(s)) {
163362391Sdcs		case OOR1:
163462391Sdcs			if (try > largest)
163562391Sdcs				largest = try;
163662391Sdcs			try = 0;
163762391Sdcs			break;
163862391Sdcs		case OQUEST_:
163962391Sdcs		case OCH_:
1640131973Stjr			try = altoffset(scan, try);
164162391Sdcs			if (try == -1)
164262391Sdcs				return -1;
164362391Sdcs			scan--;
164462391Sdcs			do {
164562391Sdcs				scan += OPND(s);
164662391Sdcs				s = *scan;
164762391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
164862391Sdcs							OP(s) != OOR2)
164962391Sdcs					return -1;
165062391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
165162855Sdcs			/* We must skip to the next position, or we'll
165262855Sdcs			 * leave altoffset() too early.
165362855Sdcs			 */
165462855Sdcs			scan++;
165562391Sdcs			break;
165662391Sdcs		case OANYOF:
165762391Sdcs		case OCHAR:
165862391Sdcs		case OANY:
165962391Sdcs			try++;
166062391Sdcs		case OBOW:
166162391Sdcs		case OEOW:
166262391Sdcs		case OLPAREN:
166362391Sdcs		case ORPAREN:
166462391Sdcs		case OOR2:
166562391Sdcs			break;
166662391Sdcs		default:
166762391Sdcs			try = -1;
166862391Sdcs			break;
166962391Sdcs		}
167062391Sdcs		if (try == -1)
167162391Sdcs			return -1;
167262391Sdcs		s = *scan++;
167362391Sdcs	}
167462391Sdcs
167562391Sdcs	if (try > largest)
167662391Sdcs		largest = try;
167762391Sdcs
167862391Sdcs	return largest+offset;
167962391Sdcs}
168062391Sdcs
168162391Sdcs/*
168262232Sdcs - computejumps - compute char jumps for BM scan
168392889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
168462232Sdcs *
168562232Sdcs * This algorithm assumes g->must exists and is has size greater than
168662232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
168762232Sdcs * Sara Baase.
168862232Sdcs *
168962232Sdcs * A char jump is the number of characters one needs to jump based on
169062232Sdcs * the value of the character from the text that was mismatched.
169162232Sdcs */
169262232Sdcsstatic void
169362232Sdcscomputejumps(p, g)
169462232Sdcsstruct parse *p;
169562232Sdcsstruct re_guts *g;
169662232Sdcs{
169762232Sdcs	int ch;
169862232Sdcs	int mindex;
169962232Sdcs
170062232Sdcs	/* Avoid making errors worse */
170162232Sdcs	if (p->error != 0)
170262232Sdcs		return;
170362232Sdcs
170462848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
170562232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
170662232Sdcs		return;
170762754Sdcs	/* Adjust for signed chars, if necessary */
170862754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
170962232Sdcs
171062232Sdcs	/* If the character does not exist in the pattern, the jump
171162232Sdcs	 * is equal to the number of characters in the pattern.
171262232Sdcs	 */
171362754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
171462232Sdcs		g->charjump[ch] = g->mlen;
171562232Sdcs
171662232Sdcs	/* If the character does exist, compute the jump that would
171762232Sdcs	 * take us to the last character in the pattern equal to it
171862232Sdcs	 * (notice that we match right to left, so that last character
171962232Sdcs	 * is the first one that would be matched).
172062232Sdcs	 */
172162232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
1722111010Snectar		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
172362232Sdcs}
172462232Sdcs
172562232Sdcs/*
172662232Sdcs - computematchjumps - compute match jumps for BM scan
172792889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
172862232Sdcs *
172962232Sdcs * This algorithm assumes g->must exists and is has size greater than
173062232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
173162232Sdcs * Sara Baase.
173262232Sdcs *
173362232Sdcs * A match jump is the number of characters one needs to advance based
173462232Sdcs * on the already-matched suffix.
173562232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
173662232Sdcs * the search algorithm works.
173762232Sdcs */
173862232Sdcsstatic void
173962232Sdcscomputematchjumps(p, g)
174062232Sdcsstruct parse *p;
174162232Sdcsstruct re_guts *g;
174262232Sdcs{
174362232Sdcs	int mindex;		/* General "must" iterator */
174462232Sdcs	int suffix;		/* Keeps track of matching suffix */
174562232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
174662232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
174762232Sdcs				 * such that i+1...mlen is a substring
174862232Sdcs				 * of k+1...k+mlen-i-1
174962232Sdcs				 */
175062232Sdcs
175162232Sdcs	/* Avoid making errors worse */
175262232Sdcs	if (p->error != 0)
175362232Sdcs		return;
175462232Sdcs
175562848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
175662232Sdcs	if (pmatches == NULL) {
175762232Sdcs		g->matchjump = NULL;
175862232Sdcs		return;
175962232Sdcs	}
176062232Sdcs
176162848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
176262232Sdcs	if (g->matchjump == NULL)	/* Not a fatal error */
176362232Sdcs		return;
176462232Sdcs
176562232Sdcs	/* Set maximum possible jump for each character in the pattern */
176662232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
176762232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
176862232Sdcs
176962232Sdcs	/* Compute pmatches[] */
177062232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
177162232Sdcs	    mindex--, suffix--) {
177262232Sdcs		pmatches[mindex] = suffix;
177362232Sdcs
177462232Sdcs		/* If a mismatch is found, interrupting the substring,
177562232Sdcs		 * compute the matchjump for that position. If no
177662232Sdcs		 * mismatch is found, then a text substring mismatched
177762232Sdcs		 * against the suffix will also mismatch against the
177862232Sdcs		 * substring.
177962232Sdcs		 */
178062232Sdcs		while (suffix < g->mlen
178162232Sdcs		    && g->must[mindex] != g->must[suffix]) {
178262232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
178362232Sdcs			    g->mlen - mindex - 1);
178462232Sdcs			suffix = pmatches[suffix];
178562232Sdcs		}
178662232Sdcs	}
178762232Sdcs
178862232Sdcs	/* Compute the matchjump up to the last substring found to jump
178962232Sdcs	 * to the beginning of the largest must pattern prefix matching
179062232Sdcs	 * it's own suffix.
179162232Sdcs	 */
179262232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
179362232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
179462232Sdcs		    g->mlen + suffix - mindex);
179562232Sdcs
179662232Sdcs        ssuffix = pmatches[suffix];
179762391Sdcs        while (suffix < g->mlen) {
179862673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
179962232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
180062232Sdcs			    g->mlen + ssuffix - suffix);
180162232Sdcs                        suffix++;
180262232Sdcs                }
180386208Sdcs		if (suffix < g->mlen)
180486208Sdcs                	ssuffix = pmatches[ssuffix];
180562232Sdcs        }
180662232Sdcs
180762232Sdcs	free(pmatches);
180862232Sdcs}
180962232Sdcs
181062232Sdcs/*
18111573Srgrimes - pluscount - count + nesting
181292889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
18131573Srgrimes */
18141573Srgrimesstatic sopno			/* nesting depth */
18151573Srgrimespluscount(p, g)
18161573Srgrimesstruct parse *p;
181792889Sobrienstruct re_guts *g;
18181573Srgrimes{
181992889Sobrien	sop *scan;
182092889Sobrien	sop s;
182192889Sobrien	sopno plusnest = 0;
182292889Sobrien	sopno maxnest = 0;
18231573Srgrimes
18241573Srgrimes	if (p->error != 0)
18251573Srgrimes		return(0);	/* there may not be an OEND */
18261573Srgrimes
18271573Srgrimes	scan = g->strip + 1;
18281573Srgrimes	do {
18291573Srgrimes		s = *scan++;
18301573Srgrimes		switch (OP(s)) {
18311573Srgrimes		case OPLUS_:
18321573Srgrimes			plusnest++;
18331573Srgrimes			break;
18341573Srgrimes		case O_PLUS:
18351573Srgrimes			if (plusnest > maxnest)
18361573Srgrimes				maxnest = plusnest;
18371573Srgrimes			plusnest--;
18381573Srgrimes			break;
18391573Srgrimes		}
18401573Srgrimes	} while (OP(s) != OEND);
18411573Srgrimes	if (plusnest != 0)
18421573Srgrimes		g->iflags |= BAD;
18431573Srgrimes	return(maxnest);
18441573Srgrimes}
1845