regcomp.c revision 227435
11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
61573Srgrimes * This code is derived from software contributed to Berkeley by
71573Srgrimes * Henry Spencer.
81573Srgrimes *
91573Srgrimes * Redistribution and use in source and binary forms, with or without
101573Srgrimes * modification, are permitted provided that the following conditions
111573Srgrimes * are met:
121573Srgrimes * 1. Redistributions of source code must retain the above copyright
131573Srgrimes *    notice, this list of conditions and the following disclaimer.
141573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
151573Srgrimes *    notice, this list of conditions and the following disclaimer in the
161573Srgrimes *    documentation and/or other materials provided with the distribution.
171573Srgrimes * 4. Neither the name of the University nor the names of its contributors
181573Srgrimes *    may be used to endorse or promote products derived from this software
191573Srgrimes *    without specific prior written permission.
201573Srgrimes *
211573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311573Srgrimes * SUCH DAMAGE.
321573Srgrimes *
331573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
341573Srgrimes */
351573Srgrimes
361573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
371573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
381573Srgrimes#endif /* LIBC_SCCS and not lint */
3992986Sobrien#include <sys/cdefs.h>
4092986Sobrien__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 227435 2011-11-11 01:35:07Z kevlo $");
411573Srgrimes
421573Srgrimes#include <sys/types.h>
431573Srgrimes#include <stdio.h>
441573Srgrimes#include <string.h>
451573Srgrimes#include <ctype.h>
461573Srgrimes#include <limits.h>
471573Srgrimes#include <stdlib.h>
481573Srgrimes#include <regex.h>
49136091Sstefanf#include <runetype.h>
50132019Stjr#include <wchar.h>
51132019Stjr#include <wctype.h>
521573Srgrimes
5319277Sache#include "collate.h"
5419277Sache
551573Srgrimes#include "utils.h"
561573Srgrimes#include "regex2.h"
571573Srgrimes
581573Srgrimes#include "cname.h"
591573Srgrimes
601573Srgrimes/*
611573Srgrimes * parse structure, passed up and down to avoid global variables and
621573Srgrimes * other clumsinesses
631573Srgrimes */
641573Srgrimesstruct parse {
651573Srgrimes	char *next;		/* next character in RE */
661573Srgrimes	char *end;		/* end of string (-> NUL normally) */
671573Srgrimes	int error;		/* has an error been seen? */
681573Srgrimes	sop *strip;		/* malloced strip */
691573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
701573Srgrimes	sopno slen;		/* malloced strip length (used) */
711573Srgrimes	int ncsalloc;		/* number of csets allocated */
721573Srgrimes	struct re_guts *g;
731573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
741573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
751573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
761573Srgrimes};
771573Srgrimes
781573Srgrimes/* ========= begin header generated by ./mkh ========= */
791573Srgrimes#ifdef __cplusplus
801573Srgrimesextern "C" {
811573Srgrimes#endif
821573Srgrimes
831573Srgrimes/* === regcomp.c === */
84227435Skevlostatic void p_ere(struct parse *p, int stop);
8592905Sobrienstatic void p_ere_exp(struct parse *p);
8692905Sobrienstatic void p_str(struct parse *p);
87227435Skevlostatic void p_bre(struct parse *p, int end1, int end2);
8892905Sobrienstatic int p_simp_re(struct parse *p, int starordinary);
8992905Sobrienstatic int p_count(struct parse *p);
9092905Sobrienstatic void p_bracket(struct parse *p);
9192905Sobrienstatic void p_b_term(struct parse *p, cset *cs);
9292905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs);
9392905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs);
94132019Stjrstatic wint_t p_b_symbol(struct parse *p);
95132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc);
96132019Stjrstatic wint_t othercase(wint_t ch);
97132019Stjrstatic void bothcases(struct parse *p, wint_t ch);
98132019Stjrstatic void ordinary(struct parse *p, wint_t ch);
9992905Sobrienstatic void nonnewline(struct parse *p);
10092905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to);
10192905Sobrienstatic int seterr(struct parse *p, int e);
10292905Sobrienstatic cset *allocset(struct parse *p);
10392905Sobrienstatic void freeset(struct parse *p, cset *cs);
104132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch);
105132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
106132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
107132019Stjrstatic wint_t singleton(cset *cs);
10892905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish);
10992905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd);
11092905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
11192905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value);
112227414Skevlostatic int enlarge(struct parse *p, sopno size);
11392905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g);
11492905Sobrienstatic void findmust(struct parse *p, struct re_guts *g);
115131973Stjrstatic int altoffset(sop *scan, int offset);
11692905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g);
11792905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g);
11892905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g);
119132019Stjrstatic wint_t wgetnext(struct parse *p);
1201573Srgrimes
1211573Srgrimes#ifdef __cplusplus
1221573Srgrimes}
1231573Srgrimes#endif
1241573Srgrimes/* ========= end header generated by ./mkh ========= */
1251573Srgrimes
1261573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1271573Srgrimes
1281573Srgrimes/*
1291573Srgrimes * macros for use with parse structure
1301573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1311573Srgrimes */
1321573Srgrimes#define	PEEK()	(*p->next)
1331573Srgrimes#define	PEEK2()	(*(p->next+1))
1341573Srgrimes#define	MORE()	(p->next < p->end)
1351573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1361573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1371573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1381573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1391573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1401573Srgrimes#define	NEXT()	(p->next++)
1411573Srgrimes#define	NEXT2()	(p->next += 2)
1421573Srgrimes#define	NEXTn(n)	(p->next += (n))
1431573Srgrimes#define	GETNEXT()	(*p->next++)
144132019Stjr#define	WGETNEXT()	wgetnext(p)
1451573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1461573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1471573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1481573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1491573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1501573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1511573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1521573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1531573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1541573Srgrimes#define	HERE()		(p->slen)
1551573Srgrimes#define	THERE()		(p->slen - 1)
1561573Srgrimes#define	THERETHERE()	(p->slen - 2)
1571573Srgrimes#define	DROP(n)	(p->slen -= (n))
1581573Srgrimes
1591573Srgrimes#ifndef NDEBUG
1601573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1611573Srgrimes#else
1621573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1631573Srgrimes#endif
1641573Srgrimes
16562232Sdcs/* Macro used by computejump()/computematchjump() */
16662232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
16762232Sdcs
1681573Srgrimes/*
1691573Srgrimes - regcomp - interface for parser and compilation
1701573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1711573Srgrimes = #define	REG_BASIC	0000
1721573Srgrimes = #define	REG_EXTENDED	0001
1731573Srgrimes = #define	REG_ICASE	0002
1741573Srgrimes = #define	REG_NOSUB	0004
1751573Srgrimes = #define	REG_NEWLINE	0010
1761573Srgrimes = #define	REG_NOSPEC	0020
1771573Srgrimes = #define	REG_PEND	0040
1781573Srgrimes = #define	REG_DUMP	0200
1791573Srgrimes */
1801573Srgrimesint				/* 0 success, otherwise REG_something */
181170528Sdelphijregcomp(regex_t * __restrict preg,
182170528Sdelphij	const char * __restrict pattern,
183170528Sdelphij	int cflags)
1841573Srgrimes{
1851573Srgrimes	struct parse pa;
18692889Sobrien	struct re_guts *g;
18792889Sobrien	struct parse *p = &pa;
18892889Sobrien	int i;
18992889Sobrien	size_t len;
1901573Srgrimes#ifdef REDEBUG
1911573Srgrimes#	define	GOODFLAGS(f)	(f)
1921573Srgrimes#else
1931573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
1941573Srgrimes#endif
1951573Srgrimes
1961573Srgrimes	cflags = GOODFLAGS(cflags);
1971573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
1981573Srgrimes		return(REG_INVARG);
1991573Srgrimes
2001573Srgrimes	if (cflags&REG_PEND) {
2011573Srgrimes		if (preg->re_endp < pattern)
2021573Srgrimes			return(REG_INVARG);
2031573Srgrimes		len = preg->re_endp - pattern;
2041573Srgrimes	} else
2051573Srgrimes		len = strlen((char *)pattern);
2061573Srgrimes
2071573Srgrimes	/* do the mallocs early so failure handling is easy */
208131973Stjr	g = (struct re_guts *)malloc(sizeof(struct re_guts));
2091573Srgrimes	if (g == NULL)
2101573Srgrimes		return(REG_ESPACE);
2111573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
2121573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2131573Srgrimes	p->slen = 0;
2141573Srgrimes	if (p->strip == NULL) {
2151573Srgrimes		free((char *)g);
2161573Srgrimes		return(REG_ESPACE);
2171573Srgrimes	}
2181573Srgrimes
2191573Srgrimes	/* set things up */
2201573Srgrimes	p->g = g;
2211573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2221573Srgrimes	p->end = p->next + len;
2231573Srgrimes	p->error = 0;
2241573Srgrimes	p->ncsalloc = 0;
2251573Srgrimes	for (i = 0; i < NPAREN; i++) {
2261573Srgrimes		p->pbegin[i] = 0;
2271573Srgrimes		p->pend[i] = 0;
2281573Srgrimes	}
2291573Srgrimes	g->sets = NULL;
2301573Srgrimes	g->ncsets = 0;
2311573Srgrimes	g->cflags = cflags;
2321573Srgrimes	g->iflags = 0;
2331573Srgrimes	g->nbol = 0;
2341573Srgrimes	g->neol = 0;
2351573Srgrimes	g->must = NULL;
23662391Sdcs	g->moffset = -1;
23762263Sdcs	g->charjump = NULL;
23862263Sdcs	g->matchjump = NULL;
2391573Srgrimes	g->mlen = 0;
2401573Srgrimes	g->nsub = 0;
2411573Srgrimes	g->backrefs = 0;
2421573Srgrimes
2431573Srgrimes	/* do it */
2441573Srgrimes	EMIT(OEND, 0);
2451573Srgrimes	g->firststate = THERE();
2461573Srgrimes	if (cflags&REG_EXTENDED)
2471573Srgrimes		p_ere(p, OUT);
2481573Srgrimes	else if (cflags&REG_NOSPEC)
2491573Srgrimes		p_str(p);
2501573Srgrimes	else
2511573Srgrimes		p_bre(p, OUT, OUT);
2521573Srgrimes	EMIT(OEND, 0);
2531573Srgrimes	g->laststate = THERE();
2541573Srgrimes
2551573Srgrimes	/* tidy up loose ends and fill things in */
2561573Srgrimes	stripsnug(p, g);
2571573Srgrimes	findmust(p, g);
25862232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
25962232Sdcs	 * than three characters
26062232Sdcs	 */
26162232Sdcs	if(g->mlen > 3) {
26262232Sdcs		computejumps(p, g);
26362232Sdcs		computematchjumps(p, g);
26462755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
26562232Sdcs			free(g->charjump);
26662232Sdcs			g->charjump = NULL;
26762232Sdcs		}
26862232Sdcs	}
2691573Srgrimes	g->nplus = pluscount(p, g);
2701573Srgrimes	g->magic = MAGIC2;
2711573Srgrimes	preg->re_nsub = g->nsub;
2721573Srgrimes	preg->re_g = g;
2731573Srgrimes	preg->re_magic = MAGIC1;
2741573Srgrimes#ifndef REDEBUG
2751573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2761573Srgrimes	if (g->iflags&BAD)
2771573Srgrimes		SETERROR(REG_ASSERT);
2781573Srgrimes#endif
2791573Srgrimes
2801573Srgrimes	/* win or lose, we're done */
2811573Srgrimes	if (p->error != 0)	/* lose */
2821573Srgrimes		regfree(preg);
2831573Srgrimes	return(p->error);
2841573Srgrimes}
2851573Srgrimes
2861573Srgrimes/*
2871573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
288227435Skevlo == static void p_ere(struct parse *p, int_t stop);
2891573Srgrimes */
2901573Srgrimesstatic void
291170528Sdelphijp_ere(struct parse *p,
292227435Skevlo	int stop)		/* character this ERE should end at */
2931573Srgrimes{
29492889Sobrien	char c;
29592889Sobrien	sopno prevback;
29692889Sobrien	sopno prevfwd;
29792889Sobrien	sopno conc;
29892889Sobrien	int first = 1;		/* is this the first alternative? */
2991573Srgrimes
3001573Srgrimes	for (;;) {
3011573Srgrimes		/* do a bunch of concatenated expressions */
3021573Srgrimes		conc = HERE();
3031573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3041573Srgrimes			p_ere_exp(p);
30517141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3061573Srgrimes
3071573Srgrimes		if (!EAT('|'))
3081573Srgrimes			break;		/* NOTE BREAK OUT */
3091573Srgrimes
3101573Srgrimes		if (first) {
3111573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3121573Srgrimes			prevfwd = conc;
3131573Srgrimes			prevback = conc;
3141573Srgrimes			first = 0;
3151573Srgrimes		}
3161573Srgrimes		ASTERN(OOR1, prevback);
3171573Srgrimes		prevback = THERE();
3181573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3191573Srgrimes		prevfwd = HERE();
3201573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3211573Srgrimes	}
3221573Srgrimes
3231573Srgrimes	if (!first) {		/* tail-end fixups */
3241573Srgrimes		AHEAD(prevfwd);
3251573Srgrimes		ASTERN(O_CH, prevback);
3261573Srgrimes	}
3271573Srgrimes
3281573Srgrimes	assert(!MORE() || SEE(stop));
3291573Srgrimes}
3301573Srgrimes
3311573Srgrimes/*
3321573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
33392889Sobrien == static void p_ere_exp(struct parse *p);
3341573Srgrimes */
3351573Srgrimesstatic void
336170528Sdelphijp_ere_exp(struct parse *p)
3371573Srgrimes{
33892889Sobrien	char c;
339132019Stjr	wint_t wc;
34092889Sobrien	sopno pos;
34192889Sobrien	int count;
34292889Sobrien	int count2;
34392889Sobrien	sopno subno;
3441573Srgrimes	int wascaret = 0;
3451573Srgrimes
3461573Srgrimes	assert(MORE());		/* caller should have ensured this */
3471573Srgrimes	c = GETNEXT();
3481573Srgrimes
3491573Srgrimes	pos = HERE();
3501573Srgrimes	switch (c) {
3511573Srgrimes	case '(':
35217141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3531573Srgrimes		p->g->nsub++;
3541573Srgrimes		subno = p->g->nsub;
3551573Srgrimes		if (subno < NPAREN)
3561573Srgrimes			p->pbegin[subno] = HERE();
3571573Srgrimes		EMIT(OLPAREN, subno);
3581573Srgrimes		if (!SEE(')'))
3591573Srgrimes			p_ere(p, ')');
3601573Srgrimes		if (subno < NPAREN) {
3611573Srgrimes			p->pend[subno] = HERE();
3621573Srgrimes			assert(p->pend[subno] != 0);
3631573Srgrimes		}
3641573Srgrimes		EMIT(ORPAREN, subno);
36517141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3661573Srgrimes		break;
3671573Srgrimes#ifndef POSIX_MISTAKE
3681573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3691573Srgrimes		/*
3701573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3711573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3721573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3731573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3741573Srgrimes		 * we can get it fixed.
3751573Srgrimes		 */
3761573Srgrimes		SETERROR(REG_EPAREN);
3771573Srgrimes		break;
3781573Srgrimes#endif
3791573Srgrimes	case '^':
3801573Srgrimes		EMIT(OBOL, 0);
3811573Srgrimes		p->g->iflags |= USEBOL;
3821573Srgrimes		p->g->nbol++;
3831573Srgrimes		wascaret = 1;
3841573Srgrimes		break;
3851573Srgrimes	case '$':
3861573Srgrimes		EMIT(OEOL, 0);
3871573Srgrimes		p->g->iflags |= USEEOL;
3881573Srgrimes		p->g->neol++;
3891573Srgrimes		break;
3901573Srgrimes	case '|':
3911573Srgrimes		SETERROR(REG_EMPTY);
3921573Srgrimes		break;
3931573Srgrimes	case '*':
3941573Srgrimes	case '+':
3951573Srgrimes	case '?':
3961573Srgrimes		SETERROR(REG_BADRPT);
3971573Srgrimes		break;
3981573Srgrimes	case '.':
3991573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4001573Srgrimes			nonnewline(p);
4011573Srgrimes		else
4021573Srgrimes			EMIT(OANY, 0);
4031573Srgrimes		break;
4041573Srgrimes	case '[':
4051573Srgrimes		p_bracket(p);
4061573Srgrimes		break;
4071573Srgrimes	case '\\':
40817141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
409132019Stjr		wc = WGETNEXT();
410132019Stjr		ordinary(p, wc);
4111573Srgrimes		break;
4121573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
41349094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4141573Srgrimes		/* FALLTHROUGH */
4151573Srgrimes	default:
416132019Stjr		p->next--;
417132019Stjr		wc = WGETNEXT();
418132019Stjr		ordinary(p, wc);
4191573Srgrimes		break;
4201573Srgrimes	}
4211573Srgrimes
4221573Srgrimes	if (!MORE())
4231573Srgrimes		return;
4241573Srgrimes	c = PEEK();
4251573Srgrimes	/* we call { a repetition if followed by a digit */
4261573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
42749094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4281573Srgrimes		return;		/* no repetition, we're done */
4291573Srgrimes	NEXT();
4301573Srgrimes
43117141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4321573Srgrimes	switch (c) {
4331573Srgrimes	case '*':	/* implemented as +? */
4341573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4351573Srgrimes		INSERT(OPLUS_, pos);
4361573Srgrimes		ASTERN(O_PLUS, pos);
4371573Srgrimes		INSERT(OQUEST_, pos);
4381573Srgrimes		ASTERN(O_QUEST, pos);
4391573Srgrimes		break;
4401573Srgrimes	case '+':
4411573Srgrimes		INSERT(OPLUS_, pos);
4421573Srgrimes		ASTERN(O_PLUS, pos);
4431573Srgrimes		break;
4441573Srgrimes	case '?':
4451573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4461573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4471573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4481573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4491573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4501573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4511573Srgrimes		ASTERN(O_CH, THERETHERE());
4521573Srgrimes		break;
4531573Srgrimes	case '{':
4541573Srgrimes		count = p_count(p);
4551573Srgrimes		if (EAT(',')) {
45649094Sache			if (isdigit((uch)PEEK())) {
4571573Srgrimes				count2 = p_count(p);
45817141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4591573Srgrimes			} else		/* single number with comma */
4601573Srgrimes				count2 = INFINITY;
4611573Srgrimes		} else		/* just a single number */
4621573Srgrimes			count2 = count;
4631573Srgrimes		repeat(p, pos, count, count2);
4641573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4651573Srgrimes			while (MORE() && PEEK() != '}')
4661573Srgrimes				NEXT();
46717141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
4681573Srgrimes			SETERROR(REG_BADBR);
4691573Srgrimes		}
4701573Srgrimes		break;
4711573Srgrimes	}
4721573Srgrimes
4731573Srgrimes	if (!MORE())
4741573Srgrimes		return;
4751573Srgrimes	c = PEEK();
4761573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
47749094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
4781573Srgrimes		return;
4791573Srgrimes	SETERROR(REG_BADRPT);
4801573Srgrimes}
4811573Srgrimes
4821573Srgrimes/*
4831573Srgrimes - p_str - string (no metacharacters) "parser"
48492889Sobrien == static void p_str(struct parse *p);
4851573Srgrimes */
4861573Srgrimesstatic void
487170528Sdelphijp_str(struct parse *p)
4881573Srgrimes{
48917141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
4901573Srgrimes	while (MORE())
491132019Stjr		ordinary(p, WGETNEXT());
4921573Srgrimes}
4931573Srgrimes
4941573Srgrimes/*
4951573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
496227435Skevlo == static void p_bre(struct parse *p,  int end1, \
497227435Skevlo ==	int end2);
4981573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
4991573Srgrimes *
5001573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
501131973Stjr * taken as an ordinary character and then revised to be an anchor.
5021573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5031573Srgrimes */
5041573Srgrimesstatic void
505170528Sdelphijp_bre(struct parse *p,
506227435Skevlo	int end1,		/* first terminating character */
507227435Skevlo	int end2)		/* second terminating character */
5081573Srgrimes{
50992889Sobrien	sopno start = HERE();
51092889Sobrien	int first = 1;			/* first subexpression? */
51192889Sobrien	int wasdollar = 0;
5121573Srgrimes
5131573Srgrimes	if (EAT('^')) {
5141573Srgrimes		EMIT(OBOL, 0);
5151573Srgrimes		p->g->iflags |= USEBOL;
5161573Srgrimes		p->g->nbol++;
5171573Srgrimes	}
5181573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5191573Srgrimes		wasdollar = p_simp_re(p, first);
5201573Srgrimes		first = 0;
5211573Srgrimes	}
5221573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5231573Srgrimes		DROP(1);
5241573Srgrimes		EMIT(OEOL, 0);
5251573Srgrimes		p->g->iflags |= USEEOL;
5261573Srgrimes		p->g->neol++;
5271573Srgrimes	}
5281573Srgrimes
52917141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5301573Srgrimes}
5311573Srgrimes
5321573Srgrimes/*
5331573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
53492889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5351573Srgrimes */
5361573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
537170528Sdelphijp_simp_re(struct parse *p,
538170528Sdelphij	int starordinary)	/* is a leading * an ordinary character? */
5391573Srgrimes{
54092889Sobrien	int c;
54192889Sobrien	int count;
54292889Sobrien	int count2;
54392889Sobrien	sopno pos;
54492889Sobrien	int i;
545132019Stjr	wint_t wc;
54692889Sobrien	sopno subno;
5471573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5481573Srgrimes
5491573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5501573Srgrimes
5511573Srgrimes	assert(MORE());		/* caller should have ensured this */
5521573Srgrimes	c = GETNEXT();
5531573Srgrimes	if (c == '\\') {
55417141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
55549094Sache		c = BACKSL | GETNEXT();
5561573Srgrimes	}
5571573Srgrimes	switch (c) {
5581573Srgrimes	case '.':
5591573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5601573Srgrimes			nonnewline(p);
5611573Srgrimes		else
5621573Srgrimes			EMIT(OANY, 0);
5631573Srgrimes		break;
5641573Srgrimes	case '[':
5651573Srgrimes		p_bracket(p);
5661573Srgrimes		break;
5671573Srgrimes	case BACKSL|'{':
5681573Srgrimes		SETERROR(REG_BADRPT);
5691573Srgrimes		break;
5701573Srgrimes	case BACKSL|'(':
5711573Srgrimes		p->g->nsub++;
5721573Srgrimes		subno = p->g->nsub;
5731573Srgrimes		if (subno < NPAREN)
5741573Srgrimes			p->pbegin[subno] = HERE();
5751573Srgrimes		EMIT(OLPAREN, subno);
5761573Srgrimes		/* the MORE here is an error heuristic */
5771573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
5781573Srgrimes			p_bre(p, '\\', ')');
5791573Srgrimes		if (subno < NPAREN) {
5801573Srgrimes			p->pend[subno] = HERE();
5811573Srgrimes			assert(p->pend[subno] != 0);
5821573Srgrimes		}
5831573Srgrimes		EMIT(ORPAREN, subno);
58417141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
5851573Srgrimes		break;
5861573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
5871573Srgrimes	case BACKSL|'}':
5881573Srgrimes		SETERROR(REG_EPAREN);
5891573Srgrimes		break;
5901573Srgrimes	case BACKSL|'1':
5911573Srgrimes	case BACKSL|'2':
5921573Srgrimes	case BACKSL|'3':
5931573Srgrimes	case BACKSL|'4':
5941573Srgrimes	case BACKSL|'5':
5951573Srgrimes	case BACKSL|'6':
5961573Srgrimes	case BACKSL|'7':
5971573Srgrimes	case BACKSL|'8':
5981573Srgrimes	case BACKSL|'9':
5991573Srgrimes		i = (c&~BACKSL) - '0';
6001573Srgrimes		assert(i < NPAREN);
6011573Srgrimes		if (p->pend[i] != 0) {
6021573Srgrimes			assert(i <= p->g->nsub);
6031573Srgrimes			EMIT(OBACK_, i);
6041573Srgrimes			assert(p->pbegin[i] != 0);
6051573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6061573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6071573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6081573Srgrimes			EMIT(O_BACK, i);
6091573Srgrimes		} else
6101573Srgrimes			SETERROR(REG_ESUBREG);
6111573Srgrimes		p->g->backrefs = 1;
6121573Srgrimes		break;
6131573Srgrimes	case '*':
61417141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6151573Srgrimes		/* FALLTHROUGH */
6161573Srgrimes	default:
617132019Stjr		p->next--;
618132019Stjr		wc = WGETNEXT();
619132019Stjr		ordinary(p, wc);
6201573Srgrimes		break;
6211573Srgrimes	}
6221573Srgrimes
6231573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6241573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6251573Srgrimes		INSERT(OPLUS_, pos);
6261573Srgrimes		ASTERN(O_PLUS, pos);
6271573Srgrimes		INSERT(OQUEST_, pos);
6281573Srgrimes		ASTERN(O_QUEST, pos);
6291573Srgrimes	} else if (EATTWO('\\', '{')) {
6301573Srgrimes		count = p_count(p);
6311573Srgrimes		if (EAT(',')) {
63249094Sache			if (MORE() && isdigit((uch)PEEK())) {
6331573Srgrimes				count2 = p_count(p);
63417141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6351573Srgrimes			} else		/* single number with comma */
6361573Srgrimes				count2 = INFINITY;
6371573Srgrimes		} else		/* just a single number */
6381573Srgrimes			count2 = count;
6391573Srgrimes		repeat(p, pos, count, count2);
6401573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6411573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6421573Srgrimes				NEXT();
64317141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6441573Srgrimes			SETERROR(REG_BADBR);
6451573Srgrimes		}
64649094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6471573Srgrimes		return(1);
6481573Srgrimes
6491573Srgrimes	return(0);
6501573Srgrimes}
6511573Srgrimes
6521573Srgrimes/*
6531573Srgrimes - p_count - parse a repetition count
65492889Sobrien == static int p_count(struct parse *p);
6551573Srgrimes */
6561573Srgrimesstatic int			/* the value */
657170528Sdelphijp_count(struct parse *p)
6581573Srgrimes{
65992889Sobrien	int count = 0;
66092889Sobrien	int ndigits = 0;
6611573Srgrimes
66249094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6631573Srgrimes		count = count*10 + (GETNEXT() - '0');
6641573Srgrimes		ndigits++;
6651573Srgrimes	}
6661573Srgrimes
66717141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6681573Srgrimes	return(count);
6691573Srgrimes}
6701573Srgrimes
6711573Srgrimes/*
6721573Srgrimes - p_bracket - parse a bracketed character list
67392889Sobrien == static void p_bracket(struct parse *p);
6741573Srgrimes */
6751573Srgrimesstatic void
676170528Sdelphijp_bracket(struct parse *p)
6771573Srgrimes{
678132019Stjr	cset *cs;
679132019Stjr	wint_t ch;
6801573Srgrimes
6811573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
6821573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
6831573Srgrimes		EMIT(OBOW, 0);
6841573Srgrimes		NEXTn(6);
6851573Srgrimes		return;
6861573Srgrimes	}
6871573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
6881573Srgrimes		EMIT(OEOW, 0);
6891573Srgrimes		NEXTn(6);
6901573Srgrimes		return;
6911573Srgrimes	}
6921573Srgrimes
693132019Stjr	if ((cs = allocset(p)) == NULL)
694132019Stjr		return;
695132019Stjr
696132019Stjr	if (p->g->cflags&REG_ICASE)
697132019Stjr		cs->icase = 1;
6981573Srgrimes	if (EAT('^'))
699132019Stjr		cs->invert = 1;
7001573Srgrimes	if (EAT(']'))
701132019Stjr		CHadd(p, cs, ']');
7021573Srgrimes	else if (EAT('-'))
703132019Stjr		CHadd(p, cs, '-');
7041573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7051573Srgrimes		p_b_term(p, cs);
7061573Srgrimes	if (EAT('-'))
707132019Stjr		CHadd(p, cs, '-');
70817141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7091573Srgrimes
7101573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7111573Srgrimes		return;
7121573Srgrimes
713132019Stjr	if (cs->invert && p->g->cflags&REG_NEWLINE)
714132019Stjr		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
7151573Srgrimes
716132019Stjr	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
717132019Stjr		ordinary(p, ch);
7181573Srgrimes		freeset(p, cs);
7191573Srgrimes	} else
720132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
7211573Srgrimes}
7221573Srgrimes
7231573Srgrimes/*
7241573Srgrimes - p_b_term - parse one term of a bracketed character list
72592889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7261573Srgrimes */
7271573Srgrimesstatic void
728170528Sdelphijp_b_term(struct parse *p, cset *cs)
7291573Srgrimes{
73092889Sobrien	char c;
731132019Stjr	wint_t start, finish;
732132019Stjr	wint_t i;
7331573Srgrimes
7341573Srgrimes	/* classify what we've got */
7351573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7361573Srgrimes	case '[':
7371573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7381573Srgrimes		break;
7391573Srgrimes	case '-':
7401573Srgrimes		SETERROR(REG_ERANGE);
7411573Srgrimes		return;			/* NOTE RETURN */
7421573Srgrimes		break;
7431573Srgrimes	default:
7441573Srgrimes		c = '\0';
7451573Srgrimes		break;
7461573Srgrimes	}
7471573Srgrimes
7481573Srgrimes	switch (c) {
7491573Srgrimes	case ':':		/* character class */
7501573Srgrimes		NEXT2();
75117141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7521573Srgrimes		c = PEEK();
75317141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7541573Srgrimes		p_b_cclass(p, cs);
75517141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
75617141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7571573Srgrimes		break;
7581573Srgrimes	case '=':		/* equivalence class */
7591573Srgrimes		NEXT2();
76017141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7611573Srgrimes		c = PEEK();
76217141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
7631573Srgrimes		p_b_eclass(p, cs);
76417141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
76517141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
7661573Srgrimes		break;
7671573Srgrimes	default:		/* symbol, ordinary character, or range */
7681573Srgrimes		start = p_b_symbol(p);
7691573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
7701573Srgrimes			/* range */
7711573Srgrimes			NEXT();
7721573Srgrimes			if (EAT('-'))
7731573Srgrimes				finish = '-';
7741573Srgrimes			else
7751573Srgrimes				finish = p_b_symbol(p);
7761573Srgrimes		} else
7771573Srgrimes			finish = start;
77817514Sache		if (start == finish)
779132019Stjr			CHadd(p, cs, start);
78017514Sache		else {
78124637Sache			if (__collate_load_error) {
78224637Sache				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
783132019Stjr				CHaddrange(p, cs, start, finish);
78424637Sache			} else {
78524637Sache				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
786132019Stjr				for (i = 0; i <= UCHAR_MAX; i++) {
78724637Sache					if (   __collate_range_cmp(start, i) <= 0
78824637Sache					    && __collate_range_cmp(i, finish) <= 0
78924637Sache					   )
790132019Stjr						CHadd(p, cs, i);
79124637Sache				}
79217514Sache			}
79317514Sache		}
7941573Srgrimes		break;
7951573Srgrimes	}
7961573Srgrimes}
7971573Srgrimes
7981573Srgrimes/*
7991573Srgrimes - p_b_cclass - parse a character-class name and deal with it
80092889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8011573Srgrimes */
8021573Srgrimesstatic void
803170528Sdelphijp_b_cclass(struct parse *p, cset *cs)
8041573Srgrimes{
80592889Sobrien	char *sp = p->next;
80692889Sobrien	size_t len;
807132019Stjr	wctype_t wct;
808132019Stjr	char clname[16];
8091573Srgrimes
81017508Sache	while (MORE() && isalpha((uch)PEEK()))
8111573Srgrimes		NEXT();
8121573Srgrimes	len = p->next - sp;
813132019Stjr	if (len >= sizeof(clname) - 1) {
8141573Srgrimes		SETERROR(REG_ECTYPE);
8151573Srgrimes		return;
8161573Srgrimes	}
817132019Stjr	memcpy(clname, sp, len);
818132019Stjr	clname[len] = '\0';
819132019Stjr	if ((wct = wctype(clname)) == 0) {
820132019Stjr		SETERROR(REG_ECTYPE);
821132019Stjr		return;
82217508Sache	}
823132019Stjr	CHaddtype(p, cs, wct);
8241573Srgrimes}
8251573Srgrimes
8261573Srgrimes/*
8271573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
82892889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
8291573Srgrimes *
8301573Srgrimes * This implementation is incomplete. xxx
8311573Srgrimes */
8321573Srgrimesstatic void
833170528Sdelphijp_b_eclass(struct parse *p, cset *cs)
8341573Srgrimes{
835132019Stjr	wint_t c;
8361573Srgrimes
8371573Srgrimes	c = p_b_coll_elem(p, '=');
838132019Stjr	CHadd(p, cs, c);
8391573Srgrimes}
8401573Srgrimes
8411573Srgrimes/*
8421573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
843227414Skevlo == static wint_t p_b_symbol(struct parse *p);
8441573Srgrimes */
845132019Stjrstatic wint_t			/* value of symbol */
846170528Sdelphijp_b_symbol(struct parse *p)
8471573Srgrimes{
848132019Stjr	wint_t value;
8491573Srgrimes
85017141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
8511573Srgrimes	if (!EATTWO('[', '.'))
852132019Stjr		return(WGETNEXT());
8531573Srgrimes
8541573Srgrimes	/* collating symbol */
8551573Srgrimes	value = p_b_coll_elem(p, '.');
85617141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
8571573Srgrimes	return(value);
8581573Srgrimes}
8591573Srgrimes
8601573Srgrimes/*
8611573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
862227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
8631573Srgrimes */
864132019Stjrstatic wint_t			/* value of collating element */
865170528Sdelphijp_b_coll_elem(struct parse *p,
866170528Sdelphij	wint_t endc)		/* name ended by endc,']' */
8671573Srgrimes{
86892889Sobrien	char *sp = p->next;
86992889Sobrien	struct cname *cp;
87092889Sobrien	int len;
871132019Stjr	mbstate_t mbs;
872132019Stjr	wchar_t wc;
873132019Stjr	size_t clen;
8741573Srgrimes
8751573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
8761573Srgrimes		NEXT();
8771573Srgrimes	if (!MORE()) {
8781573Srgrimes		SETERROR(REG_EBRACK);
8791573Srgrimes		return(0);
8801573Srgrimes	}
8811573Srgrimes	len = p->next - sp;
8821573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
8831573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
8841573Srgrimes			return(cp->code);	/* known name */
885132019Stjr	memset(&mbs, 0, sizeof(mbs));
886132019Stjr	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
887132019Stjr		return (wc);			/* single character */
888132019Stjr	else if (clen == (size_t)-1 || clen == (size_t)-2)
889132019Stjr		SETERROR(REG_ILLSEQ);
890132019Stjr	else
891132019Stjr		SETERROR(REG_ECOLLATE);		/* neither */
8921573Srgrimes	return(0);
8931573Srgrimes}
8941573Srgrimes
8951573Srgrimes/*
8961573Srgrimes - othercase - return the case counterpart of an alphabetic
897227414Skevlo == static wint_t othercase(wint_t ch);
8981573Srgrimes */
899132019Stjrstatic wint_t			/* if no counterpart, return ch */
900170528Sdelphijothercase(wint_t ch)
9011573Srgrimes{
902132019Stjr	assert(iswalpha(ch));
903132019Stjr	if (iswupper(ch))
904132019Stjr		return(towlower(ch));
905132019Stjr	else if (iswlower(ch))
906132019Stjr		return(towupper(ch));
9071573Srgrimes	else			/* peculiar, but could happen */
9081573Srgrimes		return(ch);
9091573Srgrimes}
9101573Srgrimes
9111573Srgrimes/*
9121573Srgrimes - bothcases - emit a dualcase version of a two-case character
913227414Skevlo == static void bothcases(struct parse *p, wint_t ch);
9141573Srgrimes *
9151573Srgrimes * Boy, is this implementation ever a kludge...
9161573Srgrimes */
9171573Srgrimesstatic void
918170528Sdelphijbothcases(struct parse *p, wint_t ch)
9191573Srgrimes{
92092889Sobrien	char *oldnext = p->next;
92192889Sobrien	char *oldend = p->end;
922132019Stjr	char bracket[3 + MB_LEN_MAX];
923132019Stjr	size_t n;
924132019Stjr	mbstate_t mbs;
9251573Srgrimes
9261573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
9271573Srgrimes	p->next = bracket;
928132019Stjr	memset(&mbs, 0, sizeof(mbs));
929132019Stjr	n = wcrtomb(bracket, ch, &mbs);
930132019Stjr	assert(n != (size_t)-1);
931132019Stjr	bracket[n] = ']';
932132019Stjr	bracket[n + 1] = '\0';
933132019Stjr	p->end = bracket+n+1;
9341573Srgrimes	p_bracket(p);
935132019Stjr	assert(p->next == p->end);
9361573Srgrimes	p->next = oldnext;
9371573Srgrimes	p->end = oldend;
9381573Srgrimes}
9391573Srgrimes
9401573Srgrimes/*
9411573Srgrimes - ordinary - emit an ordinary character
942227414Skevlo == static void ordinary(struct parse *p, wint_t ch);
9431573Srgrimes */
9441573Srgrimesstatic void
945170528Sdelphijordinary(struct parse *p, wint_t ch)
9461573Srgrimes{
947132019Stjr	cset *cs;
9481573Srgrimes
949132019Stjr	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
9501573Srgrimes		bothcases(p, ch);
951132019Stjr	else if ((ch & OPDMASK) == ch)
952132019Stjr		EMIT(OCHAR, ch);
953132019Stjr	else {
954132019Stjr		/*
955132019Stjr		 * Kludge: character is too big to fit into an OCHAR operand.
956132019Stjr		 * Emit a singleton set.
957132019Stjr		 */
958132019Stjr		if ((cs = allocset(p)) == NULL)
959132019Stjr			return;
960132019Stjr		CHadd(p, cs, ch);
961132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
962132019Stjr	}
9631573Srgrimes}
9641573Srgrimes
9651573Srgrimes/*
9661573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
96792889Sobrien == static void nonnewline(struct parse *p);
9681573Srgrimes *
9691573Srgrimes * Boy, is this implementation ever a kludge...
9701573Srgrimes */
9711573Srgrimesstatic void
972170528Sdelphijnonnewline(struct parse *p)
9731573Srgrimes{
97492889Sobrien	char *oldnext = p->next;
97592889Sobrien	char *oldend = p->end;
9761573Srgrimes	char bracket[4];
9771573Srgrimes
9781573Srgrimes	p->next = bracket;
9791573Srgrimes	p->end = bracket+3;
9801573Srgrimes	bracket[0] = '^';
9811573Srgrimes	bracket[1] = '\n';
9821573Srgrimes	bracket[2] = ']';
9831573Srgrimes	bracket[3] = '\0';
9841573Srgrimes	p_bracket(p);
9851573Srgrimes	assert(p->next == bracket+3);
9861573Srgrimes	p->next = oldnext;
9871573Srgrimes	p->end = oldend;
9881573Srgrimes}
9891573Srgrimes
9901573Srgrimes/*
9911573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
99292889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
9931573Srgrimes */
9941573Srgrimesstatic void
995170528Sdelphijrepeat(struct parse *p,
996170528Sdelphij	sopno start,		/* operand from here to end of strip */
997170528Sdelphij	int from,		/* repeated from this number */
998170528Sdelphij	int to)			/* to this number of times (maybe INFINITY) */
9991573Srgrimes{
100092889Sobrien	sopno finish = HERE();
10011573Srgrimes#	define	N	2
10021573Srgrimes#	define	INF	3
10031573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
10041573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
100592889Sobrien	sopno copy;
10061573Srgrimes
10071573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
10081573Srgrimes		return;
10091573Srgrimes
10101573Srgrimes	assert(from <= to);
10111573Srgrimes
10121573Srgrimes	switch (REP(MAP(from), MAP(to))) {
10131573Srgrimes	case REP(0, 0):			/* must be user doing this */
10141573Srgrimes		DROP(finish-start);	/* drop the operand */
10151573Srgrimes		break;
10161573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
10171573Srgrimes	case REP(0, N):			/* as x{1,n}? */
10181573Srgrimes	case REP(0, INF):		/* as x{1,}? */
10191573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10201573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
10211573Srgrimes		repeat(p, start+1, 1, to);
10221573Srgrimes		ASTERN(OOR1, start);
10231573Srgrimes		AHEAD(start);			/* ... fix it */
10241573Srgrimes		EMIT(OOR2, 0);
10251573Srgrimes		AHEAD(THERE());
10261573Srgrimes		ASTERN(O_CH, THERETHERE());
10271573Srgrimes		break;
10281573Srgrimes	case REP(1, 1):			/* trivial case */
10291573Srgrimes		/* done */
10301573Srgrimes		break;
10311573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
10321573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10331573Srgrimes		INSERT(OCH_, start);
10341573Srgrimes		ASTERN(OOR1, start);
10351573Srgrimes		AHEAD(start);
10361573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
10371573Srgrimes		AHEAD(THERE());			/* ...so fix it */
10381573Srgrimes		ASTERN(O_CH, THERETHERE());
10391573Srgrimes		copy = dupl(p, start+1, finish+1);
10401573Srgrimes		assert(copy == finish+4);
10411573Srgrimes		repeat(p, copy, 1, to-1);
10421573Srgrimes		break;
10431573Srgrimes	case REP(1, INF):		/* as x+ */
10441573Srgrimes		INSERT(OPLUS_, start);
10451573Srgrimes		ASTERN(O_PLUS, start);
10461573Srgrimes		break;
10471573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
10481573Srgrimes		copy = dupl(p, start, finish);
10491573Srgrimes		repeat(p, copy, from-1, to-1);
10501573Srgrimes		break;
10511573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
10521573Srgrimes		copy = dupl(p, start, finish);
10531573Srgrimes		repeat(p, copy, from-1, to);
10541573Srgrimes		break;
10551573Srgrimes	default:			/* "can't happen" */
10561573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
10571573Srgrimes		break;
10581573Srgrimes	}
10591573Srgrimes}
10601573Srgrimes
10611573Srgrimes/*
1062132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1063132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the
1064132019Stjr - character can't be converted. Returns the number of bytes consumed.
1065132019Stjr */
1066132019Stjrstatic wint_t
1067170528Sdelphijwgetnext(struct parse *p)
1068132019Stjr{
1069132019Stjr	mbstate_t mbs;
1070132019Stjr	wchar_t wc;
1071132019Stjr	size_t n;
1072132019Stjr
1073132019Stjr	memset(&mbs, 0, sizeof(mbs));
1074132019Stjr	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1075132019Stjr	if (n == (size_t)-1 || n == (size_t)-2) {
1076132019Stjr		SETERROR(REG_ILLSEQ);
1077132019Stjr		return (0);
1078132019Stjr	}
1079132019Stjr	if (n == 0)
1080132019Stjr		n = 1;
1081132019Stjr	p->next += n;
1082132019Stjr	return (wc);
1083132019Stjr}
1084132019Stjr
1085132019Stjr/*
10861573Srgrimes - seterr - set an error condition
108792889Sobrien == static int seterr(struct parse *p, int e);
10881573Srgrimes */
10891573Srgrimesstatic int			/* useless but makes type checking happy */
1090170528Sdelphijseterr(struct parse *p, int e)
10911573Srgrimes{
10921573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
10931573Srgrimes		p->error = e;
10941573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
10951573Srgrimes	p->end = nuls;
10961573Srgrimes	return(0);		/* make the return value well-defined */
10971573Srgrimes}
10981573Srgrimes
10991573Srgrimes/*
11001573Srgrimes - allocset - allocate a set of characters for []
110192889Sobrien == static cset *allocset(struct parse *p);
11021573Srgrimes */
11031573Srgrimesstatic cset *
1104170528Sdelphijallocset(struct parse *p)
11051573Srgrimes{
1106132019Stjr	cset *cs, *ncs;
11071573Srgrimes
1108132019Stjr	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1109132019Stjr	if (ncs == NULL) {
1110132019Stjr		SETERROR(REG_ESPACE);
1111132019Stjr		return (NULL);
11121573Srgrimes	}
1113132019Stjr	p->g->sets = ncs;
1114132019Stjr	cs = &p->g->sets[p->g->ncsets++];
1115132019Stjr	memset(cs, 0, sizeof(*cs));
11161573Srgrimes
11171573Srgrimes	return(cs);
11181573Srgrimes}
11191573Srgrimes
11201573Srgrimes/*
11211573Srgrimes - freeset - free a now-unused set
112292889Sobrien == static void freeset(struct parse *p, cset *cs);
11231573Srgrimes */
11241573Srgrimesstatic void
1125170528Sdelphijfreeset(struct parse *p, cset *cs)
11261573Srgrimes{
112792889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
11281573Srgrimes
1129132019Stjr	free(cs->wides);
1130132019Stjr	free(cs->ranges);
1131132019Stjr	free(cs->types);
1132132019Stjr	memset(cs, 0, sizeof(*cs));
11331573Srgrimes	if (cs == top-1)	/* recover only the easy case */
11341573Srgrimes		p->g->ncsets--;
11351573Srgrimes}
11361573Srgrimes
11371573Srgrimes/*
1138132019Stjr - singleton - Determine whether a set contains only one character,
1139132019Stjr - returning it if so, otherwise returning OUT.
11401573Srgrimes */
1141132019Stjrstatic wint_t
1142170528Sdelphijsingleton(cset *cs)
11431573Srgrimes{
1144132019Stjr	wint_t i, s, n;
11451573Srgrimes
1146132019Stjr	for (i = n = 0; i < NC; i++)
1147132019Stjr		if (CHIN(cs, i)) {
1148132019Stjr			n++;
1149132019Stjr			s = i;
11501573Srgrimes		}
1151132019Stjr	if (n == 1)
1152132019Stjr		return (s);
1153132019Stjr	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1154132019Stjr	    cs->icase == 0)
1155132019Stjr		return (cs->wides[0]);
1156132019Stjr	/* Don't bother handling the other cases. */
1157132019Stjr	return (OUT);
1158132019Stjr}
11591573Srgrimes
1160132019Stjr/*
1161132019Stjr - CHadd - add character to character set.
1162132019Stjr */
1163132019Stjrstatic void
1164170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch)
1165132019Stjr{
1166134802Stjr	wint_t nch, *newwides;
1167132019Stjr	assert(ch >= 0);
1168134802Stjr	if (ch < NC)
1169132019Stjr		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1170134802Stjr	else {
1171132019Stjr		newwides = realloc(cs->wides, (cs->nwides + 1) *
1172132019Stjr		    sizeof(*cs->wides));
1173132019Stjr		if (newwides == NULL) {
1174132019Stjr			SETERROR(REG_ESPACE);
1175132019Stjr			return;
1176132019Stjr		}
1177132019Stjr		cs->wides = newwides;
1178132019Stjr		cs->wides[cs->nwides++] = ch;
11791573Srgrimes	}
1180134802Stjr	if (cs->icase) {
1181134802Stjr		if ((nch = towlower(ch)) < NC)
1182134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1183134802Stjr		if ((nch = towupper(ch)) < NC)
1184134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1185134802Stjr	}
11861573Srgrimes}
11871573Srgrimes
11881573Srgrimes/*
1189132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set.
11901573Srgrimes */
1191132019Stjrstatic void
1192170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
11931573Srgrimes{
1194132019Stjr	crange *newranges;
11951573Srgrimes
1196132019Stjr	for (; min < NC && min <= max; min++)
1197132019Stjr		CHadd(p, cs, min);
1198132019Stjr	if (min >= max)
1199132019Stjr		return;
1200132019Stjr	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1201132019Stjr	    sizeof(*cs->ranges));
1202132019Stjr	if (newranges == NULL) {
1203132019Stjr		SETERROR(REG_ESPACE);
1204132019Stjr		return;
1205132019Stjr	}
1206132019Stjr	cs->ranges = newranges;
1207132019Stjr	cs->ranges[cs->nranges].min = min;
1208132019Stjr	cs->ranges[cs->nranges].min = max;
1209132019Stjr	cs->nranges++;
12101573Srgrimes}
12111573Srgrimes
12121573Srgrimes/*
1213132019Stjr - CHaddtype - add all characters of a certain type to a character set.
12141573Srgrimes */
1215132019Stjrstatic void
1216170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct)
12171573Srgrimes{
1218132019Stjr	wint_t i;
1219132019Stjr	wctype_t *newtypes;
12201573Srgrimes
1221134802Stjr	for (i = 0; i < NC; i++)
1222132019Stjr		if (iswctype(i, wct))
1223132019Stjr			CHadd(p, cs, i);
1224132019Stjr	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1225132019Stjr	    sizeof(*cs->types));
1226132019Stjr	if (newtypes == NULL) {
1227132019Stjr		SETERROR(REG_ESPACE);
1228132019Stjr		return;
1229132019Stjr	}
1230132019Stjr	cs->types = newtypes;
1231132019Stjr	cs->types[cs->ntypes++] = wct;
12321573Srgrimes}
12331573Srgrimes
12341573Srgrimes/*
12351573Srgrimes - dupl - emit a duplicate of a bunch of sops
123692889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
12371573Srgrimes */
12381573Srgrimesstatic sopno			/* start of duplicate */
1239170528Sdelphijdupl(struct parse *p,
1240170528Sdelphij	sopno start,		/* from here */
1241170528Sdelphij	sopno finish)		/* to this less one */
12421573Srgrimes{
124392889Sobrien	sopno ret = HERE();
124492889Sobrien	sopno len = finish - start;
12451573Srgrimes
12461573Srgrimes	assert(finish >= start);
12471573Srgrimes	if (len == 0)
12481573Srgrimes		return(ret);
1249227414Skevlo	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1250227414Skevlo		return(ret);
12511573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
12521573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
12531573Srgrimes	p->slen += len;
12541573Srgrimes	return(ret);
12551573Srgrimes}
12561573Srgrimes
12571573Srgrimes/*
12581573Srgrimes - doemit - emit a strip operator
125992889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
12601573Srgrimes *
12611573Srgrimes * It might seem better to implement this as a macro with a function as
12621573Srgrimes * hard-case backup, but it's just too big and messy unless there are
12631573Srgrimes * some changes to the data structures.  Maybe later.
12641573Srgrimes */
12651573Srgrimesstatic void
1266170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd)
12671573Srgrimes{
12681573Srgrimes	/* avoid making error situations worse */
12691573Srgrimes	if (p->error != 0)
12701573Srgrimes		return;
12711573Srgrimes
12721573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
12731573Srgrimes	assert(opnd < 1<<OPSHIFT);
12741573Srgrimes
12751573Srgrimes	/* deal with undersized strip */
12761573Srgrimes	if (p->slen >= p->ssize)
1277227414Skevlo		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1278227414Skevlo			return;
12791573Srgrimes
12801573Srgrimes	/* finally, it's all reduced to the easy case */
12811573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
12821573Srgrimes}
12831573Srgrimes
12841573Srgrimes/*
12851573Srgrimes - doinsert - insert a sop into the strip
128692889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
12871573Srgrimes */
12881573Srgrimesstatic void
1289170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos)
12901573Srgrimes{
129192889Sobrien	sopno sn;
129292889Sobrien	sop s;
129392889Sobrien	int i;
12941573Srgrimes
12951573Srgrimes	/* avoid making error situations worse */
12961573Srgrimes	if (p->error != 0)
12971573Srgrimes		return;
12981573Srgrimes
12991573Srgrimes	sn = HERE();
13001573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
13011573Srgrimes	assert(HERE() == sn+1);
13021573Srgrimes	s = p->strip[sn];
13031573Srgrimes
13041573Srgrimes	/* adjust paren pointers */
13051573Srgrimes	assert(pos > 0);
13061573Srgrimes	for (i = 1; i < NPAREN; i++) {
13071573Srgrimes		if (p->pbegin[i] >= pos) {
13081573Srgrimes			p->pbegin[i]++;
13091573Srgrimes		}
13101573Srgrimes		if (p->pend[i] >= pos) {
13111573Srgrimes			p->pend[i]++;
13121573Srgrimes		}
13131573Srgrimes	}
13141573Srgrimes
13151573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13161573Srgrimes						(HERE()-pos-1)*sizeof(sop));
13171573Srgrimes	p->strip[pos] = s;
13181573Srgrimes}
13191573Srgrimes
13201573Srgrimes/*
13211573Srgrimes - dofwd - complete a forward reference
132292889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
13231573Srgrimes */
13241573Srgrimesstatic void
1325170528Sdelphijdofwd(struct parse *p, sopno pos, sop value)
13261573Srgrimes{
13271573Srgrimes	/* avoid making error situations worse */
13281573Srgrimes	if (p->error != 0)
13291573Srgrimes		return;
13301573Srgrimes
13311573Srgrimes	assert(value < 1<<OPSHIFT);
13321573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
13331573Srgrimes}
13341573Srgrimes
13351573Srgrimes/*
13361573Srgrimes - enlarge - enlarge the strip
1337227414Skevlo == static int enlarge(struct parse *p, sopno size);
13381573Srgrimes */
1339227414Skevlostatic int
1340170528Sdelphijenlarge(struct parse *p, sopno size)
13411573Srgrimes{
134292889Sobrien	sop *sp;
13431573Srgrimes
13441573Srgrimes	if (p->ssize >= size)
1345227414Skevlo		return 1;
13461573Srgrimes
13471573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
13481573Srgrimes	if (sp == NULL) {
13491573Srgrimes		SETERROR(REG_ESPACE);
1350227414Skevlo		return 0;
13511573Srgrimes	}
13521573Srgrimes	p->strip = sp;
13531573Srgrimes	p->ssize = size;
1354227414Skevlo	return 1;
13551573Srgrimes}
13561573Srgrimes
13571573Srgrimes/*
13581573Srgrimes - stripsnug - compact the strip
135992889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
13601573Srgrimes */
13611573Srgrimesstatic void
1362170528Sdelphijstripsnug(struct parse *p, struct re_guts *g)
13631573Srgrimes{
13641573Srgrimes	g->nstates = p->slen;
13651573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
13661573Srgrimes	if (g->strip == NULL) {
13671573Srgrimes		SETERROR(REG_ESPACE);
13681573Srgrimes		g->strip = p->strip;
13691573Srgrimes	}
13701573Srgrimes}
13711573Srgrimes
13721573Srgrimes/*
13731573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
137492889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
13751573Srgrimes *
13761573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
13771573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
13781573Srgrimes * of the interesting cases.
13791573Srgrimes *
13801573Srgrimes * Note that must and mlen got initialized during setup.
13811573Srgrimes */
13821573Srgrimesstatic void
1383170528Sdelphijfindmust(struct parse *p, struct re_guts *g)
13841573Srgrimes{
138592889Sobrien	sop *scan;
13861573Srgrimes	sop *start;
138792889Sobrien	sop *newstart;
138892889Sobrien	sopno newlen;
138992889Sobrien	sop s;
139092889Sobrien	char *cp;
139162391Sdcs	int offset;
1392132019Stjr	char buf[MB_LEN_MAX];
1393132019Stjr	size_t clen;
1394132019Stjr	mbstate_t mbs;
13951573Srgrimes
13961573Srgrimes	/* avoid making error situations worse */
13971573Srgrimes	if (p->error != 0)
13981573Srgrimes		return;
13991573Srgrimes
1400132019Stjr	/*
1401132019Stjr	 * It's not generally safe to do a ``char'' substring search on
1402132019Stjr	 * multibyte character strings, but it's safe for at least
1403132019Stjr	 * UTF-8 (see RFC 3629).
1404132019Stjr	 */
1405132019Stjr	if (MB_CUR_MAX > 1 &&
1406132019Stjr	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1407132019Stjr		return;
1408132019Stjr
14091573Srgrimes	/* find the longest OCHAR sequence in strip */
14101573Srgrimes	newlen = 0;
141162391Sdcs	offset = 0;
141262391Sdcs	g->moffset = 0;
14131573Srgrimes	scan = g->strip + 1;
14141573Srgrimes	do {
14151573Srgrimes		s = *scan++;
14161573Srgrimes		switch (OP(s)) {
14171573Srgrimes		case OCHAR:		/* sequence member */
1418132019Stjr			if (newlen == 0) {		/* new sequence */
1419132019Stjr				memset(&mbs, 0, sizeof(mbs));
14201573Srgrimes				newstart = scan - 1;
1421132019Stjr			}
1422132019Stjr			clen = wcrtomb(buf, OPND(s), &mbs);
1423132019Stjr			if (clen == (size_t)-1)
1424132019Stjr				goto toohard;
1425132019Stjr			newlen += clen;
14261573Srgrimes			break;
14271573Srgrimes		case OPLUS_:		/* things that don't break one */
14281573Srgrimes		case OLPAREN:
14291573Srgrimes		case ORPAREN:
14301573Srgrimes			break;
14311573Srgrimes		case OQUEST_:		/* things that must be skipped */
14321573Srgrimes		case OCH_:
1433131973Stjr			offset = altoffset(scan, offset);
14341573Srgrimes			scan--;
14351573Srgrimes			do {
14361573Srgrimes				scan += OPND(s);
14371573Srgrimes				s = *scan;
14381573Srgrimes				/* assert() interferes w debug printouts */
14391573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
14401573Srgrimes							OP(s) != OOR2) {
14411573Srgrimes					g->iflags |= BAD;
14421573Srgrimes					return;
14431573Srgrimes				}
14441573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1445102411Scharnier			/* FALLTHROUGH */
144662391Sdcs		case OBOW:		/* things that break a sequence */
144762391Sdcs		case OEOW:
144862391Sdcs		case OBOL:
144962391Sdcs		case OEOL:
145062391Sdcs		case O_QUEST:
145162391Sdcs		case O_CH:
145262391Sdcs		case OEND:
14531573Srgrimes			if (newlen > g->mlen) {		/* ends one */
14541573Srgrimes				start = newstart;
14551573Srgrimes				g->mlen = newlen;
145662391Sdcs				if (offset > -1) {
145762391Sdcs					g->moffset += offset;
145862391Sdcs					offset = newlen;
145962391Sdcs				} else
146062391Sdcs					g->moffset = offset;
146162391Sdcs			} else {
146262391Sdcs				if (offset > -1)
146362391Sdcs					offset += newlen;
14641573Srgrimes			}
14651573Srgrimes			newlen = 0;
14661573Srgrimes			break;
146762391Sdcs		case OANY:
146862391Sdcs			if (newlen > g->mlen) {		/* ends one */
146962391Sdcs				start = newstart;
147062391Sdcs				g->mlen = newlen;
147162391Sdcs				if (offset > -1) {
147262391Sdcs					g->moffset += offset;
147362391Sdcs					offset = newlen;
147462391Sdcs				} else
147562391Sdcs					g->moffset = offset;
147662391Sdcs			} else {
147762391Sdcs				if (offset > -1)
147862391Sdcs					offset += newlen;
147962391Sdcs			}
148062391Sdcs			if (offset > -1)
148162391Sdcs				offset++;
148262391Sdcs			newlen = 0;
148362391Sdcs			break;
148462391Sdcs		case OANYOF:		/* may or may not invalidate offset */
148562391Sdcs			/* First, everything as OANY */
148662391Sdcs			if (newlen > g->mlen) {		/* ends one */
148762391Sdcs				start = newstart;
148862391Sdcs				g->mlen = newlen;
148962391Sdcs				if (offset > -1) {
149062391Sdcs					g->moffset += offset;
149162391Sdcs					offset = newlen;
149262391Sdcs				} else
149362391Sdcs					g->moffset = offset;
149462391Sdcs			} else {
149562391Sdcs				if (offset > -1)
149662391Sdcs					offset += newlen;
149762391Sdcs			}
149862391Sdcs			if (offset > -1)
149962391Sdcs				offset++;
150062391Sdcs			newlen = 0;
150162391Sdcs			break;
1502132019Stjr		toohard:
150362391Sdcs		default:
150462391Sdcs			/* Anything here makes it impossible or too hard
150562391Sdcs			 * to calculate the offset -- so we give up;
150662391Sdcs			 * save the last known good offset, in case the
150762391Sdcs			 * must sequence doesn't occur later.
150862391Sdcs			 */
150962391Sdcs			if (newlen > g->mlen) {		/* ends one */
151062391Sdcs				start = newstart;
151162391Sdcs				g->mlen = newlen;
151262391Sdcs				if (offset > -1)
151362391Sdcs					g->moffset += offset;
151462391Sdcs				else
151562391Sdcs					g->moffset = offset;
151662391Sdcs			}
151762391Sdcs			offset = -1;
151862391Sdcs			newlen = 0;
151962391Sdcs			break;
15201573Srgrimes		}
15211573Srgrimes	} while (OP(s) != OEND);
15221573Srgrimes
152362391Sdcs	if (g->mlen == 0) {		/* there isn't one */
152462391Sdcs		g->moffset = -1;
15251573Srgrimes		return;
152662391Sdcs	}
15271573Srgrimes
15281573Srgrimes	/* turn it into a character string */
15291573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
15301573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
15311573Srgrimes		g->mlen = 0;
153262391Sdcs		g->moffset = -1;
15331573Srgrimes		return;
15341573Srgrimes	}
15351573Srgrimes	cp = g->must;
15361573Srgrimes	scan = start;
1537132019Stjr	memset(&mbs, 0, sizeof(mbs));
1538132019Stjr	while (cp < g->must + g->mlen) {
15391573Srgrimes		while (OP(s = *scan++) != OCHAR)
15401573Srgrimes			continue;
1541132019Stjr		clen = wcrtomb(cp, OPND(s), &mbs);
1542132019Stjr		assert(clen != (size_t)-1);
1543132019Stjr		cp += clen;
15441573Srgrimes	}
15451573Srgrimes	assert(cp == g->must + g->mlen);
15461573Srgrimes	*cp++ = '\0';		/* just on general principles */
15471573Srgrimes}
15481573Srgrimes
15491573Srgrimes/*
155062391Sdcs - altoffset - choose biggest offset among multiple choices
1551131973Stjr == static int altoffset(sop *scan, int offset);
155262391Sdcs *
155362391Sdcs * Compute, recursively if necessary, the largest offset among multiple
155462391Sdcs * re paths.
155562391Sdcs */
155662391Sdcsstatic int
1557170528Sdelphijaltoffset(sop *scan, int offset)
155862391Sdcs{
155962391Sdcs	int largest;
156062391Sdcs	int try;
156162391Sdcs	sop s;
156262391Sdcs
156362391Sdcs	/* If we gave up already on offsets, return */
156462391Sdcs	if (offset == -1)
156562391Sdcs		return -1;
156662391Sdcs
156762391Sdcs	largest = 0;
156862391Sdcs	try = 0;
156962391Sdcs	s = *scan++;
157062391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
157162391Sdcs		switch (OP(s)) {
157262391Sdcs		case OOR1:
157362391Sdcs			if (try > largest)
157462391Sdcs				largest = try;
157562391Sdcs			try = 0;
157662391Sdcs			break;
157762391Sdcs		case OQUEST_:
157862391Sdcs		case OCH_:
1579131973Stjr			try = altoffset(scan, try);
158062391Sdcs			if (try == -1)
158162391Sdcs				return -1;
158262391Sdcs			scan--;
158362391Sdcs			do {
158462391Sdcs				scan += OPND(s);
158562391Sdcs				s = *scan;
158662391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
158762391Sdcs							OP(s) != OOR2)
158862391Sdcs					return -1;
158962391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
159062855Sdcs			/* We must skip to the next position, or we'll
159162855Sdcs			 * leave altoffset() too early.
159262855Sdcs			 */
159362855Sdcs			scan++;
159462391Sdcs			break;
159562391Sdcs		case OANYOF:
159662391Sdcs		case OCHAR:
159762391Sdcs		case OANY:
159862391Sdcs			try++;
159962391Sdcs		case OBOW:
160062391Sdcs		case OEOW:
160162391Sdcs		case OLPAREN:
160262391Sdcs		case ORPAREN:
160362391Sdcs		case OOR2:
160462391Sdcs			break;
160562391Sdcs		default:
160662391Sdcs			try = -1;
160762391Sdcs			break;
160862391Sdcs		}
160962391Sdcs		if (try == -1)
161062391Sdcs			return -1;
161162391Sdcs		s = *scan++;
161262391Sdcs	}
161362391Sdcs
161462391Sdcs	if (try > largest)
161562391Sdcs		largest = try;
161662391Sdcs
161762391Sdcs	return largest+offset;
161862391Sdcs}
161962391Sdcs
162062391Sdcs/*
162162232Sdcs - computejumps - compute char jumps for BM scan
162292889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
162362232Sdcs *
162462232Sdcs * This algorithm assumes g->must exists and is has size greater than
162562232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
162662232Sdcs * Sara Baase.
162762232Sdcs *
162862232Sdcs * A char jump is the number of characters one needs to jump based on
162962232Sdcs * the value of the character from the text that was mismatched.
163062232Sdcs */
163162232Sdcsstatic void
1632170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g)
163362232Sdcs{
163462232Sdcs	int ch;
163562232Sdcs	int mindex;
163662232Sdcs
163762232Sdcs	/* Avoid making errors worse */
163862232Sdcs	if (p->error != 0)
163962232Sdcs		return;
164062232Sdcs
164162848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
164262232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
164362232Sdcs		return;
164462754Sdcs	/* Adjust for signed chars, if necessary */
164562754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
164662232Sdcs
164762232Sdcs	/* If the character does not exist in the pattern, the jump
164862232Sdcs	 * is equal to the number of characters in the pattern.
164962232Sdcs	 */
165062754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
165162232Sdcs		g->charjump[ch] = g->mlen;
165262232Sdcs
165362232Sdcs	/* If the character does exist, compute the jump that would
165462232Sdcs	 * take us to the last character in the pattern equal to it
165562232Sdcs	 * (notice that we match right to left, so that last character
165662232Sdcs	 * is the first one that would be matched).
165762232Sdcs	 */
165862232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
1659111010Snectar		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
166062232Sdcs}
166162232Sdcs
166262232Sdcs/*
166362232Sdcs - computematchjumps - compute match jumps for BM scan
166492889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
166562232Sdcs *
166662232Sdcs * This algorithm assumes g->must exists and is has size greater than
166762232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
166862232Sdcs * Sara Baase.
166962232Sdcs *
167062232Sdcs * A match jump is the number of characters one needs to advance based
167162232Sdcs * on the already-matched suffix.
167262232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
167362232Sdcs * the search algorithm works.
167462232Sdcs */
167562232Sdcsstatic void
1676170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g)
167762232Sdcs{
167862232Sdcs	int mindex;		/* General "must" iterator */
167962232Sdcs	int suffix;		/* Keeps track of matching suffix */
168062232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
168162232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
168262232Sdcs				 * such that i+1...mlen is a substring
168362232Sdcs				 * of k+1...k+mlen-i-1
168462232Sdcs				 */
168562232Sdcs
168662232Sdcs	/* Avoid making errors worse */
168762232Sdcs	if (p->error != 0)
168862232Sdcs		return;
168962232Sdcs
169062848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
169162232Sdcs	if (pmatches == NULL) {
169262232Sdcs		g->matchjump = NULL;
169362232Sdcs		return;
169462232Sdcs	}
169562232Sdcs
169662848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
169762232Sdcs	if (g->matchjump == NULL)	/* Not a fatal error */
169862232Sdcs		return;
169962232Sdcs
170062232Sdcs	/* Set maximum possible jump for each character in the pattern */
170162232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
170262232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
170362232Sdcs
170462232Sdcs	/* Compute pmatches[] */
170562232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
170662232Sdcs	    mindex--, suffix--) {
170762232Sdcs		pmatches[mindex] = suffix;
170862232Sdcs
170962232Sdcs		/* If a mismatch is found, interrupting the substring,
171062232Sdcs		 * compute the matchjump for that position. If no
171162232Sdcs		 * mismatch is found, then a text substring mismatched
171262232Sdcs		 * against the suffix will also mismatch against the
171362232Sdcs		 * substring.
171462232Sdcs		 */
171562232Sdcs		while (suffix < g->mlen
171662232Sdcs		    && g->must[mindex] != g->must[suffix]) {
171762232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
171862232Sdcs			    g->mlen - mindex - 1);
171962232Sdcs			suffix = pmatches[suffix];
172062232Sdcs		}
172162232Sdcs	}
172262232Sdcs
172362232Sdcs	/* Compute the matchjump up to the last substring found to jump
172462232Sdcs	 * to the beginning of the largest must pattern prefix matching
172562232Sdcs	 * it's own suffix.
172662232Sdcs	 */
172762232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
172862232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
172962232Sdcs		    g->mlen + suffix - mindex);
173062232Sdcs
173162232Sdcs        ssuffix = pmatches[suffix];
173262391Sdcs        while (suffix < g->mlen) {
173362673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
173462232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
173562232Sdcs			    g->mlen + ssuffix - suffix);
173662232Sdcs                        suffix++;
173762232Sdcs                }
173886208Sdcs		if (suffix < g->mlen)
173986208Sdcs                	ssuffix = pmatches[ssuffix];
174062232Sdcs        }
174162232Sdcs
174262232Sdcs	free(pmatches);
174362232Sdcs}
174462232Sdcs
174562232Sdcs/*
17461573Srgrimes - pluscount - count + nesting
174792889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
17481573Srgrimes */
17491573Srgrimesstatic sopno			/* nesting depth */
1750170528Sdelphijpluscount(struct parse *p, struct re_guts *g)
17511573Srgrimes{
175292889Sobrien	sop *scan;
175392889Sobrien	sop s;
175492889Sobrien	sopno plusnest = 0;
175592889Sobrien	sopno maxnest = 0;
17561573Srgrimes
17571573Srgrimes	if (p->error != 0)
17581573Srgrimes		return(0);	/* there may not be an OEND */
17591573Srgrimes
17601573Srgrimes	scan = g->strip + 1;
17611573Srgrimes	do {
17621573Srgrimes		s = *scan++;
17631573Srgrimes		switch (OP(s)) {
17641573Srgrimes		case OPLUS_:
17651573Srgrimes			plusnest++;
17661573Srgrimes			break;
17671573Srgrimes		case O_PLUS:
17681573Srgrimes			if (plusnest > maxnest)
17691573Srgrimes				maxnest = plusnest;
17701573Srgrimes			plusnest--;
17711573Srgrimes			break;
17721573Srgrimes		}
17731573Srgrimes	} while (OP(s) != OEND);
17741573Srgrimes	if (plusnest != 0)
17751573Srgrimes		g->iflags |= BAD;
17761573Srgrimes	return(maxnest);
17771573Srgrimes}
1778