11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1992, 1993, 1994 Henry Spencer.
31573Srgrimes * Copyright (c) 1992, 1993, 1994
41573Srgrimes *	The Regents of the University of California.  All rights reserved.
51573Srgrimes *
6227753Stheraven * Copyright (c) 2011 The FreeBSD Foundation
7227753Stheraven * All rights reserved.
8227753Stheraven * Portions of this software were developed by David Chisnall
9227753Stheraven * under sponsorship from the FreeBSD Foundation.
10227753Stheraven *
111573Srgrimes * This code is derived from software contributed to Berkeley by
121573Srgrimes * Henry Spencer.
131573Srgrimes *
141573Srgrimes * Redistribution and use in source and binary forms, with or without
151573Srgrimes * modification, are permitted provided that the following conditions
161573Srgrimes * are met:
171573Srgrimes * 1. Redistributions of source code must retain the above copyright
181573Srgrimes *    notice, this list of conditions and the following disclaimer.
191573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
201573Srgrimes *    notice, this list of conditions and the following disclaimer in the
211573Srgrimes *    documentation and/or other materials provided with the distribution.
221573Srgrimes * 4. Neither the name of the University nor the names of its contributors
231573Srgrimes *    may be used to endorse or promote products derived from this software
241573Srgrimes *    without specific prior written permission.
251573Srgrimes *
261573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
271573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
281573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
291573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
301573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
311573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
321573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
331573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
341573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
351573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
361573Srgrimes * SUCH DAMAGE.
371573Srgrimes *
381573Srgrimes *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
391573Srgrimes */
401573Srgrimes
411573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
421573Srgrimesstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
431573Srgrimes#endif /* LIBC_SCCS and not lint */
4492986Sobrien#include <sys/cdefs.h>
4592986Sobrien__FBSDID("$FreeBSD$");
461573Srgrimes
471573Srgrimes#include <sys/types.h>
481573Srgrimes#include <stdio.h>
491573Srgrimes#include <string.h>
501573Srgrimes#include <ctype.h>
511573Srgrimes#include <limits.h>
521573Srgrimes#include <stdlib.h>
531573Srgrimes#include <regex.h>
54136091Sstefanf#include <runetype.h>
55132019Stjr#include <wchar.h>
56132019Stjr#include <wctype.h>
571573Srgrimes
5819277Sache#include "collate.h"
5919277Sache
601573Srgrimes#include "utils.h"
611573Srgrimes#include "regex2.h"
621573Srgrimes
631573Srgrimes#include "cname.h"
641573Srgrimes
651573Srgrimes/*
661573Srgrimes * parse structure, passed up and down to avoid global variables and
671573Srgrimes * other clumsinesses
681573Srgrimes */
691573Srgrimesstruct parse {
701573Srgrimes	char *next;		/* next character in RE */
711573Srgrimes	char *end;		/* end of string (-> NUL normally) */
721573Srgrimes	int error;		/* has an error been seen? */
731573Srgrimes	sop *strip;		/* malloced strip */
741573Srgrimes	sopno ssize;		/* malloced strip size (allocated) */
751573Srgrimes	sopno slen;		/* malloced strip length (used) */
761573Srgrimes	int ncsalloc;		/* number of csets allocated */
771573Srgrimes	struct re_guts *g;
781573Srgrimes#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
791573Srgrimes	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
801573Srgrimes	sopno pend[NPAREN];	/* -> ) ([0] unused) */
811573Srgrimes};
821573Srgrimes
831573Srgrimes/* ========= begin header generated by ./mkh ========= */
841573Srgrimes#ifdef __cplusplus
851573Srgrimesextern "C" {
861573Srgrimes#endif
871573Srgrimes
881573Srgrimes/* === regcomp.c === */
89227435Skevlostatic void p_ere(struct parse *p, int stop);
9092905Sobrienstatic void p_ere_exp(struct parse *p);
9192905Sobrienstatic void p_str(struct parse *p);
92227435Skevlostatic void p_bre(struct parse *p, int end1, int end2);
9392905Sobrienstatic int p_simp_re(struct parse *p, int starordinary);
9492905Sobrienstatic int p_count(struct parse *p);
9592905Sobrienstatic void p_bracket(struct parse *p);
9692905Sobrienstatic void p_b_term(struct parse *p, cset *cs);
9792905Sobrienstatic void p_b_cclass(struct parse *p, cset *cs);
9892905Sobrienstatic void p_b_eclass(struct parse *p, cset *cs);
99132019Stjrstatic wint_t p_b_symbol(struct parse *p);
100132019Stjrstatic wint_t p_b_coll_elem(struct parse *p, wint_t endc);
101132019Stjrstatic wint_t othercase(wint_t ch);
102132019Stjrstatic void bothcases(struct parse *p, wint_t ch);
103132019Stjrstatic void ordinary(struct parse *p, wint_t ch);
10492905Sobrienstatic void nonnewline(struct parse *p);
10592905Sobrienstatic void repeat(struct parse *p, sopno start, int from, int to);
10692905Sobrienstatic int seterr(struct parse *p, int e);
10792905Sobrienstatic cset *allocset(struct parse *p);
10892905Sobrienstatic void freeset(struct parse *p, cset *cs);
109132019Stjrstatic void CHadd(struct parse *p, cset *cs, wint_t ch);
110132019Stjrstatic void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
111132019Stjrstatic void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
112132019Stjrstatic wint_t singleton(cset *cs);
11392905Sobrienstatic sopno dupl(struct parse *p, sopno start, sopno finish);
11492905Sobrienstatic void doemit(struct parse *p, sop op, size_t opnd);
11592905Sobrienstatic void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
11692905Sobrienstatic void dofwd(struct parse *p, sopno pos, sop value);
117227414Skevlostatic int enlarge(struct parse *p, sopno size);
11892905Sobrienstatic void stripsnug(struct parse *p, struct re_guts *g);
11992905Sobrienstatic void findmust(struct parse *p, struct re_guts *g);
120131973Stjrstatic int altoffset(sop *scan, int offset);
12192905Sobrienstatic void computejumps(struct parse *p, struct re_guts *g);
12292905Sobrienstatic void computematchjumps(struct parse *p, struct re_guts *g);
12392905Sobrienstatic sopno pluscount(struct parse *p, struct re_guts *g);
124132019Stjrstatic wint_t wgetnext(struct parse *p);
1251573Srgrimes
1261573Srgrimes#ifdef __cplusplus
1271573Srgrimes}
1281573Srgrimes#endif
1291573Srgrimes/* ========= end header generated by ./mkh ========= */
1301573Srgrimes
1311573Srgrimesstatic char nuls[10];		/* place to point scanner in event of error */
1321573Srgrimes
1331573Srgrimes/*
1341573Srgrimes * macros for use with parse structure
1351573Srgrimes * BEWARE:  these know that the parse structure is named `p' !!!
1361573Srgrimes */
1371573Srgrimes#define	PEEK()	(*p->next)
1381573Srgrimes#define	PEEK2()	(*(p->next+1))
1391573Srgrimes#define	MORE()	(p->next < p->end)
1401573Srgrimes#define	MORE2()	(p->next+1 < p->end)
1411573Srgrimes#define	SEE(c)	(MORE() && PEEK() == (c))
1421573Srgrimes#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1431573Srgrimes#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
1441573Srgrimes#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1451573Srgrimes#define	NEXT()	(p->next++)
1461573Srgrimes#define	NEXT2()	(p->next += 2)
1471573Srgrimes#define	NEXTn(n)	(p->next += (n))
1481573Srgrimes#define	GETNEXT()	(*p->next++)
149132019Stjr#define	WGETNEXT()	wgetnext(p)
1501573Srgrimes#define	SETERROR(e)	seterr(p, (e))
1511573Srgrimes#define	REQUIRE(co, e)	((co) || SETERROR(e))
1521573Srgrimes#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
1531573Srgrimes#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
1541573Srgrimes#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
1551573Srgrimes#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
1561573Srgrimes#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1571573Srgrimes#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
1581573Srgrimes#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
1591573Srgrimes#define	HERE()		(p->slen)
1601573Srgrimes#define	THERE()		(p->slen - 1)
1611573Srgrimes#define	THERETHERE()	(p->slen - 2)
1621573Srgrimes#define	DROP(n)	(p->slen -= (n))
1631573Srgrimes
1641573Srgrimes#ifndef NDEBUG
1651573Srgrimesstatic int never = 0;		/* for use in asserts; shuts lint up */
1661573Srgrimes#else
1671573Srgrimes#define	never	0		/* some <assert.h>s have bugs too */
1681573Srgrimes#endif
1691573Srgrimes
17062232Sdcs/* Macro used by computejump()/computematchjump() */
17162232Sdcs#define MIN(a,b)	((a)<(b)?(a):(b))
17262232Sdcs
1731573Srgrimes/*
1741573Srgrimes - regcomp - interface for parser and compilation
1751573Srgrimes = extern int regcomp(regex_t *, const char *, int);
1761573Srgrimes = #define	REG_BASIC	0000
1771573Srgrimes = #define	REG_EXTENDED	0001
1781573Srgrimes = #define	REG_ICASE	0002
1791573Srgrimes = #define	REG_NOSUB	0004
1801573Srgrimes = #define	REG_NEWLINE	0010
1811573Srgrimes = #define	REG_NOSPEC	0020
1821573Srgrimes = #define	REG_PEND	0040
1831573Srgrimes = #define	REG_DUMP	0200
1841573Srgrimes */
1851573Srgrimesint				/* 0 success, otherwise REG_something */
186170528Sdelphijregcomp(regex_t * __restrict preg,
187170528Sdelphij	const char * __restrict pattern,
188170528Sdelphij	int cflags)
1891573Srgrimes{
1901573Srgrimes	struct parse pa;
19192889Sobrien	struct re_guts *g;
19292889Sobrien	struct parse *p = &pa;
19392889Sobrien	int i;
19492889Sobrien	size_t len;
1951573Srgrimes#ifdef REDEBUG
1961573Srgrimes#	define	GOODFLAGS(f)	(f)
1971573Srgrimes#else
1981573Srgrimes#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
1991573Srgrimes#endif
2001573Srgrimes
2011573Srgrimes	cflags = GOODFLAGS(cflags);
2021573Srgrimes	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
2031573Srgrimes		return(REG_INVARG);
2041573Srgrimes
2051573Srgrimes	if (cflags&REG_PEND) {
2061573Srgrimes		if (preg->re_endp < pattern)
2071573Srgrimes			return(REG_INVARG);
2081573Srgrimes		len = preg->re_endp - pattern;
2091573Srgrimes	} else
2101573Srgrimes		len = strlen((char *)pattern);
2111573Srgrimes
2121573Srgrimes	/* do the mallocs early so failure handling is easy */
213131973Stjr	g = (struct re_guts *)malloc(sizeof(struct re_guts));
2141573Srgrimes	if (g == NULL)
2151573Srgrimes		return(REG_ESPACE);
2161573Srgrimes	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
2171573Srgrimes	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
2181573Srgrimes	p->slen = 0;
2191573Srgrimes	if (p->strip == NULL) {
2201573Srgrimes		free((char *)g);
2211573Srgrimes		return(REG_ESPACE);
2221573Srgrimes	}
2231573Srgrimes
2241573Srgrimes	/* set things up */
2251573Srgrimes	p->g = g;
2261573Srgrimes	p->next = (char *)pattern;	/* convenience; we do not modify it */
2271573Srgrimes	p->end = p->next + len;
2281573Srgrimes	p->error = 0;
2291573Srgrimes	p->ncsalloc = 0;
2301573Srgrimes	for (i = 0; i < NPAREN; i++) {
2311573Srgrimes		p->pbegin[i] = 0;
2321573Srgrimes		p->pend[i] = 0;
2331573Srgrimes	}
2341573Srgrimes	g->sets = NULL;
2351573Srgrimes	g->ncsets = 0;
2361573Srgrimes	g->cflags = cflags;
2371573Srgrimes	g->iflags = 0;
2381573Srgrimes	g->nbol = 0;
2391573Srgrimes	g->neol = 0;
2401573Srgrimes	g->must = NULL;
24162391Sdcs	g->moffset = -1;
24262263Sdcs	g->charjump = NULL;
24362263Sdcs	g->matchjump = NULL;
2441573Srgrimes	g->mlen = 0;
2451573Srgrimes	g->nsub = 0;
2461573Srgrimes	g->backrefs = 0;
2471573Srgrimes
2481573Srgrimes	/* do it */
2491573Srgrimes	EMIT(OEND, 0);
2501573Srgrimes	g->firststate = THERE();
2511573Srgrimes	if (cflags&REG_EXTENDED)
2521573Srgrimes		p_ere(p, OUT);
2531573Srgrimes	else if (cflags&REG_NOSPEC)
2541573Srgrimes		p_str(p);
2551573Srgrimes	else
2561573Srgrimes		p_bre(p, OUT, OUT);
2571573Srgrimes	EMIT(OEND, 0);
2581573Srgrimes	g->laststate = THERE();
2591573Srgrimes
2601573Srgrimes	/* tidy up loose ends and fill things in */
2611573Srgrimes	stripsnug(p, g);
2621573Srgrimes	findmust(p, g);
26362232Sdcs	/* only use Boyer-Moore algorithm if the pattern is bigger
26462232Sdcs	 * than three characters
26562232Sdcs	 */
26662232Sdcs	if(g->mlen > 3) {
26762232Sdcs		computejumps(p, g);
26862232Sdcs		computematchjumps(p, g);
26962755Sdcs		if(g->matchjump == NULL && g->charjump != NULL) {
27062232Sdcs			free(g->charjump);
27162232Sdcs			g->charjump = NULL;
27262232Sdcs		}
27362232Sdcs	}
2741573Srgrimes	g->nplus = pluscount(p, g);
2751573Srgrimes	g->magic = MAGIC2;
2761573Srgrimes	preg->re_nsub = g->nsub;
2771573Srgrimes	preg->re_g = g;
2781573Srgrimes	preg->re_magic = MAGIC1;
2791573Srgrimes#ifndef REDEBUG
2801573Srgrimes	/* not debugging, so can't rely on the assert() in regexec() */
2811573Srgrimes	if (g->iflags&BAD)
2821573Srgrimes		SETERROR(REG_ASSERT);
2831573Srgrimes#endif
2841573Srgrimes
2851573Srgrimes	/* win or lose, we're done */
2861573Srgrimes	if (p->error != 0)	/* lose */
2871573Srgrimes		regfree(preg);
2881573Srgrimes	return(p->error);
2891573Srgrimes}
2901573Srgrimes
2911573Srgrimes/*
2921573Srgrimes - p_ere - ERE parser top level, concatenation and alternation
293227435Skevlo == static void p_ere(struct parse *p, int_t stop);
2941573Srgrimes */
2951573Srgrimesstatic void
296170528Sdelphijp_ere(struct parse *p,
297227435Skevlo	int stop)		/* character this ERE should end at */
2981573Srgrimes{
29992889Sobrien	char c;
30092889Sobrien	sopno prevback;
30192889Sobrien	sopno prevfwd;
30292889Sobrien	sopno conc;
30392889Sobrien	int first = 1;		/* is this the first alternative? */
3041573Srgrimes
3051573Srgrimes	for (;;) {
3061573Srgrimes		/* do a bunch of concatenated expressions */
3071573Srgrimes		conc = HERE();
3081573Srgrimes		while (MORE() && (c = PEEK()) != '|' && c != stop)
3091573Srgrimes			p_ere_exp(p);
31017141Sjkh		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
3111573Srgrimes
3121573Srgrimes		if (!EAT('|'))
3131573Srgrimes			break;		/* NOTE BREAK OUT */
3141573Srgrimes
3151573Srgrimes		if (first) {
3161573Srgrimes			INSERT(OCH_, conc);	/* offset is wrong */
3171573Srgrimes			prevfwd = conc;
3181573Srgrimes			prevback = conc;
3191573Srgrimes			first = 0;
3201573Srgrimes		}
3211573Srgrimes		ASTERN(OOR1, prevback);
3221573Srgrimes		prevback = THERE();
3231573Srgrimes		AHEAD(prevfwd);			/* fix previous offset */
3241573Srgrimes		prevfwd = HERE();
3251573Srgrimes		EMIT(OOR2, 0);			/* offset is very wrong */
3261573Srgrimes	}
3271573Srgrimes
3281573Srgrimes	if (!first) {		/* tail-end fixups */
3291573Srgrimes		AHEAD(prevfwd);
3301573Srgrimes		ASTERN(O_CH, prevback);
3311573Srgrimes	}
3321573Srgrimes
3331573Srgrimes	assert(!MORE() || SEE(stop));
3341573Srgrimes}
3351573Srgrimes
3361573Srgrimes/*
3371573Srgrimes - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
33892889Sobrien == static void p_ere_exp(struct parse *p);
3391573Srgrimes */
3401573Srgrimesstatic void
341170528Sdelphijp_ere_exp(struct parse *p)
3421573Srgrimes{
34392889Sobrien	char c;
344132019Stjr	wint_t wc;
34592889Sobrien	sopno pos;
34692889Sobrien	int count;
34792889Sobrien	int count2;
34892889Sobrien	sopno subno;
3491573Srgrimes	int wascaret = 0;
3501573Srgrimes
3511573Srgrimes	assert(MORE());		/* caller should have ensured this */
3521573Srgrimes	c = GETNEXT();
3531573Srgrimes
3541573Srgrimes	pos = HERE();
3551573Srgrimes	switch (c) {
3561573Srgrimes	case '(':
35717141Sjkh		(void)REQUIRE(MORE(), REG_EPAREN);
3581573Srgrimes		p->g->nsub++;
3591573Srgrimes		subno = p->g->nsub;
3601573Srgrimes		if (subno < NPAREN)
3611573Srgrimes			p->pbegin[subno] = HERE();
3621573Srgrimes		EMIT(OLPAREN, subno);
3631573Srgrimes		if (!SEE(')'))
3641573Srgrimes			p_ere(p, ')');
3651573Srgrimes		if (subno < NPAREN) {
3661573Srgrimes			p->pend[subno] = HERE();
3671573Srgrimes			assert(p->pend[subno] != 0);
3681573Srgrimes		}
3691573Srgrimes		EMIT(ORPAREN, subno);
37017141Sjkh		(void)MUSTEAT(')', REG_EPAREN);
3711573Srgrimes		break;
3721573Srgrimes#ifndef POSIX_MISTAKE
3731573Srgrimes	case ')':		/* happens only if no current unmatched ( */
3741573Srgrimes		/*
3751573Srgrimes		 * You may ask, why the ifndef?  Because I didn't notice
3761573Srgrimes		 * this until slightly too late for 1003.2, and none of the
3771573Srgrimes		 * other 1003.2 regular-expression reviewers noticed it at
3781573Srgrimes		 * all.  So an unmatched ) is legal POSIX, at least until
3791573Srgrimes		 * we can get it fixed.
3801573Srgrimes		 */
3811573Srgrimes		SETERROR(REG_EPAREN);
3821573Srgrimes		break;
3831573Srgrimes#endif
3841573Srgrimes	case '^':
3851573Srgrimes		EMIT(OBOL, 0);
3861573Srgrimes		p->g->iflags |= USEBOL;
3871573Srgrimes		p->g->nbol++;
3881573Srgrimes		wascaret = 1;
3891573Srgrimes		break;
3901573Srgrimes	case '$':
3911573Srgrimes		EMIT(OEOL, 0);
3921573Srgrimes		p->g->iflags |= USEEOL;
3931573Srgrimes		p->g->neol++;
3941573Srgrimes		break;
3951573Srgrimes	case '|':
3961573Srgrimes		SETERROR(REG_EMPTY);
3971573Srgrimes		break;
3981573Srgrimes	case '*':
3991573Srgrimes	case '+':
4001573Srgrimes	case '?':
4011573Srgrimes		SETERROR(REG_BADRPT);
4021573Srgrimes		break;
4031573Srgrimes	case '.':
4041573Srgrimes		if (p->g->cflags&REG_NEWLINE)
4051573Srgrimes			nonnewline(p);
4061573Srgrimes		else
4071573Srgrimes			EMIT(OANY, 0);
4081573Srgrimes		break;
4091573Srgrimes	case '[':
4101573Srgrimes		p_bracket(p);
4111573Srgrimes		break;
4121573Srgrimes	case '\\':
41317141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
414132019Stjr		wc = WGETNEXT();
415132019Stjr		ordinary(p, wc);
4161573Srgrimes		break;
4171573Srgrimes	case '{':		/* okay as ordinary except if digit follows */
41849094Sache		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4191573Srgrimes		/* FALLTHROUGH */
4201573Srgrimes	default:
421132019Stjr		p->next--;
422132019Stjr		wc = WGETNEXT();
423132019Stjr		ordinary(p, wc);
4241573Srgrimes		break;
4251573Srgrimes	}
4261573Srgrimes
4271573Srgrimes	if (!MORE())
4281573Srgrimes		return;
4291573Srgrimes	c = PEEK();
4301573Srgrimes	/* we call { a repetition if followed by a digit */
4311573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
43249094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
4331573Srgrimes		return;		/* no repetition, we're done */
4341573Srgrimes	NEXT();
4351573Srgrimes
43617141Sjkh	(void)REQUIRE(!wascaret, REG_BADRPT);
4371573Srgrimes	switch (c) {
4381573Srgrimes	case '*':	/* implemented as +? */
4391573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
4401573Srgrimes		INSERT(OPLUS_, pos);
4411573Srgrimes		ASTERN(O_PLUS, pos);
4421573Srgrimes		INSERT(OQUEST_, pos);
4431573Srgrimes		ASTERN(O_QUEST, pos);
4441573Srgrimes		break;
4451573Srgrimes	case '+':
4461573Srgrimes		INSERT(OPLUS_, pos);
4471573Srgrimes		ASTERN(O_PLUS, pos);
4481573Srgrimes		break;
4491573Srgrimes	case '?':
4501573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4511573Srgrimes		INSERT(OCH_, pos);		/* offset slightly wrong */
4521573Srgrimes		ASTERN(OOR1, pos);		/* this one's right */
4531573Srgrimes		AHEAD(pos);			/* fix the OCH_ */
4541573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
4551573Srgrimes		AHEAD(THERE());			/* ...so fix it */
4561573Srgrimes		ASTERN(O_CH, THERETHERE());
4571573Srgrimes		break;
4581573Srgrimes	case '{':
4591573Srgrimes		count = p_count(p);
4601573Srgrimes		if (EAT(',')) {
46149094Sache			if (isdigit((uch)PEEK())) {
4621573Srgrimes				count2 = p_count(p);
46317141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
4641573Srgrimes			} else		/* single number with comma */
4651573Srgrimes				count2 = INFINITY;
4661573Srgrimes		} else		/* just a single number */
4671573Srgrimes			count2 = count;
4681573Srgrimes		repeat(p, pos, count, count2);
4691573Srgrimes		if (!EAT('}')) {	/* error heuristics */
4701573Srgrimes			while (MORE() && PEEK() != '}')
4711573Srgrimes				NEXT();
47217141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
4731573Srgrimes			SETERROR(REG_BADBR);
4741573Srgrimes		}
4751573Srgrimes		break;
4761573Srgrimes	}
4771573Srgrimes
4781573Srgrimes	if (!MORE())
4791573Srgrimes		return;
4801573Srgrimes	c = PEEK();
4811573Srgrimes	if (!( c == '*' || c == '+' || c == '?' ||
48249094Sache				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
4831573Srgrimes		return;
4841573Srgrimes	SETERROR(REG_BADRPT);
4851573Srgrimes}
4861573Srgrimes
4871573Srgrimes/*
4881573Srgrimes - p_str - string (no metacharacters) "parser"
48992889Sobrien == static void p_str(struct parse *p);
4901573Srgrimes */
4911573Srgrimesstatic void
492170528Sdelphijp_str(struct parse *p)
4931573Srgrimes{
49417141Sjkh	(void)REQUIRE(MORE(), REG_EMPTY);
4951573Srgrimes	while (MORE())
496132019Stjr		ordinary(p, WGETNEXT());
4971573Srgrimes}
4981573Srgrimes
4991573Srgrimes/*
5001573Srgrimes - p_bre - BRE parser top level, anchoring and concatenation
501227435Skevlo == static void p_bre(struct parse *p,  int end1, \
502227435Skevlo ==	int end2);
5031573Srgrimes * Giving end1 as OUT essentially eliminates the end1/end2 check.
5041573Srgrimes *
5051573Srgrimes * This implementation is a bit of a kludge, in that a trailing $ is first
506131973Stjr * taken as an ordinary character and then revised to be an anchor.
5071573Srgrimes * The amount of lookahead needed to avoid this kludge is excessive.
5081573Srgrimes */
5091573Srgrimesstatic void
510170528Sdelphijp_bre(struct parse *p,
511227435Skevlo	int end1,		/* first terminating character */
512227435Skevlo	int end2)		/* second terminating character */
5131573Srgrimes{
51492889Sobrien	sopno start = HERE();
51592889Sobrien	int first = 1;			/* first subexpression? */
51692889Sobrien	int wasdollar = 0;
5171573Srgrimes
5181573Srgrimes	if (EAT('^')) {
5191573Srgrimes		EMIT(OBOL, 0);
5201573Srgrimes		p->g->iflags |= USEBOL;
5211573Srgrimes		p->g->nbol++;
5221573Srgrimes	}
5231573Srgrimes	while (MORE() && !SEETWO(end1, end2)) {
5241573Srgrimes		wasdollar = p_simp_re(p, first);
5251573Srgrimes		first = 0;
5261573Srgrimes	}
5271573Srgrimes	if (wasdollar) {	/* oops, that was a trailing anchor */
5281573Srgrimes		DROP(1);
5291573Srgrimes		EMIT(OEOL, 0);
5301573Srgrimes		p->g->iflags |= USEEOL;
5311573Srgrimes		p->g->neol++;
5321573Srgrimes	}
5331573Srgrimes
53417141Sjkh	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
5351573Srgrimes}
5361573Srgrimes
5371573Srgrimes/*
5381573Srgrimes - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
53992889Sobrien == static int p_simp_re(struct parse *p, int starordinary);
5401573Srgrimes */
5411573Srgrimesstatic int			/* was the simple RE an unbackslashed $? */
542170528Sdelphijp_simp_re(struct parse *p,
543170528Sdelphij	int starordinary)	/* is a leading * an ordinary character? */
5441573Srgrimes{
54592889Sobrien	int c;
54692889Sobrien	int count;
54792889Sobrien	int count2;
54892889Sobrien	sopno pos;
54992889Sobrien	int i;
550132019Stjr	wint_t wc;
55192889Sobrien	sopno subno;
5521573Srgrimes#	define	BACKSL	(1<<CHAR_BIT)
5531573Srgrimes
5541573Srgrimes	pos = HERE();		/* repetion op, if any, covers from here */
5551573Srgrimes
5561573Srgrimes	assert(MORE());		/* caller should have ensured this */
5571573Srgrimes	c = GETNEXT();
5581573Srgrimes	if (c == '\\') {
55917141Sjkh		(void)REQUIRE(MORE(), REG_EESCAPE);
56049094Sache		c = BACKSL | GETNEXT();
5611573Srgrimes	}
5621573Srgrimes	switch (c) {
5631573Srgrimes	case '.':
5641573Srgrimes		if (p->g->cflags&REG_NEWLINE)
5651573Srgrimes			nonnewline(p);
5661573Srgrimes		else
5671573Srgrimes			EMIT(OANY, 0);
5681573Srgrimes		break;
5691573Srgrimes	case '[':
5701573Srgrimes		p_bracket(p);
5711573Srgrimes		break;
5721573Srgrimes	case BACKSL|'{':
5731573Srgrimes		SETERROR(REG_BADRPT);
5741573Srgrimes		break;
5751573Srgrimes	case BACKSL|'(':
5761573Srgrimes		p->g->nsub++;
5771573Srgrimes		subno = p->g->nsub;
5781573Srgrimes		if (subno < NPAREN)
5791573Srgrimes			p->pbegin[subno] = HERE();
5801573Srgrimes		EMIT(OLPAREN, subno);
5811573Srgrimes		/* the MORE here is an error heuristic */
5821573Srgrimes		if (MORE() && !SEETWO('\\', ')'))
5831573Srgrimes			p_bre(p, '\\', ')');
5841573Srgrimes		if (subno < NPAREN) {
5851573Srgrimes			p->pend[subno] = HERE();
5861573Srgrimes			assert(p->pend[subno] != 0);
5871573Srgrimes		}
5881573Srgrimes		EMIT(ORPAREN, subno);
58917141Sjkh		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
5901573Srgrimes		break;
5911573Srgrimes	case BACKSL|')':	/* should not get here -- must be user */
5921573Srgrimes	case BACKSL|'}':
5931573Srgrimes		SETERROR(REG_EPAREN);
5941573Srgrimes		break;
5951573Srgrimes	case BACKSL|'1':
5961573Srgrimes	case BACKSL|'2':
5971573Srgrimes	case BACKSL|'3':
5981573Srgrimes	case BACKSL|'4':
5991573Srgrimes	case BACKSL|'5':
6001573Srgrimes	case BACKSL|'6':
6011573Srgrimes	case BACKSL|'7':
6021573Srgrimes	case BACKSL|'8':
6031573Srgrimes	case BACKSL|'9':
6041573Srgrimes		i = (c&~BACKSL) - '0';
6051573Srgrimes		assert(i < NPAREN);
6061573Srgrimes		if (p->pend[i] != 0) {
6071573Srgrimes			assert(i <= p->g->nsub);
6081573Srgrimes			EMIT(OBACK_, i);
6091573Srgrimes			assert(p->pbegin[i] != 0);
6101573Srgrimes			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6111573Srgrimes			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6121573Srgrimes			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6131573Srgrimes			EMIT(O_BACK, i);
6141573Srgrimes		} else
6151573Srgrimes			SETERROR(REG_ESUBREG);
6161573Srgrimes		p->g->backrefs = 1;
6171573Srgrimes		break;
6181573Srgrimes	case '*':
61917141Sjkh		(void)REQUIRE(starordinary, REG_BADRPT);
6201573Srgrimes		/* FALLTHROUGH */
6211573Srgrimes	default:
622132019Stjr		p->next--;
623132019Stjr		wc = WGETNEXT();
624132019Stjr		ordinary(p, wc);
6251573Srgrimes		break;
6261573Srgrimes	}
6271573Srgrimes
6281573Srgrimes	if (EAT('*')) {		/* implemented as +? */
6291573Srgrimes		/* this case does not require the (y|) trick, noKLUDGE */
6301573Srgrimes		INSERT(OPLUS_, pos);
6311573Srgrimes		ASTERN(O_PLUS, pos);
6321573Srgrimes		INSERT(OQUEST_, pos);
6331573Srgrimes		ASTERN(O_QUEST, pos);
6341573Srgrimes	} else if (EATTWO('\\', '{')) {
6351573Srgrimes		count = p_count(p);
6361573Srgrimes		if (EAT(',')) {
63749094Sache			if (MORE() && isdigit((uch)PEEK())) {
6381573Srgrimes				count2 = p_count(p);
63917141Sjkh				(void)REQUIRE(count <= count2, REG_BADBR);
6401573Srgrimes			} else		/* single number with comma */
6411573Srgrimes				count2 = INFINITY;
6421573Srgrimes		} else		/* just a single number */
6431573Srgrimes			count2 = count;
6441573Srgrimes		repeat(p, pos, count, count2);
6451573Srgrimes		if (!EATTWO('\\', '}')) {	/* error heuristics */
6461573Srgrimes			while (MORE() && !SEETWO('\\', '}'))
6471573Srgrimes				NEXT();
64817141Sjkh			(void)REQUIRE(MORE(), REG_EBRACE);
6491573Srgrimes			SETERROR(REG_BADBR);
6501573Srgrimes		}
65149094Sache	} else if (c == '$')     /* $ (but not \$) ends it */
6521573Srgrimes		return(1);
6531573Srgrimes
6541573Srgrimes	return(0);
6551573Srgrimes}
6561573Srgrimes
6571573Srgrimes/*
6581573Srgrimes - p_count - parse a repetition count
65992889Sobrien == static int p_count(struct parse *p);
6601573Srgrimes */
6611573Srgrimesstatic int			/* the value */
662170528Sdelphijp_count(struct parse *p)
6631573Srgrimes{
66492889Sobrien	int count = 0;
66592889Sobrien	int ndigits = 0;
6661573Srgrimes
66749094Sache	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6681573Srgrimes		count = count*10 + (GETNEXT() - '0');
6691573Srgrimes		ndigits++;
6701573Srgrimes	}
6711573Srgrimes
67217141Sjkh	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6731573Srgrimes	return(count);
6741573Srgrimes}
6751573Srgrimes
6761573Srgrimes/*
6771573Srgrimes - p_bracket - parse a bracketed character list
67892889Sobrien == static void p_bracket(struct parse *p);
6791573Srgrimes */
6801573Srgrimesstatic void
681170528Sdelphijp_bracket(struct parse *p)
6821573Srgrimes{
683132019Stjr	cset *cs;
684132019Stjr	wint_t ch;
6851573Srgrimes
6861573Srgrimes	/* Dept of Truly Sickening Special-Case Kludges */
6871573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
6881573Srgrimes		EMIT(OBOW, 0);
6891573Srgrimes		NEXTn(6);
6901573Srgrimes		return;
6911573Srgrimes	}
6921573Srgrimes	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
6931573Srgrimes		EMIT(OEOW, 0);
6941573Srgrimes		NEXTn(6);
6951573Srgrimes		return;
6961573Srgrimes	}
6971573Srgrimes
698132019Stjr	if ((cs = allocset(p)) == NULL)
699132019Stjr		return;
700132019Stjr
701132019Stjr	if (p->g->cflags&REG_ICASE)
702132019Stjr		cs->icase = 1;
7031573Srgrimes	if (EAT('^'))
704132019Stjr		cs->invert = 1;
7051573Srgrimes	if (EAT(']'))
706132019Stjr		CHadd(p, cs, ']');
7071573Srgrimes	else if (EAT('-'))
708132019Stjr		CHadd(p, cs, '-');
7091573Srgrimes	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7101573Srgrimes		p_b_term(p, cs);
7111573Srgrimes	if (EAT('-'))
712132019Stjr		CHadd(p, cs, '-');
71317141Sjkh	(void)MUSTEAT(']', REG_EBRACK);
7141573Srgrimes
7151573Srgrimes	if (p->error != 0)	/* don't mess things up further */
7161573Srgrimes		return;
7171573Srgrimes
718132019Stjr	if (cs->invert && p->g->cflags&REG_NEWLINE)
719132019Stjr		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
7201573Srgrimes
721132019Stjr	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
722132019Stjr		ordinary(p, ch);
7231573Srgrimes		freeset(p, cs);
7241573Srgrimes	} else
725132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
7261573Srgrimes}
7271573Srgrimes
7281573Srgrimes/*
7291573Srgrimes - p_b_term - parse one term of a bracketed character list
73092889Sobrien == static void p_b_term(struct parse *p, cset *cs);
7311573Srgrimes */
7321573Srgrimesstatic void
733170528Sdelphijp_b_term(struct parse *p, cset *cs)
7341573Srgrimes{
73592889Sobrien	char c;
736132019Stjr	wint_t start, finish;
737132019Stjr	wint_t i;
738227753Stheraven	struct xlocale_collate *table =
739227753Stheraven		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
7401573Srgrimes
7411573Srgrimes	/* classify what we've got */
7421573Srgrimes	switch ((MORE()) ? PEEK() : '\0') {
7431573Srgrimes	case '[':
7441573Srgrimes		c = (MORE2()) ? PEEK2() : '\0';
7451573Srgrimes		break;
7461573Srgrimes	case '-':
7471573Srgrimes		SETERROR(REG_ERANGE);
7481573Srgrimes		return;			/* NOTE RETURN */
7491573Srgrimes		break;
7501573Srgrimes	default:
7511573Srgrimes		c = '\0';
7521573Srgrimes		break;
7531573Srgrimes	}
7541573Srgrimes
7551573Srgrimes	switch (c) {
7561573Srgrimes	case ':':		/* character class */
7571573Srgrimes		NEXT2();
75817141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7591573Srgrimes		c = PEEK();
76017141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7611573Srgrimes		p_b_cclass(p, cs);
76217141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
76317141Sjkh		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7641573Srgrimes		break;
7651573Srgrimes	case '=':		/* equivalence class */
7661573Srgrimes		NEXT2();
76717141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
7681573Srgrimes		c = PEEK();
76917141Sjkh		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
7701573Srgrimes		p_b_eclass(p, cs);
77117141Sjkh		(void)REQUIRE(MORE(), REG_EBRACK);
77217141Sjkh		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
7731573Srgrimes		break;
7741573Srgrimes	default:		/* symbol, ordinary character, or range */
7751573Srgrimes		start = p_b_symbol(p);
7761573Srgrimes		if (SEE('-') && MORE2() && PEEK2() != ']') {
7771573Srgrimes			/* range */
7781573Srgrimes			NEXT();
7791573Srgrimes			if (EAT('-'))
7801573Srgrimes				finish = '-';
7811573Srgrimes			else
7821573Srgrimes				finish = p_b_symbol(p);
7831573Srgrimes		} else
7841573Srgrimes			finish = start;
78517514Sache		if (start == finish)
786132019Stjr			CHadd(p, cs, start);
78717514Sache		else {
788227753Stheraven			if (table->__collate_load_error) {
78924637Sache				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
790132019Stjr				CHaddrange(p, cs, start, finish);
79124637Sache			} else {
792227753Stheraven				(void)REQUIRE(__collate_range_cmp(table, start, finish) <= 0, REG_ERANGE);
793132019Stjr				for (i = 0; i <= UCHAR_MAX; i++) {
794227753Stheraven					if (   __collate_range_cmp(table, start, i) <= 0
795227753Stheraven					    && __collate_range_cmp(table, i, finish) <= 0
79624637Sache					   )
797132019Stjr						CHadd(p, cs, i);
79824637Sache				}
79917514Sache			}
80017514Sache		}
8011573Srgrimes		break;
8021573Srgrimes	}
8031573Srgrimes}
8041573Srgrimes
8051573Srgrimes/*
8061573Srgrimes - p_b_cclass - parse a character-class name and deal with it
80792889Sobrien == static void p_b_cclass(struct parse *p, cset *cs);
8081573Srgrimes */
8091573Srgrimesstatic void
810170528Sdelphijp_b_cclass(struct parse *p, cset *cs)
8111573Srgrimes{
81292889Sobrien	char *sp = p->next;
81392889Sobrien	size_t len;
814132019Stjr	wctype_t wct;
815132019Stjr	char clname[16];
8161573Srgrimes
81717508Sache	while (MORE() && isalpha((uch)PEEK()))
8181573Srgrimes		NEXT();
8191573Srgrimes	len = p->next - sp;
820132019Stjr	if (len >= sizeof(clname) - 1) {
8211573Srgrimes		SETERROR(REG_ECTYPE);
8221573Srgrimes		return;
8231573Srgrimes	}
824132019Stjr	memcpy(clname, sp, len);
825132019Stjr	clname[len] = '\0';
826132019Stjr	if ((wct = wctype(clname)) == 0) {
827132019Stjr		SETERROR(REG_ECTYPE);
828132019Stjr		return;
82917508Sache	}
830132019Stjr	CHaddtype(p, cs, wct);
8311573Srgrimes}
8321573Srgrimes
8331573Srgrimes/*
8341573Srgrimes - p_b_eclass - parse an equivalence-class name and deal with it
83592889Sobrien == static void p_b_eclass(struct parse *p, cset *cs);
8361573Srgrimes *
8371573Srgrimes * This implementation is incomplete. xxx
8381573Srgrimes */
8391573Srgrimesstatic void
840170528Sdelphijp_b_eclass(struct parse *p, cset *cs)
8411573Srgrimes{
842132019Stjr	wint_t c;
8431573Srgrimes
8441573Srgrimes	c = p_b_coll_elem(p, '=');
845132019Stjr	CHadd(p, cs, c);
8461573Srgrimes}
8471573Srgrimes
8481573Srgrimes/*
8491573Srgrimes - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
850227414Skevlo == static wint_t p_b_symbol(struct parse *p);
8511573Srgrimes */
852132019Stjrstatic wint_t			/* value of symbol */
853170528Sdelphijp_b_symbol(struct parse *p)
8541573Srgrimes{
855132019Stjr	wint_t value;
8561573Srgrimes
85717141Sjkh	(void)REQUIRE(MORE(), REG_EBRACK);
8581573Srgrimes	if (!EATTWO('[', '.'))
859132019Stjr		return(WGETNEXT());
8601573Srgrimes
8611573Srgrimes	/* collating symbol */
8621573Srgrimes	value = p_b_coll_elem(p, '.');
86317141Sjkh	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
8641573Srgrimes	return(value);
8651573Srgrimes}
8661573Srgrimes
8671573Srgrimes/*
8681573Srgrimes - p_b_coll_elem - parse a collating-element name and look it up
869227414Skevlo == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
8701573Srgrimes */
871132019Stjrstatic wint_t			/* value of collating element */
872170528Sdelphijp_b_coll_elem(struct parse *p,
873170528Sdelphij	wint_t endc)		/* name ended by endc,']' */
8741573Srgrimes{
87592889Sobrien	char *sp = p->next;
87692889Sobrien	struct cname *cp;
87792889Sobrien	int len;
878132019Stjr	mbstate_t mbs;
879132019Stjr	wchar_t wc;
880132019Stjr	size_t clen;
8811573Srgrimes
8821573Srgrimes	while (MORE() && !SEETWO(endc, ']'))
8831573Srgrimes		NEXT();
8841573Srgrimes	if (!MORE()) {
8851573Srgrimes		SETERROR(REG_EBRACK);
8861573Srgrimes		return(0);
8871573Srgrimes	}
8881573Srgrimes	len = p->next - sp;
8891573Srgrimes	for (cp = cnames; cp->name != NULL; cp++)
8901573Srgrimes		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
8911573Srgrimes			return(cp->code);	/* known name */
892132019Stjr	memset(&mbs, 0, sizeof(mbs));
893132019Stjr	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
894132019Stjr		return (wc);			/* single character */
895132019Stjr	else if (clen == (size_t)-1 || clen == (size_t)-2)
896132019Stjr		SETERROR(REG_ILLSEQ);
897132019Stjr	else
898132019Stjr		SETERROR(REG_ECOLLATE);		/* neither */
8991573Srgrimes	return(0);
9001573Srgrimes}
9011573Srgrimes
9021573Srgrimes/*
9031573Srgrimes - othercase - return the case counterpart of an alphabetic
904227414Skevlo == static wint_t othercase(wint_t ch);
9051573Srgrimes */
906132019Stjrstatic wint_t			/* if no counterpart, return ch */
907170528Sdelphijothercase(wint_t ch)
9081573Srgrimes{
909132019Stjr	assert(iswalpha(ch));
910132019Stjr	if (iswupper(ch))
911132019Stjr		return(towlower(ch));
912132019Stjr	else if (iswlower(ch))
913132019Stjr		return(towupper(ch));
9141573Srgrimes	else			/* peculiar, but could happen */
9151573Srgrimes		return(ch);
9161573Srgrimes}
9171573Srgrimes
9181573Srgrimes/*
9191573Srgrimes - bothcases - emit a dualcase version of a two-case character
920227414Skevlo == static void bothcases(struct parse *p, wint_t ch);
9211573Srgrimes *
9221573Srgrimes * Boy, is this implementation ever a kludge...
9231573Srgrimes */
9241573Srgrimesstatic void
925170528Sdelphijbothcases(struct parse *p, wint_t ch)
9261573Srgrimes{
92792889Sobrien	char *oldnext = p->next;
92892889Sobrien	char *oldend = p->end;
929132019Stjr	char bracket[3 + MB_LEN_MAX];
930132019Stjr	size_t n;
931132019Stjr	mbstate_t mbs;
9321573Srgrimes
9331573Srgrimes	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
9341573Srgrimes	p->next = bracket;
935132019Stjr	memset(&mbs, 0, sizeof(mbs));
936132019Stjr	n = wcrtomb(bracket, ch, &mbs);
937132019Stjr	assert(n != (size_t)-1);
938132019Stjr	bracket[n] = ']';
939132019Stjr	bracket[n + 1] = '\0';
940132019Stjr	p->end = bracket+n+1;
9411573Srgrimes	p_bracket(p);
942132019Stjr	assert(p->next == p->end);
9431573Srgrimes	p->next = oldnext;
9441573Srgrimes	p->end = oldend;
9451573Srgrimes}
9461573Srgrimes
9471573Srgrimes/*
9481573Srgrimes - ordinary - emit an ordinary character
949227414Skevlo == static void ordinary(struct parse *p, wint_t ch);
9501573Srgrimes */
9511573Srgrimesstatic void
952170528Sdelphijordinary(struct parse *p, wint_t ch)
9531573Srgrimes{
954132019Stjr	cset *cs;
9551573Srgrimes
956132019Stjr	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
9571573Srgrimes		bothcases(p, ch);
958132019Stjr	else if ((ch & OPDMASK) == ch)
959132019Stjr		EMIT(OCHAR, ch);
960132019Stjr	else {
961132019Stjr		/*
962132019Stjr		 * Kludge: character is too big to fit into an OCHAR operand.
963132019Stjr		 * Emit a singleton set.
964132019Stjr		 */
965132019Stjr		if ((cs = allocset(p)) == NULL)
966132019Stjr			return;
967132019Stjr		CHadd(p, cs, ch);
968132019Stjr		EMIT(OANYOF, (int)(cs - p->g->sets));
969132019Stjr	}
9701573Srgrimes}
9711573Srgrimes
9721573Srgrimes/*
9731573Srgrimes - nonnewline - emit REG_NEWLINE version of OANY
97492889Sobrien == static void nonnewline(struct parse *p);
9751573Srgrimes *
9761573Srgrimes * Boy, is this implementation ever a kludge...
9771573Srgrimes */
9781573Srgrimesstatic void
979170528Sdelphijnonnewline(struct parse *p)
9801573Srgrimes{
98192889Sobrien	char *oldnext = p->next;
98292889Sobrien	char *oldend = p->end;
9831573Srgrimes	char bracket[4];
9841573Srgrimes
9851573Srgrimes	p->next = bracket;
9861573Srgrimes	p->end = bracket+3;
9871573Srgrimes	bracket[0] = '^';
9881573Srgrimes	bracket[1] = '\n';
9891573Srgrimes	bracket[2] = ']';
9901573Srgrimes	bracket[3] = '\0';
9911573Srgrimes	p_bracket(p);
9921573Srgrimes	assert(p->next == bracket+3);
9931573Srgrimes	p->next = oldnext;
9941573Srgrimes	p->end = oldend;
9951573Srgrimes}
9961573Srgrimes
9971573Srgrimes/*
9981573Srgrimes - repeat - generate code for a bounded repetition, recursively if needed
99992889Sobrien == static void repeat(struct parse *p, sopno start, int from, int to);
10001573Srgrimes */
10011573Srgrimesstatic void
1002170528Sdelphijrepeat(struct parse *p,
1003170528Sdelphij	sopno start,		/* operand from here to end of strip */
1004170528Sdelphij	int from,		/* repeated from this number */
1005170528Sdelphij	int to)			/* to this number of times (maybe INFINITY) */
10061573Srgrimes{
100792889Sobrien	sopno finish = HERE();
10081573Srgrimes#	define	N	2
10091573Srgrimes#	define	INF	3
10101573Srgrimes#	define	REP(f, t)	((f)*8 + (t))
10111573Srgrimes#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
101292889Sobrien	sopno copy;
10131573Srgrimes
10141573Srgrimes	if (p->error != 0)	/* head off possible runaway recursion */
10151573Srgrimes		return;
10161573Srgrimes
10171573Srgrimes	assert(from <= to);
10181573Srgrimes
10191573Srgrimes	switch (REP(MAP(from), MAP(to))) {
10201573Srgrimes	case REP(0, 0):			/* must be user doing this */
10211573Srgrimes		DROP(finish-start);	/* drop the operand */
10221573Srgrimes		break;
10231573Srgrimes	case REP(0, 1):			/* as x{1,1}? */
10241573Srgrimes	case REP(0, N):			/* as x{1,n}? */
10251573Srgrimes	case REP(0, INF):		/* as x{1,}? */
10261573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10271573Srgrimes		INSERT(OCH_, start);		/* offset is wrong... */
10281573Srgrimes		repeat(p, start+1, 1, to);
10291573Srgrimes		ASTERN(OOR1, start);
10301573Srgrimes		AHEAD(start);			/* ... fix it */
10311573Srgrimes		EMIT(OOR2, 0);
10321573Srgrimes		AHEAD(THERE());
10331573Srgrimes		ASTERN(O_CH, THERETHERE());
10341573Srgrimes		break;
10351573Srgrimes	case REP(1, 1):			/* trivial case */
10361573Srgrimes		/* done */
10371573Srgrimes		break;
10381573Srgrimes	case REP(1, N):			/* as x?x{1,n-1} */
10391573Srgrimes		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10401573Srgrimes		INSERT(OCH_, start);
10411573Srgrimes		ASTERN(OOR1, start);
10421573Srgrimes		AHEAD(start);
10431573Srgrimes		EMIT(OOR2, 0);			/* offset very wrong... */
10441573Srgrimes		AHEAD(THERE());			/* ...so fix it */
10451573Srgrimes		ASTERN(O_CH, THERETHERE());
10461573Srgrimes		copy = dupl(p, start+1, finish+1);
10471573Srgrimes		assert(copy == finish+4);
10481573Srgrimes		repeat(p, copy, 1, to-1);
10491573Srgrimes		break;
10501573Srgrimes	case REP(1, INF):		/* as x+ */
10511573Srgrimes		INSERT(OPLUS_, start);
10521573Srgrimes		ASTERN(O_PLUS, start);
10531573Srgrimes		break;
10541573Srgrimes	case REP(N, N):			/* as xx{m-1,n-1} */
10551573Srgrimes		copy = dupl(p, start, finish);
10561573Srgrimes		repeat(p, copy, from-1, to-1);
10571573Srgrimes		break;
10581573Srgrimes	case REP(N, INF):		/* as xx{n-1,INF} */
10591573Srgrimes		copy = dupl(p, start, finish);
10601573Srgrimes		repeat(p, copy, from-1, to);
10611573Srgrimes		break;
10621573Srgrimes	default:			/* "can't happen" */
10631573Srgrimes		SETERROR(REG_ASSERT);	/* just in case */
10641573Srgrimes		break;
10651573Srgrimes	}
10661573Srgrimes}
10671573Srgrimes
10681573Srgrimes/*
1069132019Stjr - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1070132019Stjr - character from the parse struct, signals a REG_ILLSEQ error if the
1071132019Stjr - character can't be converted. Returns the number of bytes consumed.
1072132019Stjr */
1073132019Stjrstatic wint_t
1074170528Sdelphijwgetnext(struct parse *p)
1075132019Stjr{
1076132019Stjr	mbstate_t mbs;
1077132019Stjr	wchar_t wc;
1078132019Stjr	size_t n;
1079132019Stjr
1080132019Stjr	memset(&mbs, 0, sizeof(mbs));
1081132019Stjr	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1082132019Stjr	if (n == (size_t)-1 || n == (size_t)-2) {
1083132019Stjr		SETERROR(REG_ILLSEQ);
1084132019Stjr		return (0);
1085132019Stjr	}
1086132019Stjr	if (n == 0)
1087132019Stjr		n = 1;
1088132019Stjr	p->next += n;
1089132019Stjr	return (wc);
1090132019Stjr}
1091132019Stjr
1092132019Stjr/*
10931573Srgrimes - seterr - set an error condition
109492889Sobrien == static int seterr(struct parse *p, int e);
10951573Srgrimes */
10961573Srgrimesstatic int			/* useless but makes type checking happy */
1097170528Sdelphijseterr(struct parse *p, int e)
10981573Srgrimes{
10991573Srgrimes	if (p->error == 0)	/* keep earliest error condition */
11001573Srgrimes		p->error = e;
11011573Srgrimes	p->next = nuls;		/* try to bring things to a halt */
11021573Srgrimes	p->end = nuls;
11031573Srgrimes	return(0);		/* make the return value well-defined */
11041573Srgrimes}
11051573Srgrimes
11061573Srgrimes/*
11071573Srgrimes - allocset - allocate a set of characters for []
110892889Sobrien == static cset *allocset(struct parse *p);
11091573Srgrimes */
11101573Srgrimesstatic cset *
1111170528Sdelphijallocset(struct parse *p)
11121573Srgrimes{
1113132019Stjr	cset *cs, *ncs;
11141573Srgrimes
1115132019Stjr	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1116132019Stjr	if (ncs == NULL) {
1117132019Stjr		SETERROR(REG_ESPACE);
1118132019Stjr		return (NULL);
11191573Srgrimes	}
1120132019Stjr	p->g->sets = ncs;
1121132019Stjr	cs = &p->g->sets[p->g->ncsets++];
1122132019Stjr	memset(cs, 0, sizeof(*cs));
11231573Srgrimes
11241573Srgrimes	return(cs);
11251573Srgrimes}
11261573Srgrimes
11271573Srgrimes/*
11281573Srgrimes - freeset - free a now-unused set
112992889Sobrien == static void freeset(struct parse *p, cset *cs);
11301573Srgrimes */
11311573Srgrimesstatic void
1132170528Sdelphijfreeset(struct parse *p, cset *cs)
11331573Srgrimes{
113492889Sobrien	cset *top = &p->g->sets[p->g->ncsets];
11351573Srgrimes
1136132019Stjr	free(cs->wides);
1137132019Stjr	free(cs->ranges);
1138132019Stjr	free(cs->types);
1139132019Stjr	memset(cs, 0, sizeof(*cs));
11401573Srgrimes	if (cs == top-1)	/* recover only the easy case */
11411573Srgrimes		p->g->ncsets--;
11421573Srgrimes}
11431573Srgrimes
11441573Srgrimes/*
1145132019Stjr - singleton - Determine whether a set contains only one character,
1146132019Stjr - returning it if so, otherwise returning OUT.
11471573Srgrimes */
1148132019Stjrstatic wint_t
1149170528Sdelphijsingleton(cset *cs)
11501573Srgrimes{
1151132019Stjr	wint_t i, s, n;
11521573Srgrimes
1153132019Stjr	for (i = n = 0; i < NC; i++)
1154132019Stjr		if (CHIN(cs, i)) {
1155132019Stjr			n++;
1156132019Stjr			s = i;
11571573Srgrimes		}
1158132019Stjr	if (n == 1)
1159132019Stjr		return (s);
1160132019Stjr	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1161132019Stjr	    cs->icase == 0)
1162132019Stjr		return (cs->wides[0]);
1163132019Stjr	/* Don't bother handling the other cases. */
1164132019Stjr	return (OUT);
1165132019Stjr}
11661573Srgrimes
1167132019Stjr/*
1168132019Stjr - CHadd - add character to character set.
1169132019Stjr */
1170132019Stjrstatic void
1171170528SdelphijCHadd(struct parse *p, cset *cs, wint_t ch)
1172132019Stjr{
1173134802Stjr	wint_t nch, *newwides;
1174132019Stjr	assert(ch >= 0);
1175134802Stjr	if (ch < NC)
1176132019Stjr		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1177134802Stjr	else {
1178132019Stjr		newwides = realloc(cs->wides, (cs->nwides + 1) *
1179132019Stjr		    sizeof(*cs->wides));
1180132019Stjr		if (newwides == NULL) {
1181132019Stjr			SETERROR(REG_ESPACE);
1182132019Stjr			return;
1183132019Stjr		}
1184132019Stjr		cs->wides = newwides;
1185132019Stjr		cs->wides[cs->nwides++] = ch;
11861573Srgrimes	}
1187134802Stjr	if (cs->icase) {
1188134802Stjr		if ((nch = towlower(ch)) < NC)
1189134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1190134802Stjr		if ((nch = towupper(ch)) < NC)
1191134802Stjr			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1192134802Stjr	}
11931573Srgrimes}
11941573Srgrimes
11951573Srgrimes/*
1196132019Stjr - CHaddrange - add all characters in the range [min,max] to a character set.
11971573Srgrimes */
1198132019Stjrstatic void
1199170528SdelphijCHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
12001573Srgrimes{
1201132019Stjr	crange *newranges;
12021573Srgrimes
1203132019Stjr	for (; min < NC && min <= max; min++)
1204132019Stjr		CHadd(p, cs, min);
1205132019Stjr	if (min >= max)
1206132019Stjr		return;
1207132019Stjr	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1208132019Stjr	    sizeof(*cs->ranges));
1209132019Stjr	if (newranges == NULL) {
1210132019Stjr		SETERROR(REG_ESPACE);
1211132019Stjr		return;
1212132019Stjr	}
1213132019Stjr	cs->ranges = newranges;
1214132019Stjr	cs->ranges[cs->nranges].min = min;
1215247596Sdelphij	cs->ranges[cs->nranges].max = max;
1216132019Stjr	cs->nranges++;
12171573Srgrimes}
12181573Srgrimes
12191573Srgrimes/*
1220132019Stjr - CHaddtype - add all characters of a certain type to a character set.
12211573Srgrimes */
1222132019Stjrstatic void
1223170528SdelphijCHaddtype(struct parse *p, cset *cs, wctype_t wct)
12241573Srgrimes{
1225132019Stjr	wint_t i;
1226132019Stjr	wctype_t *newtypes;
12271573Srgrimes
1228134802Stjr	for (i = 0; i < NC; i++)
1229132019Stjr		if (iswctype(i, wct))
1230132019Stjr			CHadd(p, cs, i);
1231132019Stjr	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1232132019Stjr	    sizeof(*cs->types));
1233132019Stjr	if (newtypes == NULL) {
1234132019Stjr		SETERROR(REG_ESPACE);
1235132019Stjr		return;
1236132019Stjr	}
1237132019Stjr	cs->types = newtypes;
1238132019Stjr	cs->types[cs->ntypes++] = wct;
12391573Srgrimes}
12401573Srgrimes
12411573Srgrimes/*
12421573Srgrimes - dupl - emit a duplicate of a bunch of sops
124392889Sobrien == static sopno dupl(struct parse *p, sopno start, sopno finish);
12441573Srgrimes */
12451573Srgrimesstatic sopno			/* start of duplicate */
1246170528Sdelphijdupl(struct parse *p,
1247170528Sdelphij	sopno start,		/* from here */
1248170528Sdelphij	sopno finish)		/* to this less one */
12491573Srgrimes{
125092889Sobrien	sopno ret = HERE();
125192889Sobrien	sopno len = finish - start;
12521573Srgrimes
12531573Srgrimes	assert(finish >= start);
12541573Srgrimes	if (len == 0)
12551573Srgrimes		return(ret);
1256227414Skevlo	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1257227414Skevlo		return(ret);
12581573Srgrimes	(void) memcpy((char *)(p->strip + p->slen),
12591573Srgrimes		(char *)(p->strip + start), (size_t)len*sizeof(sop));
12601573Srgrimes	p->slen += len;
12611573Srgrimes	return(ret);
12621573Srgrimes}
12631573Srgrimes
12641573Srgrimes/*
12651573Srgrimes - doemit - emit a strip operator
126692889Sobrien == static void doemit(struct parse *p, sop op, size_t opnd);
12671573Srgrimes *
12681573Srgrimes * It might seem better to implement this as a macro with a function as
12691573Srgrimes * hard-case backup, but it's just too big and messy unless there are
12701573Srgrimes * some changes to the data structures.  Maybe later.
12711573Srgrimes */
12721573Srgrimesstatic void
1273170528Sdelphijdoemit(struct parse *p, sop op, size_t opnd)
12741573Srgrimes{
12751573Srgrimes	/* avoid making error situations worse */
12761573Srgrimes	if (p->error != 0)
12771573Srgrimes		return;
12781573Srgrimes
12791573Srgrimes	/* deal with oversize operands ("can't happen", more or less) */
12801573Srgrimes	assert(opnd < 1<<OPSHIFT);
12811573Srgrimes
12821573Srgrimes	/* deal with undersized strip */
12831573Srgrimes	if (p->slen >= p->ssize)
1284227414Skevlo		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1285227414Skevlo			return;
12861573Srgrimes
12871573Srgrimes	/* finally, it's all reduced to the easy case */
12881573Srgrimes	p->strip[p->slen++] = SOP(op, opnd);
12891573Srgrimes}
12901573Srgrimes
12911573Srgrimes/*
12921573Srgrimes - doinsert - insert a sop into the strip
129392889Sobrien == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
12941573Srgrimes */
12951573Srgrimesstatic void
1296170528Sdelphijdoinsert(struct parse *p, sop op, size_t opnd, sopno pos)
12971573Srgrimes{
129892889Sobrien	sopno sn;
129992889Sobrien	sop s;
130092889Sobrien	int i;
13011573Srgrimes
13021573Srgrimes	/* avoid making error situations worse */
13031573Srgrimes	if (p->error != 0)
13041573Srgrimes		return;
13051573Srgrimes
13061573Srgrimes	sn = HERE();
13071573Srgrimes	EMIT(op, opnd);		/* do checks, ensure space */
13081573Srgrimes	assert(HERE() == sn+1);
13091573Srgrimes	s = p->strip[sn];
13101573Srgrimes
13111573Srgrimes	/* adjust paren pointers */
13121573Srgrimes	assert(pos > 0);
13131573Srgrimes	for (i = 1; i < NPAREN; i++) {
13141573Srgrimes		if (p->pbegin[i] >= pos) {
13151573Srgrimes			p->pbegin[i]++;
13161573Srgrimes		}
13171573Srgrimes		if (p->pend[i] >= pos) {
13181573Srgrimes			p->pend[i]++;
13191573Srgrimes		}
13201573Srgrimes	}
13211573Srgrimes
13221573Srgrimes	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13231573Srgrimes						(HERE()-pos-1)*sizeof(sop));
13241573Srgrimes	p->strip[pos] = s;
13251573Srgrimes}
13261573Srgrimes
13271573Srgrimes/*
13281573Srgrimes - dofwd - complete a forward reference
132992889Sobrien == static void dofwd(struct parse *p, sopno pos, sop value);
13301573Srgrimes */
13311573Srgrimesstatic void
1332170528Sdelphijdofwd(struct parse *p, sopno pos, sop value)
13331573Srgrimes{
13341573Srgrimes	/* avoid making error situations worse */
13351573Srgrimes	if (p->error != 0)
13361573Srgrimes		return;
13371573Srgrimes
13381573Srgrimes	assert(value < 1<<OPSHIFT);
13391573Srgrimes	p->strip[pos] = OP(p->strip[pos]) | value;
13401573Srgrimes}
13411573Srgrimes
13421573Srgrimes/*
13431573Srgrimes - enlarge - enlarge the strip
1344227414Skevlo == static int enlarge(struct parse *p, sopno size);
13451573Srgrimes */
1346227414Skevlostatic int
1347170528Sdelphijenlarge(struct parse *p, sopno size)
13481573Srgrimes{
134992889Sobrien	sop *sp;
13501573Srgrimes
13511573Srgrimes	if (p->ssize >= size)
1352227414Skevlo		return 1;
13531573Srgrimes
13541573Srgrimes	sp = (sop *)realloc(p->strip, size*sizeof(sop));
13551573Srgrimes	if (sp == NULL) {
13561573Srgrimes		SETERROR(REG_ESPACE);
1357227414Skevlo		return 0;
13581573Srgrimes	}
13591573Srgrimes	p->strip = sp;
13601573Srgrimes	p->ssize = size;
1361227414Skevlo	return 1;
13621573Srgrimes}
13631573Srgrimes
13641573Srgrimes/*
13651573Srgrimes - stripsnug - compact the strip
136692889Sobrien == static void stripsnug(struct parse *p, struct re_guts *g);
13671573Srgrimes */
13681573Srgrimesstatic void
1369170528Sdelphijstripsnug(struct parse *p, struct re_guts *g)
13701573Srgrimes{
13711573Srgrimes	g->nstates = p->slen;
13721573Srgrimes	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
13731573Srgrimes	if (g->strip == NULL) {
13741573Srgrimes		SETERROR(REG_ESPACE);
13751573Srgrimes		g->strip = p->strip;
13761573Srgrimes	}
13771573Srgrimes}
13781573Srgrimes
13791573Srgrimes/*
13801573Srgrimes - findmust - fill in must and mlen with longest mandatory literal string
138192889Sobrien == static void findmust(struct parse *p, struct re_guts *g);
13821573Srgrimes *
13831573Srgrimes * This algorithm could do fancy things like analyzing the operands of |
13841573Srgrimes * for common subsequences.  Someday.  This code is simple and finds most
13851573Srgrimes * of the interesting cases.
13861573Srgrimes *
13871573Srgrimes * Note that must and mlen got initialized during setup.
13881573Srgrimes */
13891573Srgrimesstatic void
1390170528Sdelphijfindmust(struct parse *p, struct re_guts *g)
13911573Srgrimes{
139292889Sobrien	sop *scan;
13931573Srgrimes	sop *start;
139492889Sobrien	sop *newstart;
139592889Sobrien	sopno newlen;
139692889Sobrien	sop s;
139792889Sobrien	char *cp;
139862391Sdcs	int offset;
1399132019Stjr	char buf[MB_LEN_MAX];
1400132019Stjr	size_t clen;
1401132019Stjr	mbstate_t mbs;
14021573Srgrimes
14031573Srgrimes	/* avoid making error situations worse */
14041573Srgrimes	if (p->error != 0)
14051573Srgrimes		return;
14061573Srgrimes
1407132019Stjr	/*
1408132019Stjr	 * It's not generally safe to do a ``char'' substring search on
1409132019Stjr	 * multibyte character strings, but it's safe for at least
1410132019Stjr	 * UTF-8 (see RFC 3629).
1411132019Stjr	 */
1412132019Stjr	if (MB_CUR_MAX > 1 &&
1413132019Stjr	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1414132019Stjr		return;
1415132019Stjr
14161573Srgrimes	/* find the longest OCHAR sequence in strip */
14171573Srgrimes	newlen = 0;
141862391Sdcs	offset = 0;
141962391Sdcs	g->moffset = 0;
14201573Srgrimes	scan = g->strip + 1;
14211573Srgrimes	do {
14221573Srgrimes		s = *scan++;
14231573Srgrimes		switch (OP(s)) {
14241573Srgrimes		case OCHAR:		/* sequence member */
1425132019Stjr			if (newlen == 0) {		/* new sequence */
1426132019Stjr				memset(&mbs, 0, sizeof(mbs));
14271573Srgrimes				newstart = scan - 1;
1428132019Stjr			}
1429132019Stjr			clen = wcrtomb(buf, OPND(s), &mbs);
1430132019Stjr			if (clen == (size_t)-1)
1431132019Stjr				goto toohard;
1432132019Stjr			newlen += clen;
14331573Srgrimes			break;
14341573Srgrimes		case OPLUS_:		/* things that don't break one */
14351573Srgrimes		case OLPAREN:
14361573Srgrimes		case ORPAREN:
14371573Srgrimes			break;
14381573Srgrimes		case OQUEST_:		/* things that must be skipped */
14391573Srgrimes		case OCH_:
1440131973Stjr			offset = altoffset(scan, offset);
14411573Srgrimes			scan--;
14421573Srgrimes			do {
14431573Srgrimes				scan += OPND(s);
14441573Srgrimes				s = *scan;
14451573Srgrimes				/* assert() interferes w debug printouts */
14461573Srgrimes				if (OP(s) != O_QUEST && OP(s) != O_CH &&
14471573Srgrimes							OP(s) != OOR2) {
14481573Srgrimes					g->iflags |= BAD;
14491573Srgrimes					return;
14501573Srgrimes				}
14511573Srgrimes			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1452102411Scharnier			/* FALLTHROUGH */
145362391Sdcs		case OBOW:		/* things that break a sequence */
145462391Sdcs		case OEOW:
145562391Sdcs		case OBOL:
145662391Sdcs		case OEOL:
145762391Sdcs		case O_QUEST:
145862391Sdcs		case O_CH:
145962391Sdcs		case OEND:
14601573Srgrimes			if (newlen > g->mlen) {		/* ends one */
14611573Srgrimes				start = newstart;
14621573Srgrimes				g->mlen = newlen;
146362391Sdcs				if (offset > -1) {
146462391Sdcs					g->moffset += offset;
146562391Sdcs					offset = newlen;
146662391Sdcs				} else
146762391Sdcs					g->moffset = offset;
146862391Sdcs			} else {
146962391Sdcs				if (offset > -1)
147062391Sdcs					offset += newlen;
14711573Srgrimes			}
14721573Srgrimes			newlen = 0;
14731573Srgrimes			break;
147462391Sdcs		case OANY:
147562391Sdcs			if (newlen > g->mlen) {		/* ends one */
147662391Sdcs				start = newstart;
147762391Sdcs				g->mlen = newlen;
147862391Sdcs				if (offset > -1) {
147962391Sdcs					g->moffset += offset;
148062391Sdcs					offset = newlen;
148162391Sdcs				} else
148262391Sdcs					g->moffset = offset;
148362391Sdcs			} else {
148462391Sdcs				if (offset > -1)
148562391Sdcs					offset += newlen;
148662391Sdcs			}
148762391Sdcs			if (offset > -1)
148862391Sdcs				offset++;
148962391Sdcs			newlen = 0;
149062391Sdcs			break;
149162391Sdcs		case OANYOF:		/* may or may not invalidate offset */
149262391Sdcs			/* First, everything as OANY */
149362391Sdcs			if (newlen > g->mlen) {		/* ends one */
149462391Sdcs				start = newstart;
149562391Sdcs				g->mlen = newlen;
149662391Sdcs				if (offset > -1) {
149762391Sdcs					g->moffset += offset;
149862391Sdcs					offset = newlen;
149962391Sdcs				} else
150062391Sdcs					g->moffset = offset;
150162391Sdcs			} else {
150262391Sdcs				if (offset > -1)
150362391Sdcs					offset += newlen;
150462391Sdcs			}
150562391Sdcs			if (offset > -1)
150662391Sdcs				offset++;
150762391Sdcs			newlen = 0;
150862391Sdcs			break;
1509132019Stjr		toohard:
151062391Sdcs		default:
151162391Sdcs			/* Anything here makes it impossible or too hard
151262391Sdcs			 * to calculate the offset -- so we give up;
151362391Sdcs			 * save the last known good offset, in case the
151462391Sdcs			 * must sequence doesn't occur later.
151562391Sdcs			 */
151662391Sdcs			if (newlen > g->mlen) {		/* ends one */
151762391Sdcs				start = newstart;
151862391Sdcs				g->mlen = newlen;
151962391Sdcs				if (offset > -1)
152062391Sdcs					g->moffset += offset;
152162391Sdcs				else
152262391Sdcs					g->moffset = offset;
152362391Sdcs			}
152462391Sdcs			offset = -1;
152562391Sdcs			newlen = 0;
152662391Sdcs			break;
15271573Srgrimes		}
15281573Srgrimes	} while (OP(s) != OEND);
15291573Srgrimes
153062391Sdcs	if (g->mlen == 0) {		/* there isn't one */
153162391Sdcs		g->moffset = -1;
15321573Srgrimes		return;
153362391Sdcs	}
15341573Srgrimes
15351573Srgrimes	/* turn it into a character string */
15361573Srgrimes	g->must = malloc((size_t)g->mlen + 1);
15371573Srgrimes	if (g->must == NULL) {		/* argh; just forget it */
15381573Srgrimes		g->mlen = 0;
153962391Sdcs		g->moffset = -1;
15401573Srgrimes		return;
15411573Srgrimes	}
15421573Srgrimes	cp = g->must;
15431573Srgrimes	scan = start;
1544132019Stjr	memset(&mbs, 0, sizeof(mbs));
1545132019Stjr	while (cp < g->must + g->mlen) {
15461573Srgrimes		while (OP(s = *scan++) != OCHAR)
15471573Srgrimes			continue;
1548132019Stjr		clen = wcrtomb(cp, OPND(s), &mbs);
1549132019Stjr		assert(clen != (size_t)-1);
1550132019Stjr		cp += clen;
15511573Srgrimes	}
15521573Srgrimes	assert(cp == g->must + g->mlen);
15531573Srgrimes	*cp++ = '\0';		/* just on general principles */
15541573Srgrimes}
15551573Srgrimes
15561573Srgrimes/*
155762391Sdcs - altoffset - choose biggest offset among multiple choices
1558131973Stjr == static int altoffset(sop *scan, int offset);
155962391Sdcs *
156062391Sdcs * Compute, recursively if necessary, the largest offset among multiple
156162391Sdcs * re paths.
156262391Sdcs */
156362391Sdcsstatic int
1564170528Sdelphijaltoffset(sop *scan, int offset)
156562391Sdcs{
156662391Sdcs	int largest;
156762391Sdcs	int try;
156862391Sdcs	sop s;
156962391Sdcs
157062391Sdcs	/* If we gave up already on offsets, return */
157162391Sdcs	if (offset == -1)
157262391Sdcs		return -1;
157362391Sdcs
157462391Sdcs	largest = 0;
157562391Sdcs	try = 0;
157662391Sdcs	s = *scan++;
157762391Sdcs	while (OP(s) != O_QUEST && OP(s) != O_CH) {
157862391Sdcs		switch (OP(s)) {
157962391Sdcs		case OOR1:
158062391Sdcs			if (try > largest)
158162391Sdcs				largest = try;
158262391Sdcs			try = 0;
158362391Sdcs			break;
158462391Sdcs		case OQUEST_:
158562391Sdcs		case OCH_:
1586131973Stjr			try = altoffset(scan, try);
158762391Sdcs			if (try == -1)
158862391Sdcs				return -1;
158962391Sdcs			scan--;
159062391Sdcs			do {
159162391Sdcs				scan += OPND(s);
159262391Sdcs				s = *scan;
159362391Sdcs				if (OP(s) != O_QUEST && OP(s) != O_CH &&
159462391Sdcs							OP(s) != OOR2)
159562391Sdcs					return -1;
159662391Sdcs			} while (OP(s) != O_QUEST && OP(s) != O_CH);
159762855Sdcs			/* We must skip to the next position, or we'll
159862855Sdcs			 * leave altoffset() too early.
159962855Sdcs			 */
160062855Sdcs			scan++;
160162391Sdcs			break;
160262391Sdcs		case OANYOF:
160362391Sdcs		case OCHAR:
160462391Sdcs		case OANY:
160562391Sdcs			try++;
160662391Sdcs		case OBOW:
160762391Sdcs		case OEOW:
160862391Sdcs		case OLPAREN:
160962391Sdcs		case ORPAREN:
161062391Sdcs		case OOR2:
161162391Sdcs			break;
161262391Sdcs		default:
161362391Sdcs			try = -1;
161462391Sdcs			break;
161562391Sdcs		}
161662391Sdcs		if (try == -1)
161762391Sdcs			return -1;
161862391Sdcs		s = *scan++;
161962391Sdcs	}
162062391Sdcs
162162391Sdcs	if (try > largest)
162262391Sdcs		largest = try;
162362391Sdcs
162462391Sdcs	return largest+offset;
162562391Sdcs}
162662391Sdcs
162762391Sdcs/*
162862232Sdcs - computejumps - compute char jumps for BM scan
162992889Sobrien == static void computejumps(struct parse *p, struct re_guts *g);
163062232Sdcs *
163162232Sdcs * This algorithm assumes g->must exists and is has size greater than
163262232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
163362232Sdcs * Sara Baase.
163462232Sdcs *
163562232Sdcs * A char jump is the number of characters one needs to jump based on
163662232Sdcs * the value of the character from the text that was mismatched.
163762232Sdcs */
163862232Sdcsstatic void
1639170528Sdelphijcomputejumps(struct parse *p, struct re_guts *g)
164062232Sdcs{
164162232Sdcs	int ch;
164262232Sdcs	int mindex;
164362232Sdcs
164462232Sdcs	/* Avoid making errors worse */
164562232Sdcs	if (p->error != 0)
164662232Sdcs		return;
164762232Sdcs
164862848Sdcs	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
164962232Sdcs	if (g->charjump == NULL)	/* Not a fatal error */
165062232Sdcs		return;
165162754Sdcs	/* Adjust for signed chars, if necessary */
165262754Sdcs	g->charjump = &g->charjump[-(CHAR_MIN)];
165362232Sdcs
165462232Sdcs	/* If the character does not exist in the pattern, the jump
165562232Sdcs	 * is equal to the number of characters in the pattern.
165662232Sdcs	 */
165762754Sdcs	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
165862232Sdcs		g->charjump[ch] = g->mlen;
165962232Sdcs
166062232Sdcs	/* If the character does exist, compute the jump that would
166162232Sdcs	 * take us to the last character in the pattern equal to it
166262232Sdcs	 * (notice that we match right to left, so that last character
166362232Sdcs	 * is the first one that would be matched).
166462232Sdcs	 */
166562232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
1666111010Snectar		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
166762232Sdcs}
166862232Sdcs
166962232Sdcs/*
167062232Sdcs - computematchjumps - compute match jumps for BM scan
167192889Sobrien == static void computematchjumps(struct parse *p, struct re_guts *g);
167262232Sdcs *
167362232Sdcs * This algorithm assumes g->must exists and is has size greater than
167462232Sdcs * zero. It's based on the algorithm found on Computer Algorithms by
167562232Sdcs * Sara Baase.
167662232Sdcs *
167762232Sdcs * A match jump is the number of characters one needs to advance based
167862232Sdcs * on the already-matched suffix.
167962232Sdcs * Notice that all values here are minus (g->mlen-1), because of the way
168062232Sdcs * the search algorithm works.
168162232Sdcs */
168262232Sdcsstatic void
1683170528Sdelphijcomputematchjumps(struct parse *p, struct re_guts *g)
168462232Sdcs{
168562232Sdcs	int mindex;		/* General "must" iterator */
168662232Sdcs	int suffix;		/* Keeps track of matching suffix */
168762232Sdcs	int ssuffix;		/* Keeps track of suffixes' suffix */
168862232Sdcs	int* pmatches;		/* pmatches[k] points to the next i
168962232Sdcs				 * such that i+1...mlen is a substring
169062232Sdcs				 * of k+1...k+mlen-i-1
169162232Sdcs				 */
169262232Sdcs
169362232Sdcs	/* Avoid making errors worse */
169462232Sdcs	if (p->error != 0)
169562232Sdcs		return;
169662232Sdcs
169762848Sdcs	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
169862232Sdcs	if (pmatches == NULL) {
169962232Sdcs		g->matchjump = NULL;
170062232Sdcs		return;
170162232Sdcs	}
170262232Sdcs
170362848Sdcs	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
170462232Sdcs	if (g->matchjump == NULL)	/* Not a fatal error */
170562232Sdcs		return;
170662232Sdcs
170762232Sdcs	/* Set maximum possible jump for each character in the pattern */
170862232Sdcs	for (mindex = 0; mindex < g->mlen; mindex++)
170962232Sdcs		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
171062232Sdcs
171162232Sdcs	/* Compute pmatches[] */
171262232Sdcs	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
171362232Sdcs	    mindex--, suffix--) {
171462232Sdcs		pmatches[mindex] = suffix;
171562232Sdcs
171662232Sdcs		/* If a mismatch is found, interrupting the substring,
171762232Sdcs		 * compute the matchjump for that position. If no
171862232Sdcs		 * mismatch is found, then a text substring mismatched
171962232Sdcs		 * against the suffix will also mismatch against the
172062232Sdcs		 * substring.
172162232Sdcs		 */
172262232Sdcs		while (suffix < g->mlen
172362232Sdcs		    && g->must[mindex] != g->must[suffix]) {
172462232Sdcs			g->matchjump[suffix] = MIN(g->matchjump[suffix],
172562232Sdcs			    g->mlen - mindex - 1);
172662232Sdcs			suffix = pmatches[suffix];
172762232Sdcs		}
172862232Sdcs	}
172962232Sdcs
173062232Sdcs	/* Compute the matchjump up to the last substring found to jump
173162232Sdcs	 * to the beginning of the largest must pattern prefix matching
173262232Sdcs	 * it's own suffix.
173362232Sdcs	 */
173462232Sdcs	for (mindex = 0; mindex <= suffix; mindex++)
173562232Sdcs		g->matchjump[mindex] = MIN(g->matchjump[mindex],
173662232Sdcs		    g->mlen + suffix - mindex);
173762232Sdcs
173862232Sdcs        ssuffix = pmatches[suffix];
173962391Sdcs        while (suffix < g->mlen) {
174062673Sdcs                while (suffix <= ssuffix && suffix < g->mlen) {
174162232Sdcs                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
174262232Sdcs			    g->mlen + ssuffix - suffix);
174362232Sdcs                        suffix++;
174462232Sdcs                }
174586208Sdcs		if (suffix < g->mlen)
174686208Sdcs                	ssuffix = pmatches[ssuffix];
174762232Sdcs        }
174862232Sdcs
174962232Sdcs	free(pmatches);
175062232Sdcs}
175162232Sdcs
175262232Sdcs/*
17531573Srgrimes - pluscount - count + nesting
175492889Sobrien == static sopno pluscount(struct parse *p, struct re_guts *g);
17551573Srgrimes */
17561573Srgrimesstatic sopno			/* nesting depth */
1757170528Sdelphijpluscount(struct parse *p, struct re_guts *g)
17581573Srgrimes{
175992889Sobrien	sop *scan;
176092889Sobrien	sop s;
176192889Sobrien	sopno plusnest = 0;
176292889Sobrien	sopno maxnest = 0;
17631573Srgrimes
17641573Srgrimes	if (p->error != 0)
17651573Srgrimes		return(0);	/* there may not be an OEND */
17661573Srgrimes
17671573Srgrimes	scan = g->strip + 1;
17681573Srgrimes	do {
17691573Srgrimes		s = *scan++;
17701573Srgrimes		switch (OP(s)) {
17711573Srgrimes		case OPLUS_:
17721573Srgrimes			plusnest++;
17731573Srgrimes			break;
17741573Srgrimes		case O_PLUS:
17751573Srgrimes			if (plusnest > maxnest)
17761573Srgrimes				maxnest = plusnest;
17771573Srgrimes			plusnest--;
17781573Srgrimes			break;
17791573Srgrimes		}
17801573Srgrimes	} while (OP(s) != OEND);
17811573Srgrimes	if (plusnest != 0)
17821573Srgrimes		g->iflags |= BAD;
17831573Srgrimes	return(maxnest);
17841573Srgrimes}
1785