1101282Smdodd/*	$NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $	*/
2204977Simp
3101282Smdodd/*-
4101282Smdodd * SPDX-License-Identifier: BSD-3-Clause
5101282Smdodd *
6101282Smdodd * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7101282Smdodd * Copyright (c) 1992, 1993, 1994
8101282Smdodd *	The Regents of the University of California.  All rights reserved.
9101282Smdodd *
10101282Smdodd * Copyright (c) 2011 The FreeBSD Foundation
11101282Smdodd * All rights reserved.
12101282Smdodd * Portions of this software were developed by David Chisnall
13101282Smdodd * under sponsorship from the FreeBSD Foundation.
14101282Smdodd *
15101282Smdodd * This code is derived from software contributed to Berkeley by
16101282Smdodd * Henry Spencer.
17101282Smdodd *
18101282Smdodd * Redistribution and use in source and binary forms, with or without
19101282Smdodd * modification, are permitted provided that the following conditions
20101282Smdodd * are met:
21101282Smdodd * 1. Redistributions of source code must retain the above copyright
22101282Smdodd *    notice, this list of conditions and the following disclaimer.
23101282Smdodd * 2. Redistributions in binary form must reproduce the above copyright
24101282Smdodd *    notice, this list of conditions and the following disclaimer in the
25101282Smdodd *    documentation and/or other materials provided with the distribution.
26101282Smdodd * 3. Neither the name of the University nor the names of its contributors
27101282Smdodd *    may be used to endorse or promote products derived from this software
28288424Sjhb *    without specific prior written permission.
29168569Sdelphij *
30168569Sdelphij * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31240005Szont * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32240005Szont * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33240005Szont * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34240005Szont * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35240005Szont * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36240005Szont * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37240005Szont * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38295930Sjhb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39101282Smdodd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40288424Sjhb * SUCH DAMAGE.
41288424Sjhb *
42288424Sjhb *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
43288424Sjhb */
44288424Sjhb
45294849Sjhb#if HAVE_NBTOOL_CONFIG_H
46288424Sjhb#include "nbtool_config.h"
47288424Sjhb#endif
48288424Sjhb
49288424Sjhb#include <sys/cdefs.h>
50288424Sjhb#if 0
51288424Sjhbstatic char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
52288424Sjhb__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53288424Sjhb#endif
54288997Sbdrewery__RCSID("$NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $");
55288424Sjhb
56288424Sjhb#ifndef LIBHACK
57288424Sjhb#define REGEX_GNU_EXTENSIONS
58288424Sjhb
59288424Sjhb#include "namespace.h"
60288424Sjhb#endif
61288424Sjhb#include <sys/types.h>
62288424Sjhb#include <stdio.h>
63288424Sjhb#include <string.h>
64288424Sjhb#include <ctype.h>
65288424Sjhb#include <limits.h>
66288424Sjhb#include <stdlib.h>
67288424Sjhb#include <regex.h>
68288424Sjhb#include <stdbool.h>
69288424Sjhb
70288424Sjhb#if defined(__weak_alias) && !defined(LIBHACK)
71288424Sjhb__weak_alias(regcomp,_regcomp)
72288424Sjhb#endif
73288424Sjhb
74168569Sdelphij#ifdef REGEX_LIBC_COLLATE
75168569Sdelphij#include "collate.h"
76296571Sjhb#endif
77288424Sjhb
78168569Sdelphij#include "utils.h"
79168569Sdelphij#include "regex2.h"
80288424Sjhb
81240562Szont#include "cname.h"
82240562Szont
83168569Sdelphij/*
84168569Sdelphij * Branching context, used to keep track of branch state for all of the branch-
85288424Sjhb * aware functions. In addition to keeping track of branch positions for the
86288424Sjhb * p_branch_* functions, we use this to simplify some clumsiness in BREs for
87288424Sjhb * detection of whether ^ is acting as an anchor or being used erroneously and
88288424Sjhb * also for whether we're in a sub-expression or not.
89288424Sjhb */
90296571Sjhbstruct branchc {
91288424Sjhb	sopno start;
92288424Sjhb	sopno back;
93101282Smdodd	sopno fwd;
94101282Smdodd
95101282Smdodd	int nbranch;
96153963Sbrian	int nchain;
97101282Smdodd	bool outer;
98101285Smdodd	bool terminate;
99101373Smdodd};
100168569Sdelphij
101168569Sdelphij/*
102240005Szont * parse structure, passed up and down to avoid global variables and
103288424Sjhb * other clumsinesses
104101282Smdodd */
105158630Spavstruct parse {
106247338Sdelphij	const char *next;	/* next character in RE */
107158630Spav	const char *end;	/* end of string (-> NUL normally) */
108158630Spav	int error;		/* has an error been seen? */
109158630Spav	int gnuext;
110158630Spav	sop *strip;		/* malloced strip */
111158630Spav	sopno ssize;		/* malloced strip size (allocated) */
112158630Spav	sopno slen;		/* malloced strip length (used) */
113158630Spav	size_t ncsalloc;	/* number of csets allocated */
114158630Spav	struct re_guts *g;
115168569Sdelphij#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
116247338Sdelphij	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
117192025Sdds	sopno pend[NPAREN];	/* -> ) ([0] unused) */
118192025Sdds	bool allowbranch;	/* can this expression branch? */
119192025Sdds	bool bre;		/* convenience; is this a BRE? */
120192025Sdds	int pflags;		/* other parsing flags -- legacy escapes? */
121192025Sdds	bool (*parse_expr)(struct parse *, struct branchc *);
122192025Sdds	void (*pre_parse)(struct parse *, struct branchc *);
123192025Sdds	void (*post_parse)(struct parse *, struct branchc *);
124192025Sdds};
125
126#define PFLAG_LEGACY_ESC	0x00000001
127
128/* ========= begin header generated by ./mkh ========= */
129#ifdef __cplusplus
130extern "C" {
131#endif
132
133/* === regcomp.c === */
134static bool p_ere_exp(struct parse *p, struct branchc *bc);
135static void p_str(struct parse *p);
136static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
137static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
138static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
139static bool p_branch_empty(struct parse *p, struct branchc *bc);
140static bool p_branch_do(struct parse *p, struct branchc *bc);
141static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
142static void p_bre_post_parse(struct parse *p, struct branchc *bc);
143static void p_re(struct parse *p, int end1, int end2);
144static bool p_simp_re(struct parse *p, struct branchc *bc);
145static int p_count(struct parse *p);
146static void p_bracket(struct parse *p);
147static int p_range_cmp(wchar_t c1, wchar_t c2);
148static void p_b_term(struct parse *p, cset *cs);
149#ifdef REGEX_GNU_EXTENSIONS
150static int p_b_pseudoclass(struct parse *p, char c);
151#endif
152static void p_b_cclass(struct parse *p, cset *cs);
153static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
154static void p_b_eclass(struct parse *p, cset *cs);
155static wint_t p_b_symbol(struct parse *p);
156static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
157static bool may_escape(struct parse *p, const wint_t ch);
158static wint_t othercase(wint_t ch);
159static void bothcases(struct parse *p, wint_t ch);
160static void ordinary(struct parse *p, wint_t ch);
161static void nonnewline(struct parse *p);
162static void repeat(struct parse *p, sopno start, int from, int to);
163static int seterr(struct parse *p, int e);
164static cset *allocset(struct parse *p);
165static void freeset(struct parse *p, cset *cs);
166static void CHadd(struct parse *p, cset *cs, wint_t ch);
167static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
168static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
169static wint_t singleton(cset *cs);
170static sopno dupl(struct parse *p, sopno start, sopno finish);
171static void doemit(struct parse *p, sop op, size_t opnd);
172static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
173static void dofwd(struct parse *p, sopno pos, sop value);
174static int enlarge(struct parse *p, sopno size);
175static void stripsnug(struct parse *p, struct re_guts *g);
176static void findmust(struct parse *p, struct re_guts *g);
177static int altoffset(sop *scan, int offset);
178static void computejumps(struct parse *p, struct re_guts *g);
179static void computematchjumps(struct parse *p, struct re_guts *g);
180static sopno pluscount(struct parse *p, struct re_guts *g);
181static wint_t wgetnext(struct parse *p);
182
183#ifdef __cplusplus
184}
185#endif
186/* ========= end header generated by ./mkh ========= */
187
188static char nuls[10];		/* place to point scanner in event of error */
189
190/*
191 * macros for use with parse structure
192 * BEWARE:  these know that the parse structure is named `p' !!!
193 */
194#define	PEEK()	(*p->next)
195#define	PEEK2()	(*(p->next+1))
196#define	MORE()	(p->next < p->end)
197#define	MORE2()	(p->next+1 < p->end)
198#define	SEE(c)	(MORE() && PEEK() == (c))
199#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
200#define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
201#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
202#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
203#define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
204#define	NEXT()	(p->next++)
205#define	NEXT2()	(p->next += 2)
206#define	NEXTn(n)	(p->next += (n))
207#define	GETNEXT()	(*p->next++)
208#define	WGETNEXT()	wgetnext(p)
209#define	SETERROR(e)	seterr(p, (e))
210#define	REQUIRE(co, e)	((co) || SETERROR(e))
211#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
212#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
213#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
214#define	EMIT(op, sopnd)	doemit(p, (op), (sopnd))
215#define	INSERT(op, pos)	doinsert(p, (op), HERE()-(pos)+1, pos)
216#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
217#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
218#define	HERE()		(p->slen)
219#define	THERE()		(p->slen - 1)
220#define	THERETHERE()	(p->slen - 2)
221#define	DROP(n)	(p->slen -= (n))
222
223/* Macro used by computejump()/computematchjump() */
224#ifndef MIN
225#define MIN(a,b)	((a)<(b)?(a):(b))
226#endif
227
228#ifndef NLS
229static const struct {
230	const char *name;
231	int (*func)(int);
232} wctypes[] = {
233#define ADD(x) { .name = # x, .func = is ## x }
234	ADD(alnum),
235	ADD(alpha),
236	ADD(blank),
237	ADD(cntrl),
238	ADD(digit),
239	ADD(graph),
240	ADD(lower),
241	ADD(print),
242	ADD(punct),
243	ADD(space),
244	ADD(upper),
245	ADD(xdigit),
246#undef ADD
247};
248
249wctype_t
250__regex_wctype(const char *str)
251{
252	for (size_t i = 0; i < __arraycount(wctypes); i++) {
253		if (strcmp(wctypes[i].name, str) == 0)
254			return (wctype_t)(i + 1);
255	}
256	return (wctype_t)0;
257}
258
259int
260__regex_iswctype(wint_t c, wctype_t ct)
261{
262	if (ct == 0)
263		return 0;
264	return (*wctypes[ct - 1].func)(c);
265}
266#endif
267
268static int				/* 0 success, otherwise REG_something */
269regcomp_internal(regex_t * __restrict preg,
270	const char * __restrict pattern,
271	int cflags, int pflags)
272{
273	struct parse pa;
274	struct re_guts *g;
275	struct parse *p = &pa;
276	int i;
277	size_t len;
278	size_t maxlen;
279#ifdef REDEBUG
280#	define	GOODFLAGS(f)	(f)
281#else
282#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
283#endif
284
285	_DIAGASSERT(preg != NULL);
286	_DIAGASSERT(pattern != NULL);
287
288	cflags = GOODFLAGS(cflags);
289	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
290		return(REG_INVARG);
291
292	if (cflags&REG_PEND) {
293		if (preg->re_endp < pattern)
294			return(REG_INVARG);
295		len = preg->re_endp - pattern;
296	} else
297		len = strlen(pattern);
298
299	/* do the mallocs early so failure handling is easy */
300	g = malloc(sizeof(*g));
301	if (g == NULL)
302		return(REG_ESPACE);
303	/*
304	 * Limit the pattern space to avoid a 32-bit overflow on buffer
305	 * extension.  Also avoid any signed overflow in case of conversion
306	 * so make the real limit based on a 31-bit overflow.
307	 *
308	 * Likely not applicable on 64-bit systems but handle the case
309	 * generically (who are we to stop people from using ~715MB+
310	 * patterns?).
311	 */
312	maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
313	if (len >= maxlen) {
314		free(g);
315		return(REG_ESPACE);
316	}
317	p->ssize = (sopno)(len / 2 * 3 + 1);	/* ugh */
318	assert(p->ssize >= len);
319
320	p->strip = calloc(p->ssize, sizeof(*p->strip));
321	p->slen = 0;
322	if (p->strip == NULL) {
323		free(g);
324		return(REG_ESPACE);
325	}
326
327	/* set things up */
328	p->g = g;
329	p->next = pattern;	/* convenience; we do not modify it */
330	p->end = p->next + len;
331	p->error = 0;
332	p->ncsalloc = 0;
333	p->pflags = pflags;
334	for (i = 0; i < NPAREN; i++) {
335		p->pbegin[i] = 0;
336		p->pend[i] = 0;
337	}
338#ifdef REGEX_GNU_EXTENSIONS
339	if ((cflags & REG_GNU) == 0) {
340		p->gnuext = false;
341		p->allowbranch = (cflags & REG_EXTENDED) != 0;
342	} else
343		p->gnuext = p->allowbranch = true;
344#else
345	p->gnuext = false;
346	p->allowbranch = (cflags & REG_EXTENDED) != 0;
347#endif
348	if (cflags & REG_EXTENDED) {
349		p->bre = false;
350		p->parse_expr = p_ere_exp;
351		p->pre_parse = NULL;
352		p->post_parse = NULL;
353	} else {
354		p->bre = true;
355		p->parse_expr = p_simp_re;
356		p->pre_parse = p_bre_pre_parse;
357		p->post_parse = p_bre_post_parse;
358	}
359	g->sets = NULL;
360	g->ncsets = 0;
361	g->cflags = cflags;
362	g->iflags = 0;
363	g->nbol = 0;
364	g->neol = 0;
365	g->must = NULL;
366	g->moffset = -1;
367	g->charjump = NULL;
368	g->matchjump = NULL;
369	g->mlen = 0;
370	g->nsub = 0;
371	g->backrefs = 0;
372
373	/* do it */
374	EMIT(OEND, 0);
375	g->firststate = THERE();
376	if (cflags & REG_NOSPEC)
377		p_str(p);
378	else
379		p_re(p, OUT, OUT);
380	EMIT(OEND, 0);
381	g->laststate = THERE();
382
383	/* tidy up loose ends and fill things in */
384	stripsnug(p, g);
385	findmust(p, g);
386	/* only use Boyer-Moore algorithm if the pattern is bigger
387	 * than three characters
388	 */
389	if(g->mlen > 3) {
390		computejumps(p, g);
391		computematchjumps(p, g);
392		if(g->matchjump == NULL && g->charjump != NULL) {
393			free(g->charjump);
394			g->charjump = NULL;
395		}
396	}
397	g->nplus = pluscount(p, g);
398	g->magic = MAGIC2;
399	preg->re_nsub = g->nsub;
400	preg->re_g = g;
401	preg->re_magic = MAGIC1;
402#ifndef REDEBUG
403	/* not debugging, so can't rely on the assert() in regexec() */
404	if (g->iflags&BAD)
405		SETERROR(REG_ASSERT);
406#endif
407
408	/* win or lose, we're done */
409	if (p->error != 0)	/* lose */
410		regfree(preg);
411	return(p->error);
412}
413
414/*
415 - regcomp - interface for parser and compilation
416 = extern int regcomp(regex_t *, const char *, int);
417 = #define	REG_BASIC	0000
418 = #define	REG_EXTENDED	0001
419 = #define	REG_ICASE	0002
420 = #define	REG_NOSUB	0004
421 = #define	REG_NEWLINE	0010
422 = #define	REG_NOSPEC	0020
423 = #define	REG_PEND	0040
424 = #define	REG_DUMP	0200
425 */
426int				/* 0 success, otherwise REG_something */
427regcomp(regex_t * __restrict preg,
428	const char * __restrict pattern,
429	int cflags)
430{
431
432	return (regcomp_internal(preg, pattern, cflags, 0));
433}
434
435/*
436 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
437 - return whether we should terminate or not
438 == static bool p_ere_exp(struct parse *p);
439 */
440static bool
441p_ere_exp(struct parse *p, struct branchc *bc)
442{
443	char c;
444	wint_t wc;
445	sopno pos;
446	int count;
447	int count2;
448#ifdef REGEX_GNU_EXTENSIONS
449	size_t i;
450	int handled;
451#endif
452	sopno subno;
453	int wascaret = 0;
454
455	_DIAGASSERT(p != NULL);
456
457	(void)bc;
458	assert(MORE());		/* caller should have ensured this */
459	c = GETNEXT();
460
461#ifdef REGEX_GNU_EXTENSIONS
462	handled = 0;
463#endif
464	pos = HERE();
465	switch (c) {
466	case '(':
467		(void)REQUIRE(MORE(), REG_EPAREN);
468		p->g->nsub++;
469		subno = (sopno)p->g->nsub;
470		if (subno < NPAREN)
471			p->pbegin[subno] = HERE();
472		EMIT(OLPAREN, subno);
473		if (!SEE(')'))
474			p_re(p, ')', IGN);
475		if (subno < NPAREN) {
476			p->pend[subno] = HERE();
477			assert(p->pend[subno] != 0);
478		}
479		EMIT(ORPAREN, subno);
480		(void)MUSTEAT(')', REG_EPAREN);
481		break;
482#ifndef POSIX_MISTAKE
483	case ')':		/* happens only if no current unmatched ( */
484		/*
485		 * You may ask, why the ifndef?  Because I didn't notice
486		 * this until slightly too late for 1003.2, and none of the
487		 * other 1003.2 regular-expression reviewers noticed it at
488		 * all.  So an unmatched ) is legal POSIX, at least until
489		 * we can get it fixed.
490		 */
491		SETERROR(REG_EPAREN);
492		break;
493#endif
494	case '^':
495		EMIT(OBOL, 0);
496		p->g->iflags |= USEBOL;
497		p->g->nbol++;
498		wascaret = 1;
499		break;
500	case '$':
501		EMIT(OEOL, 0);
502		p->g->iflags |= USEEOL;
503		p->g->neol++;
504		break;
505	case '|':
506		SETERROR(REG_EMPTY);
507		break;
508	case '*':
509	case '+':
510	case '?':
511	case '{':
512		SETERROR(REG_BADRPT);
513		break;
514	case '.':
515		if (p->g->cflags&REG_NEWLINE)
516			nonnewline(p);
517		else
518			EMIT(OANY, 0);
519		break;
520	case '[':
521		p_bracket(p);
522		break;
523	case '\\':
524		(void)REQUIRE(MORE(), REG_EESCAPE);
525		wc = WGETNEXT();
526#ifdef REGEX_GNU_EXTENSIONS
527		if (p->gnuext) {
528			handled = 1;
529			switch (wc) {
530			case '`':
531				EMIT(OBOS, 0);
532				break;
533			case '\'':
534				EMIT(OEOS, 0);
535				break;
536			case 'B':
537				EMIT(ONWBND, 0);
538				break;
539			case 'b':
540				EMIT(OWBND, 0);
541				break;
542			case 'W':
543			case 'w':
544			case 'S':
545			case 's':
546				p_b_pseudoclass(p, wc);
547				break;
548			case 'a':
549				ordinary(p, '\a');
550				break;
551			case 'e':
552				ordinary(p, '\e');
553				break;
554			case 'f':
555				ordinary(p, '\f');
556				break;
557			case 'n':
558				ordinary(p, '\n');
559				break;
560			case 'r':
561				ordinary(p, '\r');
562				break;
563			case 't':
564				ordinary(p, '\t');
565				break;
566			case 'v':
567				ordinary(p, '\v');
568				break;
569			case '1':
570			case '2':
571			case '3':
572			case '4':
573			case '5':
574			case '6':
575			case '7':
576			case '8':
577			case '9':
578				i = wc - '0';
579				assert(i < NPAREN);
580				if (p->pend[i] != 0) {
581					assert(i <= p->g->nsub);
582					EMIT(OBACK_, i);
583					assert(p->pbegin[i] != 0);
584					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
585					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
586					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
587					EMIT(O_BACK, i);
588				} else
589					SETERROR(REG_ESUBREG);
590				p->g->backrefs = 1;
591				break;
592			default:
593				handled = 0;
594			}
595			/* Don't proceed to the POSIX bits if we've already handled it */
596			if (handled)
597				break;
598		}
599#endif
600		switch (wc) {
601		case '<':
602			EMIT(OBOW, 0);
603			break;
604		case '>':
605			EMIT(OEOW, 0);
606			break;
607		default:
608			if (may_escape(p, wc))
609				ordinary(p, wc);
610			else
611				SETERROR(REG_EESCAPE);
612			break;
613		}
614		break;
615	default:
616		if (p->error != 0)
617			return (false);
618		p->next--;
619		wc = WGETNEXT();
620		ordinary(p, wc);
621		break;
622	}
623
624	if (!MORE())
625		return (false);
626	c = PEEK();
627	/* we call { a repetition if followed by a digit */
628	if (!( c == '*' || c == '+' || c == '?' || c == '{'))
629		return (false);		/* no repetition, we're done */
630	else if (c == '{')
631		(void)REQUIRE(MORE2() && \
632		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
633	NEXT();
634
635	(void)REQUIRE(!wascaret, REG_BADRPT);
636	switch (c) {
637	case '*':	/* implemented as +? */
638		/* this case does not require the (y|) trick, noKLUDGE */
639		INSERT(OPLUS_, pos);
640		ASTERN(O_PLUS, pos);
641		INSERT(OQUEST_, pos);
642		ASTERN(O_QUEST, pos);
643		break;
644	case '+':
645		INSERT(OPLUS_, pos);
646		ASTERN(O_PLUS, pos);
647		break;
648	case '?':
649		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
650		INSERT(OCH_, pos);		/* offset slightly wrong */
651		ASTERN(OOR1, pos);		/* this one's right */
652		AHEAD(pos);			/* fix the OCH_ */
653		EMIT(OOR2, 0);			/* offset very wrong... */
654		AHEAD(THERE());			/* ...so fix it */
655		ASTERN(O_CH, THERETHERE());
656		break;
657	case '{':
658		count = p_count(p);
659		if (EAT(',')) {
660			if (isdigit((uch)PEEK())) {
661				count2 = p_count(p);
662				(void)REQUIRE(count <= count2, REG_BADBR);
663			} else		/* single number with comma */
664				count2 = INFINITY;
665		} else		/* just a single number */
666			count2 = count;
667		repeat(p, pos, count, count2);
668		if (!EAT('}')) {	/* error heuristics */
669			while (MORE() && PEEK() != '}')
670				NEXT();
671			(void)REQUIRE(MORE(), REG_EBRACE);
672			SETERROR(REG_BADBR);
673		}
674		break;
675	}
676
677	if (!MORE())
678		return (false);
679	c = PEEK();
680	if (!( c == '*' || c == '+' || c == '?' ||
681				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
682		return (false);
683	SETERROR(REG_BADRPT);
684	return (false);
685}
686
687/*
688 - p_str - string (no metacharacters) "parser"
689 == static void p_str(struct parse *p);
690 */
691static void
692p_str(struct parse *p)
693{
694	(void)REQUIRE(MORE(), REG_EMPTY);
695	while (MORE())
696		ordinary(p, WGETNEXT());
697}
698
699/*
700 * Eat consecutive branch delimiters for the kind of expression that we are
701 * parsing, return the number of delimiters that we ate.
702 */
703static int
704p_branch_eat_delim(struct parse *p, struct branchc *bc)
705{
706	int nskip;
707
708	(void)bc;
709	nskip = 0;
710	while (EATSPEC('|'))
711		++nskip;
712	return (nskip);
713}
714
715/*
716 * Insert necessary branch book-keeping operations. This emits a
717 * bogus 'next' offset, since we still have more to parse
718 */
719static void
720p_branch_ins_offset(struct parse *p, struct branchc *bc)
721{
722
723	if (bc->nbranch == 0) {
724		INSERT(OCH_, bc->start);	/* offset is wrong */
725		bc->fwd = bc->start;
726		bc->back = bc->start;
727	}
728
729	ASTERN(OOR1, bc->back);
730	bc->back = THERE();
731	AHEAD(bc->fwd);			/* fix previous offset */
732	bc->fwd = HERE();
733	EMIT(OOR2, 0);			/* offset is very wrong */
734	++bc->nbranch;
735}
736
737/*
738 * Fix the offset of the tail branch, if we actually had any branches.
739 * This is to correct the bogus placeholder offset that we use.
740 */
741static void
742p_branch_fix_tail(struct parse *p, struct branchc *bc)
743{
744
745	/* Fix bogus offset at the tail if we actually have branches */
746	if (bc->nbranch > 0) {
747		AHEAD(bc->fwd);
748		ASTERN(O_CH, bc->back);
749	}
750}
751
752/*
753 * Signal to the parser that an empty branch has been encountered; this will,
754 * in the future, be used to allow for more permissive behavior with empty
755 * branches. The return value should indicate whether parsing may continue
756 * or not.
757 */
758static bool
759p_branch_empty(struct parse *p, struct branchc *bc)
760{
761
762	(void)bc;
763	SETERROR(REG_EMPTY);
764	return (false);
765}
766
767/*
768 * Take care of any branching requirements. This includes inserting the
769 * appropriate branching instructions as well as eating all of the branch
770 * delimiters until we either run out of pattern or need to parse more pattern.
771 */
772static bool
773p_branch_do(struct parse *p, struct branchc *bc)
774{
775	int ate = 0;
776
777	ate = p_branch_eat_delim(p, bc);
778	if (ate == 0)
779		return (false);
780	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
781		/*
782		 * Halt parsing only if we have an empty branch and p_branch_empty
783		 * indicates that we must not continue. In the future, this will not
784		 * necessarily be an error.
785		 */
786		return (false);
787	p_branch_ins_offset(p, bc);
788
789	return (true);
790}
791
792static void
793p_bre_pre_parse(struct parse *p, struct branchc *bc)
794{
795
796	(void)bc;
797	/*
798	 * Does not move cleanly into expression parser because of
799	 * ordinary interpration of * at the beginning position of
800	 * an expression.
801	 */
802	if (EAT('^')) {
803		EMIT(OBOL, 0);
804		p->g->iflags |= USEBOL;
805		p->g->nbol++;
806	}
807}
808
809static void
810p_bre_post_parse(struct parse *p, struct branchc *bc)
811{
812
813	/* Expression is terminating due to EOL token */
814	if (bc->terminate) {
815		DROP(1);
816		EMIT(OEOL, 0);
817		p->g->iflags |= USEEOL;
818		p->g->neol++;
819	}
820}
821
822/*
823 - p_re - Top level parser, concatenation and BRE anchoring
824 == static void p_re(struct parse *p, int end1, int end2);
825 * Giving end1 as OUT essentially eliminates the end1/end2 check.
826 *
827 * This implementation is a bit of a kludge, in that a trailing $ is first
828 * taken as an ordinary character and then revised to be an anchor.
829 * The amount of lookahead needed to avoid this kludge is excessive.
830 */
831static void
832p_re(struct parse *p,
833	int end1,	/* first terminating character */
834	int end2)	/* second terminating character; ignored for EREs */
835{
836	struct branchc bc;
837
838	bc.nbranch = 0;
839	if (end1 == OUT && end2 == OUT)
840		bc.outer = true;
841	else
842		bc.outer = false;
843#define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
844	for (;;) {
845		bc.start = HERE();
846		bc.nchain = 0;
847		bc.terminate = false;
848		if (p->pre_parse != NULL)
849			p->pre_parse(p, &bc);
850		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
851			bc.terminate = p->parse_expr(p, &bc);
852			++bc.nchain;
853		}
854		if (p->post_parse != NULL)
855			p->post_parse(p, &bc);
856		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
857#ifdef REGEX_GNU_EXTENSIONS
858		if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
859			break;
860#endif
861		if (!p->allowbranch)
862			break;
863		/*
864		 * p_branch_do's return value indicates whether we should
865		 * continue parsing or not. This is both for correctness and
866		 * a slight optimization, because it will check if we've
867		 * encountered an empty branch or the end of the string
868		 * immediately following a branch delimiter.
869		 */
870		if (!p_branch_do(p, &bc))
871			break;
872	}
873#undef SEE_END
874	if (p->allowbranch)
875		p_branch_fix_tail(p, &bc);
876	assert(!MORE() || SEE(end1));
877}
878
879/*
880 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
881 == static bool p_simp_re(struct parse *p, struct branchc *bc);
882 */
883static bool			/* was the simple RE an unbackslashed $? */
884p_simp_re(struct parse *p, struct branchc *bc)
885{
886	int c;
887	int cc;			/* convenient/control character */
888	int count;
889	int count2;
890	sopno pos;
891	bool handled;
892	size_t i;
893	wint_t wc;
894	sopno subno;
895#	define	BACKSL	(1<<CHAR_BIT)
896
897	pos = HERE();		/* repetition op, if any, covers from here */
898	handled = false;
899
900	assert(MORE());		/* caller should have ensured this */
901	c = (uch)GETNEXT();
902	if (c == '\\') {
903		(void)REQUIRE(MORE(), REG_EESCAPE);
904		cc = (uch)GETNEXT();
905		c = BACKSL | cc;
906#ifdef REGEX_GNU_EXTENSIONS
907		if (p->gnuext) {
908			handled = true;
909			switch (c) {
910			case BACKSL|'`':
911				EMIT(OBOS, 0);
912				break;
913			case BACKSL|'\'':
914				EMIT(OEOS, 0);
915				break;
916			case BACKSL|'B':
917				EMIT(ONWBND, 0);
918				break;
919			case BACKSL|'b':
920				EMIT(OWBND, 0);
921				break;
922			case BACKSL|'W':
923			case BACKSL|'w':
924			case BACKSL|'S':
925			case BACKSL|'s':
926				p_b_pseudoclass(p, cc);
927				break;
928			case BACKSL|'a':
929				ordinary(p, '\a');
930				break;
931			case BACKSL|'e':
932				ordinary(p, '\e');
933				break;
934			case BACKSL|'f':
935				ordinary(p, '\f');
936				break;
937			case BACKSL|'n':
938				ordinary(p, '\n');
939				break;
940			case BACKSL|'r':
941				ordinary(p, '\r');
942				break;
943			case BACKSL|'t':
944				ordinary(p, '\t');
945				break;
946			case BACKSL|'v':
947				ordinary(p, '\v');
948				break;
949			default:
950				handled = false;
951			}
952		}
953#endif
954	}
955	if (!handled) {
956		switch (c) {
957		case '.':
958			if (p->g->cflags&REG_NEWLINE)
959				nonnewline(p);
960			else
961				EMIT(OANY, 0);
962			break;
963		case '[':
964			p_bracket(p);
965			break;
966		case BACKSL|'<':
967			EMIT(OBOW, 0);
968			break;
969		case BACKSL|'>':
970			EMIT(OEOW, 0);
971			break;
972		case BACKSL|'{':
973			SETERROR(REG_BADRPT);
974			break;
975		case BACKSL|'(':
976			p->g->nsub++;
977			subno = (sopno)p->g->nsub;
978			if (subno < NPAREN)
979				p->pbegin[subno] = HERE();
980			EMIT(OLPAREN, subno);
981			/* the MORE here is an error heuristic */
982			if (MORE() && !SEETWO('\\', ')'))
983				p_re(p, '\\', ')');
984			if (subno < NPAREN) {
985				p->pend[subno] = HERE();
986				assert(p->pend[subno] != 0);
987			}
988			EMIT(ORPAREN, subno);
989			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
990			break;
991		case BACKSL|')':	/* should not get here -- must be user */
992			SETERROR(REG_EPAREN);
993			break;
994		case BACKSL|'1':
995		case BACKSL|'2':
996		case BACKSL|'3':
997		case BACKSL|'4':
998		case BACKSL|'5':
999		case BACKSL|'6':
1000		case BACKSL|'7':
1001		case BACKSL|'8':
1002		case BACKSL|'9':
1003			i = (c&~BACKSL) - '0';
1004			assert(i < NPAREN);
1005			if (p->pend[i] != 0) {
1006				assert(i <= p->g->nsub);
1007				EMIT(OBACK_, i);
1008				assert(p->pbegin[i] != 0);
1009				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
1010				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
1011				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
1012				EMIT(O_BACK, i);
1013			} else
1014				SETERROR(REG_ESUBREG);
1015			p->g->backrefs = 1;
1016			break;
1017		case '*':
1018			/*
1019			 * Ordinary if used as the first character beyond BOL anchor of
1020			 * a (sub-)expression, counts as a bad repetition operator if it
1021			 * appears otherwise.
1022			 */
1023			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
1024			/* FALLTHROUGH */
1025		default:
1026			if (p->error != 0)
1027				return (false);	/* Definitely not $... */
1028			p->next--;
1029			wc = WGETNEXT();
1030			if ((c & BACKSL) == 0 || may_escape(p, wc))
1031				ordinary(p, wc);
1032			else
1033				SETERROR(REG_EESCAPE);
1034			break;
1035		}
1036	}
1037
1038	if (EAT('*')) {		/* implemented as +? */
1039		/* this case does not require the (y|) trick, noKLUDGE */
1040		INSERT(OPLUS_, pos);
1041		ASTERN(O_PLUS, pos);
1042		INSERT(OQUEST_, pos);
1043		ASTERN(O_QUEST, pos);
1044#ifdef REGEX_GNU_EXTENSIONS
1045	} else if (p->gnuext && EATTWO('\\', '?')) {
1046		INSERT(OQUEST_, pos);
1047		ASTERN(O_QUEST, pos);
1048	} else if (p->gnuext && EATTWO('\\', '+')) {
1049		INSERT(OPLUS_, pos);
1050		ASTERN(O_PLUS, pos);
1051#endif
1052	} else if (EATTWO('\\', '{')) {
1053		count = p_count(p);
1054		if (EAT(',')) {
1055			if (MORE() && isdigit((uch)PEEK())) {
1056				count2 = p_count(p);
1057				(void)REQUIRE(count <= count2, REG_BADBR);
1058			} else		/* single number with comma */
1059				count2 = INFINITY;
1060		} else		/* just a single number */
1061			count2 = count;
1062		repeat(p, pos, count, count2);
1063		if (!EATTWO('\\', '}')) {	/* error heuristics */
1064			while (MORE() && !SEETWO('\\', '}'))
1065				NEXT();
1066			(void)REQUIRE(MORE(), REG_EBRACE);
1067			SETERROR(REG_BADBR);
1068		}
1069	} else if (c == '$')     /* $ (but not \$) ends it */
1070		return (true);
1071
1072	return (false);
1073}
1074
1075/*
1076 - p_count - parse a repetition count
1077 == static int p_count(struct parse *p);
1078 */
1079static int			/* the value */
1080p_count(struct parse *p)
1081{
1082	int count = 0;
1083	int ndigits = 0;
1084
1085	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1086		count = count*10 + ((uch)GETNEXT() - '0');
1087		ndigits++;
1088	}
1089
1090	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1091	return(count);
1092}
1093
1094/*
1095 - p_bracket - parse a bracketed character list
1096 == static void p_bracket(struct parse *p);
1097 */
1098static void
1099p_bracket(struct parse *p)
1100{
1101	cset *cs;
1102	wint_t ch;
1103
1104	/* Dept of Truly Sickening Special-Case Kludges */
1105	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1106		EMIT(OBOW, 0);
1107		NEXTn(6);
1108		return;
1109	}
1110	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1111		EMIT(OEOW, 0);
1112		NEXTn(6);
1113		return;
1114	}
1115
1116	if ((cs = allocset(p)) == NULL)
1117		return;
1118
1119	if (p->g->cflags&REG_ICASE)
1120		cs->icase = 1;
1121	if (EAT('^'))
1122		cs->invert = 1;
1123	if (EAT(']'))
1124		CHadd(p, cs, ']');
1125	else if (EAT('-'))
1126		CHadd(p, cs, '-');
1127	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1128		p_b_term(p, cs);
1129	if (EAT('-'))
1130		CHadd(p, cs, '-');
1131	(void)MUSTEAT(']', REG_EBRACK);
1132
1133	if (p->error != 0)	/* don't mess things up further */
1134		return;
1135
1136	if (cs->invert && p->g->cflags&REG_NEWLINE)
1137		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1138
1139	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
1140		ordinary(p, ch);
1141		freeset(p, cs);
1142	} else
1143		EMIT(OANYOF, (size_t)(cs - p->g->sets));
1144}
1145
1146static int
1147p_range_cmp(wchar_t c1, wchar_t c2)
1148{
1149#ifdef REGEX_LIBC_COLLATE
1150	return __wcollate_range_cmp(c1, c2);
1151#elif defined(NLS)
1152	/* Copied from libc/collate __wcollate_range_cmp */
1153	wchar_t s1[2], s2[2];
1154
1155	s1[0] = c1;
1156	s1[1] = L'\0';
1157	s2[0] = c2;
1158	s2[1] = L'\0';
1159	return wcscoll(s1, s2);
1160#else
1161	char s1[2], s2[2];
1162
1163	s1[0] = (char)c1;
1164	s1[1] = '\0';
1165	s2[0] = (char)c2;
1166	s2[1] = '\0';
1167	return strcoll(s1, s2);
1168#endif
1169}
1170
1171/*
1172 - p_b_term - parse one term of a bracketed character list
1173 == static void p_b_term(struct parse *p, cset *cs);
1174 */
1175static void
1176p_b_term(struct parse *p, cset *cs)
1177{
1178	char c;
1179	wint_t start, finish;
1180	wint_t i;
1181#ifdef REGEX_LIBC_COLLATE
1182	struct xlocale_collate *table =
1183		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1184#endif
1185
1186	_DIAGASSERT(p != NULL);
1187	_DIAGASSERT(cs != NULL);
1188
1189	/* classify what we've got */
1190	switch ((MORE()) ? PEEK() : '\0') {
1191	case '[':
1192		c = (MORE2()) ? PEEK2() : '\0';
1193		break;
1194	case '-':
1195		SETERROR(REG_ERANGE);
1196		return;			/* NOTE RETURN */
1197	default:
1198		c = '\0';
1199		break;
1200	}
1201
1202	switch (c) {
1203	case ':':		/* character class */
1204		NEXT2();
1205		(void)REQUIRE(MORE(), REG_EBRACK);
1206		c = PEEK();
1207		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1208		p_b_cclass(p, cs);
1209		(void)REQUIRE(MORE(), REG_EBRACK);
1210		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1211		break;
1212	case '=':		/* equivalence class */
1213		NEXT2();
1214		(void)REQUIRE(MORE(), REG_EBRACK);
1215		c = PEEK();
1216		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1217		p_b_eclass(p, cs);
1218		(void)REQUIRE(MORE(), REG_EBRACK);
1219		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1220		break;
1221	default:		/* symbol, ordinary character, or range */
1222		start = p_b_symbol(p);
1223		if (SEE('-') && MORE2() && PEEK2() != ']') {
1224			/* range */
1225			NEXT();
1226			if (EAT('-'))
1227				finish = '-';
1228			else
1229				finish = p_b_symbol(p);
1230		} else
1231			finish = start;
1232		if (start == finish)
1233			CHadd(p, cs, start);
1234		else {
1235#ifdef REGEX_LIBC_COLLATE
1236			if (table->__collate_load_error || MB_CUR_MAX > 1) {
1237#else
1238			if (MB_CUR_MAX > 1) {
1239#endif
1240				(void)REQUIRE(start <= finish, REG_ERANGE);
1241				CHaddrange(p, cs, start, finish);
1242			} else {
1243				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1244				for (i = 0; i <= UCHAR_MAX; i++) {
1245					if (p_range_cmp(start, i) <= 0 &&
1246					    p_range_cmp(i, finish) <= 0 )
1247						CHadd(p, cs, i);
1248				}
1249			}
1250		}
1251		break;
1252	}
1253}
1254
1255#ifdef REGEX_GNU_EXTENSIONS
1256/*
1257 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1258 == static int p_b_pseudoclass(struct parse *p, char c)
1259 */
1260static int
1261p_b_pseudoclass(struct parse *p, char c) {
1262	cset *cs;
1263
1264	if ((cs = allocset(p)) == NULL)
1265		return(0);
1266
1267	if (p->g->cflags&REG_ICASE)
1268		cs->icase = 1;
1269
1270	switch (c) {
1271	case 'W':
1272		cs->invert = 1;
1273		/* FALLTHROUGH */
1274	case 'w':
1275		p_b_cclass_named(p, cs, "alnum");
1276		break;
1277	case 'S':
1278		cs->invert = 1;
1279		/* FALLTHROUGH */
1280	case 's':
1281		p_b_cclass_named(p, cs, "space");
1282		break;
1283	default:
1284		return(0);
1285	}
1286
1287	EMIT(OANYOF, (size_t)(cs - p->g->sets));
1288	return(1);
1289}
1290#endif
1291
1292/*
1293 - p_b_cclass - parse a character-class name and deal with it
1294 == static void p_b_cclass(struct parse *p, cset *cs);
1295 */
1296static void
1297p_b_cclass(struct parse *p, cset *cs)
1298{
1299	const char *sp = p->next;
1300	size_t len;
1301	char clname[16];
1302
1303	while (MORE() && isalpha((uch)PEEK()))
1304		NEXT();
1305	len = p->next - sp;
1306	if (len >= sizeof(clname) - 1) {
1307		SETERROR(REG_ECTYPE);
1308		return;
1309	}
1310	memcpy(clname, sp, len);
1311	clname[len] = '\0';
1312
1313	p_b_cclass_named(p, cs, clname);
1314}
1315
1316/*
1317 - p_b_cclass_named - deal with a named character class
1318 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1319 */
1320static void
1321p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1322	wctype_t wct;
1323
1324	if ((wct = wctype(clname)) == 0) {
1325		SETERROR(REG_ECTYPE);
1326		return;
1327	}
1328	CHaddtype(p, cs, wct);
1329}
1330
1331/*
1332 - p_b_eclass - parse an equivalence-class name and deal with it
1333 == static void p_b_eclass(struct parse *p, cset *cs);
1334 *
1335 * This implementation is incomplete. xxx
1336 */
1337static void
1338p_b_eclass(struct parse *p, cset *cs)
1339{
1340	wint_t c;
1341
1342	_DIAGASSERT(p != NULL);
1343	_DIAGASSERT(cs != NULL);
1344
1345	c = p_b_coll_elem(p, '=');
1346	CHadd(p, cs, c);
1347}
1348
1349/*
1350 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1351 == static wint_t p_b_symbol(struct parse *p);
1352 */
1353static wint_t			/* value of symbol */
1354p_b_symbol(struct parse *p)
1355{
1356	wint_t value;
1357
1358	_DIAGASSERT(p != NULL);
1359
1360	(void)REQUIRE(MORE(), REG_EBRACK);
1361	if (!EATTWO('[', '.'))
1362		return(WGETNEXT());
1363
1364	/* collating symbol */
1365	value = p_b_coll_elem(p, '.');
1366	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1367	return(value);
1368}
1369
1370/*
1371 - p_b_coll_elem - parse a collating-element name and look it up
1372 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1373 */
1374static wint_t			/* value of collating element */
1375p_b_coll_elem(struct parse *p,
1376	wint_t endc)		/* name ended by endc,']' */
1377{
1378	const char *sp = p->next;
1379	struct cname *cp;
1380	size_t len;
1381
1382	_DIAGASSERT(p != NULL);
1383
1384	while (MORE() && !SEETWO(endc, ']'))
1385		NEXT();
1386	if (!MORE()) {
1387		SETERROR(REG_EBRACK);
1388		return(0);
1389	}
1390	len = p->next - sp;
1391	for (cp = cnames; cp->name != NULL; cp++)
1392		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1393			return(cp->code);	/* known name */
1394#ifdef NLS
1395	mbstate_t mbs;
1396	wchar_t wc;
1397	size_t clen;
1398
1399	memset(&mbs, 0, sizeof(mbs));
1400	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1401		return (wc);			/* single character */
1402	else if (clen == (size_t)-1 || clen == (size_t)-2)
1403		SETERROR(REG_ILLSEQ);
1404	else
1405		SETERROR(REG_ECOLLATE);		/* neither */
1406	return(0);
1407#else
1408	if (len == 1)
1409		return *sp;    /* single character */
1410	SETERROR(REG_ECOLLATE);                 /* neither */
1411	return 0;
1412#endif
1413}
1414
1415/*
1416 - may_escape - determine whether 'ch' is escape-able in the current context
1417 == static int may_escape(struct parse *p, const wint_t ch)
1418 */
1419static bool
1420may_escape(struct parse *p, const wint_t ch)
1421{
1422
1423	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1424		return (true);
1425	if (iswalpha(ch) || ch == '\'' || ch == '`')
1426		return (false);
1427	return (true);
1428#ifdef NOTYET
1429	/*
1430	 * Build a whitelist of characters that may be escaped to produce an
1431	 * ordinary in the current context. This assumes that these have not
1432	 * been otherwise interpreted as a special character. Escaping an
1433	 * ordinary character yields undefined results according to
1434	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1435	 * advantage of this and use escaped ordinary characters to provide
1436	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1437	 */
1438	switch(ch) {
1439	case '|':
1440	case '+':
1441	case '?':
1442		/* The above characters may not be escaped in BREs */
1443		if (!(p->g->cflags&REG_EXTENDED))
1444			return (false);
1445		/* Fallthrough */
1446	case '(':
1447	case ')':
1448	case '{':
1449	case '}':
1450	case '.':
1451	case '[':
1452	case ']':
1453	case '\\':
1454	case '*':
1455	case '^':
1456	case '$':
1457		return (true);
1458	default:
1459		return (false);
1460	}
1461#endif
1462}
1463
1464/*
1465 - othercase - return the case counterpart of an alphabetic
1466 == static wint_t othercase(wint_t ch);
1467 */
1468static wint_t			/* if no counterpart, return ch */
1469othercase(wint_t ch)
1470{
1471	assert(iswalpha(ch));
1472	if (iswupper(ch))
1473		return(towlower(ch));
1474	else if (iswlower(ch))
1475		return(towupper(ch));
1476	else			/* peculiar, but could happen */
1477		return(ch);
1478}
1479
1480/*
1481 - bothcases - emit a dualcase version of a two-case character
1482 == static void bothcases(struct parse *p, wint_t ch);
1483 *
1484 * Boy, is this implementation ever a kludge...
1485 */
1486static void
1487bothcases(struct parse *p, wint_t ch)
1488{
1489	const char *oldnext = p->next;
1490	const char *oldend = p->end;
1491	char bracket[3 + MB_LEN_MAX];
1492	size_t n;
1493
1494	_DIAGASSERT(p != NULL);
1495
1496	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1497	p->next = bracket;
1498#ifdef NLS
1499	mbstate_t mbs;
1500	memset(&mbs, 0, sizeof(mbs));
1501	n = wcrtomb(bracket, ch, &mbs);
1502	assert(n != (size_t)-1);
1503#else
1504	n = 0;
1505	bracket[n++] = ch;
1506#endif
1507	bracket[n] = ']';
1508	bracket[n + 1] = '\0';
1509	p->end = bracket+n+1;
1510	p_bracket(p);
1511	assert(p->next == p->end);
1512	p->next = oldnext;
1513	p->end = oldend;
1514}
1515
1516/*
1517 - ordinary - emit an ordinary character
1518 == static void ordinary(struct parse *p, wint_t ch);
1519 */
1520static void
1521ordinary(struct parse *p, wint_t ch)
1522{
1523	cset *cs;
1524
1525	_DIAGASSERT(p != NULL);
1526
1527	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1528		bothcases(p, ch);
1529	else if ((wint_t)(ch & OPDMASK) == ch)
1530		EMIT(OCHAR, (size_t)ch);
1531	else {
1532		/*
1533		 * Kludge: character is too big to fit into an OCHAR operand.
1534		 * Emit a singleton set.
1535		 */
1536		if ((cs = allocset(p)) == NULL)
1537			return;
1538		CHadd(p, cs, ch);
1539		EMIT(OANYOF, (size_t)(cs - p->g->sets));
1540	}
1541}
1542
1543/*
1544 - nonnewline - emit REG_NEWLINE version of OANY
1545 == static void nonnewline(struct parse *p);
1546 *
1547 * Boy, is this implementation ever a kludge...
1548 */
1549static void
1550nonnewline(struct parse *p)
1551{
1552	const char *oldnext = p->next;
1553	const char *oldend = p->end;
1554	char bracket[4];
1555
1556	_DIAGASSERT(p != NULL);
1557
1558	p->next = bracket;
1559	p->end = bracket+3;
1560	bracket[0] = '^';
1561	bracket[1] = '\n';
1562	bracket[2] = ']';
1563	bracket[3] = '\0';
1564	p_bracket(p);
1565	assert(p->next == bracket+3);
1566	p->next = oldnext;
1567	p->end = oldend;
1568}
1569
1570/*
1571 - repeat - generate code for a bounded repetition, recursively if needed
1572 == static void repeat(struct parse *p, sopno start, int from, int to);
1573 */
1574static void
1575repeat(struct parse *p,
1576	sopno start,		/* operand from here to end of strip */
1577	int from,		/* repeated from this number */
1578	int to)			/* to this number of times (maybe INFINITY) */
1579{
1580	sopno finish = HERE();
1581#	define	N	2
1582#	define	INF	3
1583#	define	REP(f, t)	((f)*8 + (t))
1584#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1585	sopno copy;
1586
1587	_DIAGASSERT(p != NULL);
1588
1589	if (p->error != 0)	/* head off possible runaway recursion */
1590		return;
1591
1592	assert(from <= to);
1593
1594	switch (REP(MAP(from), MAP(to))) {
1595	case REP(0, 0):			/* must be user doing this */
1596		DROP(finish-start);	/* drop the operand */
1597		break;
1598	case REP(0, 1):			/* as x{1,1}? */
1599	case REP(0, N):			/* as x{1,n}? */
1600	case REP(0, INF):		/* as x{1,}? */
1601		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1602		INSERT(OCH_, start);		/* offset is wrong... */
1603		repeat(p, start+1, 1, to);
1604		ASTERN(OOR1, start);
1605		AHEAD(start);			/* ... fix it */
1606		EMIT(OOR2, 0);
1607		AHEAD(THERE());
1608		ASTERN(O_CH, THERETHERE());
1609		break;
1610	case REP(1, 1):			/* trivial case */
1611		/* done */
1612		break;
1613	case REP(1, N):			/* as x?x{1,n-1} */
1614		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1615		INSERT(OCH_, start);
1616		ASTERN(OOR1, start);
1617		AHEAD(start);
1618		EMIT(OOR2, 0);			/* offset very wrong... */
1619		AHEAD(THERE());			/* ...so fix it */
1620		ASTERN(O_CH, THERETHERE());
1621		copy = dupl(p, start+1, finish+1);
1622		assert(copy == finish+4);
1623		repeat(p, copy, 1, to-1);
1624		break;
1625	case REP(1, INF):		/* as x+ */
1626		INSERT(OPLUS_, start);
1627		ASTERN(O_PLUS, start);
1628		break;
1629	case REP(N, N):			/* as xx{m-1,n-1} */
1630		copy = dupl(p, start, finish);
1631		repeat(p, copy, from-1, to-1);
1632		break;
1633	case REP(N, INF):		/* as xx{n-1,INF} */
1634		copy = dupl(p, start, finish);
1635		repeat(p, copy, from-1, to);
1636		break;
1637	default:			/* "can't happen" */
1638		SETERROR(REG_ASSERT);	/* just in case */
1639		break;
1640	}
1641}
1642
1643/*
1644 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1645 - character from the parse struct, signals a REG_ILLSEQ error if the
1646 - character can't be converted. Returns the number of bytes consumed.
1647 */
1648static wint_t
1649wgetnext(struct parse *p)
1650{
1651#ifdef NLS
1652	mbstate_t mbs;
1653	wchar_t wc;
1654	size_t n;
1655
1656	memset(&mbs, 0, sizeof(mbs));
1657	n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1658	if (n == (size_t)-1 || n == (size_t)-2) {
1659		SETERROR(REG_ILLSEQ);
1660		return (0);
1661	}
1662	if (n == 0)
1663		n = 1;
1664	p->next += n;
1665	return wc;
1666#else
1667	return *p->next++;
1668#endif
1669}
1670
1671/*
1672 - seterr - set an error condition
1673 == static int seterr(struct parse *p, int e);
1674 */
1675static int			/* useless but makes type checking happy */
1676seterr(struct parse *p, int e)
1677{
1678
1679	_DIAGASSERT(p != NULL);
1680
1681	if (p->error == 0)	/* keep earliest error condition */
1682		p->error = e;
1683	p->next = nuls;		/* try to bring things to a halt */
1684	p->end = nuls;
1685	return(0);		/* make the return value well-defined */
1686}
1687
1688/*
1689 - allocset - allocate a set of characters for []
1690 == static cset *allocset(struct parse *p);
1691 */
1692static cset *
1693allocset(struct parse *p)
1694{
1695	cset *cs, *ncs;
1696
1697	_DIAGASSERT(p != NULL);
1698
1699	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1700	if (ncs == NULL) {
1701		SETERROR(REG_ESPACE);
1702		return (NULL);
1703	}
1704	p->g->sets = ncs;
1705	cs = &p->g->sets[p->g->ncsets++];
1706	memset(cs, 0, sizeof(*cs));
1707
1708	return(cs);
1709}
1710
1711/*
1712 - freeset - free a now-unused set
1713 == static void freeset(struct parse *p, cset *cs);
1714 */
1715static void
1716freeset(struct parse *p, cset *cs)
1717{
1718	cset *top;
1719
1720	_DIAGASSERT(p != NULL);
1721	_DIAGASSERT(cs != NULL);
1722
1723	top = &p->g->sets[p->g->ncsets];
1724
1725	free(cs->wides);
1726	free(cs->ranges);
1727	free(cs->types);
1728	memset(cs, 0, sizeof(*cs));
1729	if (cs == top-1)	/* recover only the easy case */
1730		p->g->ncsets--;
1731}
1732
1733/*
1734 - singleton - Determine whether a set contains only one character,
1735 - returning it if so, otherwise returning OUT.
1736 */
1737static wint_t
1738singleton(cset *cs)
1739{
1740	wint_t i, s, n;
1741
1742	for (i = n = 0; i < NC; i++)
1743		if (CHIN(cs, i)) {
1744			n++;
1745			s = i;
1746		}
1747	if (n == 1)
1748		return (s);
1749	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1750	    cs->icase == 0)
1751		return (cs->wides[0]);
1752	/* Don't bother handling the other cases. */
1753	return (OUT);
1754}
1755
1756/*
1757 - CHadd - add character to character set.
1758 */
1759static void
1760CHadd(struct parse *p, cset *cs, wint_t ch)
1761{
1762	wint_t nch, *newwides;
1763
1764	_DIAGASSERT(p != NULL);
1765	_DIAGASSERT(cs != NULL);
1766
1767	assert(ch >= 0);
1768	if (ch < NC)
1769		cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1770	else {
1771		newwides = reallocarray(cs->wides, cs->nwides + 1,
1772		    sizeof(*cs->wides));
1773		if (newwides == NULL) {
1774			SETERROR(REG_ESPACE);
1775			return;
1776		}
1777		cs->wides = newwides;
1778		cs->wides[cs->nwides++] = ch;
1779	}
1780	if (cs->icase) {
1781		if ((nch = towlower(ch)) < NC)
1782			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1783		if ((nch = towupper(ch)) < NC)
1784			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1785	}
1786}
1787
1788/*
1789 - CHaddrange - add all characters in the range [min,max] to a character set.
1790 */
1791static void
1792CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1793{
1794	crange *newranges;
1795
1796	_DIAGASSERT(p != NULL);
1797	_DIAGASSERT(cs != NULL);
1798
1799	for (; min < NC && min <= max; min++)
1800		CHadd(p, cs, min);
1801	if (min >= max)
1802		return;
1803	newranges = reallocarray(cs->ranges, cs->nranges + 1,
1804	    sizeof(*cs->ranges));
1805	if (newranges == NULL) {
1806		SETERROR(REG_ESPACE);
1807		return;
1808	}
1809	cs->ranges = newranges;
1810	cs->ranges[cs->nranges].min = min;
1811	cs->ranges[cs->nranges].max = max;
1812	cs->nranges++;
1813}
1814
1815/*
1816 - CHaddtype - add all characters of a certain type to a character set.
1817 */
1818static void
1819CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1820{
1821	wint_t i;
1822	wctype_t *newtypes;
1823
1824	_DIAGASSERT(p != NULL);
1825	_DIAGASSERT(cs != NULL);
1826
1827	for (i = 0; i < NC; i++)
1828		if (iswctype(i, wct))
1829			CHadd(p, cs, i);
1830	newtypes = reallocarray(cs->types, cs->ntypes + 1,
1831	    sizeof(*cs->types));
1832	if (newtypes == NULL) {
1833		SETERROR(REG_ESPACE);
1834		return;
1835	}
1836	cs->types = newtypes;
1837	cs->types[cs->ntypes++] = wct;
1838}
1839
1840/*
1841 - dupl - emit a duplicate of a bunch of sops
1842 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1843 */
1844static sopno			/* start of duplicate */
1845dupl(struct parse *p,
1846	sopno start,		/* from here */
1847	sopno finish)		/* to this less one */
1848{
1849	sopno ret = HERE();
1850	sopno len = finish - start;
1851
1852	_DIAGASSERT(p != NULL);
1853
1854	assert(finish >= start);
1855	if (len == 0)
1856		return(ret);
1857	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1858		return(ret);
1859	(void) memcpy(p->strip + p->slen,
1860	    p->strip + start, len * sizeof(*p->strip));
1861	p->slen += len;
1862	return(ret);
1863}
1864
1865/*
1866 - doemit - emit a strip operator
1867 == static void doemit(struct parse *p, sop op, size_t opnd);
1868 *
1869 * It might seem better to implement this as a macro with a function as
1870 * hard-case backup, but it's just too big and messy unless there are
1871 * some changes to the data structures.  Maybe later.
1872 */
1873static void
1874doemit(struct parse *p, sop op, size_t opnd)
1875{
1876	/* avoid making error situations worse */
1877	if (p->error != 0)
1878		return;
1879
1880	_DIAGASSERT(p != NULL);
1881
1882	/* deal with oversize operands ("can't happen", more or less) */
1883	assert(opnd < 1<<OPSHIFT);
1884
1885	/* deal with undersized strip */
1886	if (p->slen >= p->ssize)
1887		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1888			return;
1889
1890	/* finally, it's all reduced to the easy case */
1891	p->strip[p->slen++] = (sopno)SOP(op, opnd);
1892}
1893
1894/*
1895 - doinsert - insert a sop into the strip
1896 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1897 */
1898static void
1899doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1900{
1901	sopno sn;
1902	sop s;
1903	int i;
1904
1905	_DIAGASSERT(p != NULL);
1906
1907	/* avoid making error situations worse */
1908	if (p->error != 0)
1909		return;
1910
1911	sn = HERE();
1912	EMIT(op, opnd);		/* do checks, ensure space */
1913	assert(HERE() == sn+1);
1914	s = p->strip[sn];
1915
1916	/* adjust paren pointers */
1917	assert(pos > 0);
1918	for (i = 1; i < NPAREN; i++) {
1919		if (p->pbegin[i] >= pos) {
1920			p->pbegin[i]++;
1921		}
1922		if (p->pend[i] >= pos) {
1923			p->pend[i]++;
1924		}
1925	}
1926
1927	memmove(&p->strip[pos+1], &p->strip[pos],
1928	    (HERE()-pos-1)*sizeof(*p->strip));
1929	p->strip[pos] = s;
1930}
1931
1932/*
1933 - dofwd - complete a forward reference
1934 == static void dofwd(struct parse *p, sopno pos, sop value);
1935 */
1936static void
1937dofwd(struct parse *p, sopno pos, sop value)
1938{
1939
1940	_DIAGASSERT(p != NULL);
1941
1942	/* avoid making error situations worse */
1943	if (p->error != 0)
1944		return;
1945
1946	assert(value < 1<<OPSHIFT);
1947	p->strip[pos] = OP(p->strip[pos]) | value;
1948}
1949
1950/*
1951 - enlarge - enlarge the strip
1952 == static int enlarge(struct parse *p, sopno size);
1953 */
1954static int
1955enlarge(struct parse *p, sopno size)
1956{
1957	sop *sp;
1958
1959	_DIAGASSERT(p != NULL);
1960
1961	if (p->ssize >= size)
1962		return 1;
1963
1964	sp = reallocarray(p->strip, size, sizeof(*p->strip));
1965	if (sp == NULL) {
1966		SETERROR(REG_ESPACE);
1967		return 0;
1968	}
1969	p->strip = sp;
1970	p->ssize = size;
1971	return 1;
1972}
1973
1974/*
1975 - stripsnug - compact the strip
1976 == static void stripsnug(struct parse *p, struct re_guts *g);
1977 */
1978static void
1979stripsnug(struct parse *p, struct re_guts *g)
1980{
1981
1982	_DIAGASSERT(p != NULL);
1983	_DIAGASSERT(g != NULL);
1984
1985	g->nstates = p->slen;
1986	g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1987	if (g->strip == NULL) {
1988		SETERROR(REG_ESPACE);
1989		g->strip = p->strip;
1990	}
1991}
1992
1993/*
1994 - findmust - fill in must and mlen with longest mandatory literal string
1995 == static void findmust(struct parse *p, struct re_guts *g);
1996 *
1997 * This algorithm could do fancy things like analyzing the operands of |
1998 * for common subsequences.  Someday.  This code is simple and finds most
1999 * of the interesting cases.
2000 *
2001 * Note that must and mlen got initialized during setup.
2002 */
2003static void
2004findmust(struct parse *p, struct re_guts *g)
2005{
2006	sop *scan;
2007	sop *start = NULL;
2008	sop *newstart = NULL;
2009	sopno newlen;
2010	sop s;
2011	char *cp;
2012	int offset;
2013	mbstate_t mbs;
2014
2015	_DIAGASSERT(p != NULL);
2016	_DIAGASSERT(g != NULL);
2017
2018	/* avoid making error situations worse */
2019	if (p->error != 0)
2020		return;
2021
2022#ifdef notyet
2023	/*
2024	 * It's not generally safe to do a ``char'' substring search on
2025	 * multibyte character strings, but it's safe for at least
2026	 * UTF-8 (see RFC 3629).
2027	 */
2028	if (MB_CUR_MAX > 1 &&
2029	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
2030		return;
2031#endif
2032
2033	/* find the longest OCHAR sequence in strip */
2034	newlen = 0;
2035	offset = 0;
2036	g->moffset = 0;
2037	scan = g->strip + 1;
2038	do {
2039		s = *scan++;
2040		switch (OP(s)) {
2041		case OCHAR:		/* sequence member */
2042			if (newlen == 0) {		/* new sequence */
2043				memset(&mbs, 0, sizeof(mbs));
2044				newstart = scan - 1;
2045			}
2046#ifdef NLS
2047			char buf[MB_LEN_MAX];
2048			size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2049			if (clen == (size_t)-1)
2050				goto toohard;
2051			newlen += (sopno)clen;
2052#else
2053			newlen++;
2054#endif
2055			break;
2056		case OPLUS_:		/* things that don't break one */
2057		case OLPAREN:
2058		case ORPAREN:
2059			break;
2060		case OQUEST_:		/* things that must be skipped */
2061		case OCH_:
2062			offset = altoffset(scan, offset);
2063			scan--;
2064			do {
2065				scan += OPND(s);
2066				s = *scan;
2067				/* assert() interferes w debug printouts */
2068				if (OP(s) != O_QUEST &&
2069				    OP(s) != O_CH && OP(s) != OOR2) {
2070					g->iflags |= BAD;
2071					return;
2072				}
2073			} while (OP(s) != O_QUEST && OP(s) != O_CH);
2074			/* FALLTHROUGH */
2075		case OBOW:		/* things that break a sequence */
2076		case OEOW:
2077		case OBOL:
2078		case OEOL:
2079		case OBOS:
2080		case OEOS:
2081		case OWBND:
2082		case ONWBND:
2083		case O_QUEST:
2084		case O_CH:
2085		case OEND:
2086			if (newlen > (sopno)g->mlen) {		/* ends one */
2087				start = newstart;
2088				g->mlen = newlen;
2089				if (offset > -1) {
2090					g->moffset += offset;
2091					offset = newlen;
2092				} else
2093					g->moffset = offset;
2094			} else {
2095				if (offset > -1)
2096					offset += newlen;
2097			}
2098			newlen = 0;
2099			break;
2100		case OANY:
2101			if (newlen > (sopno)g->mlen) {		/* ends one */
2102				start = newstart;
2103				g->mlen = newlen;
2104				if (offset > -1) {
2105					g->moffset += offset;
2106					offset = newlen;
2107				} else
2108					g->moffset = offset;
2109			} else {
2110				if (offset > -1)
2111					offset += newlen;
2112			}
2113			if (offset > -1)
2114				offset++;
2115			newlen = 0;
2116			break;
2117		case OANYOF:		/* may or may not invalidate offset */
2118			/* First, everything as OANY */
2119			if (newlen > (sopno)g->mlen) {		/* ends one */
2120				start = newstart;
2121				g->mlen = newlen;
2122				if (offset > -1) {
2123					g->moffset += offset;
2124					offset = newlen;
2125				} else
2126					g->moffset = offset;
2127			} else {
2128				if (offset > -1)
2129					offset += newlen;
2130			}
2131			if (offset > -1)
2132				offset++;
2133			newlen = 0;
2134			break;
2135#ifdef NLS
2136		toohard:/*FALLTHROUGH*/
2137#endif
2138		default:
2139			/* Anything here makes it impossible or too hard
2140			 * to calculate the offset -- so we give up;
2141			 * save the last known good offset, in case the
2142			 * must sequence doesn't occur later.
2143			 */
2144			if (newlen > (sopno)g->mlen) {		/* ends one */
2145				start = newstart;
2146				g->mlen = newlen;
2147				if (offset > -1)
2148					g->moffset += offset;
2149				else
2150					g->moffset = offset;
2151			}
2152			offset = -1;
2153			newlen = 0;
2154			break;
2155		}
2156	} while (OP(s) != OEND);
2157
2158	if (g->mlen == 0) {		/* there isn't one */
2159		g->moffset = -1;
2160		return;
2161	}
2162
2163	/* turn it into a character string */
2164	g->must = malloc((size_t)g->mlen + 1);
2165	if (g->must == NULL) {		/* argh; just forget it */
2166		g->mlen = 0;
2167		g->moffset = -1;
2168		return;
2169	}
2170	cp = g->must;
2171	scan = start;
2172	memset(&mbs, 0, sizeof(mbs));
2173	while (cp < g->must + g->mlen) {
2174		while (OP(s = *scan++) != OCHAR)
2175			continue;
2176#ifdef NLS
2177		size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2178		assert(clen != (size_t)-1);
2179		cp += clen;
2180#else
2181		*cp++ = OPND(s);
2182#endif
2183	}
2184	assert(cp == g->must + g->mlen);
2185	*cp++ = '\0';		/* just on general principles */
2186}
2187
2188/*
2189 - altoffset - choose biggest offset among multiple choices
2190 == static int altoffset(sop *scan, int offset);
2191 *
2192 * Compute, recursively if necessary, the largest offset among multiple
2193 * re paths.
2194 */
2195static int
2196altoffset(sop *scan, int offset)
2197{
2198	int largest;
2199	int try;
2200	sop s;
2201
2202	_DIAGASSERT(scan != NULL);
2203
2204	/* If we gave up already on offsets, return */
2205	if (offset == -1)
2206		return -1;
2207
2208	largest = 0;
2209	try = 0;
2210	s = *scan++;
2211	while (OP(s) != O_QUEST && OP(s) != O_CH) {
2212		switch (OP(s)) {
2213		case OOR1:
2214			if (try > largest)
2215				largest = try;
2216			try = 0;
2217			break;
2218		case OQUEST_:
2219		case OCH_:
2220			try = altoffset(scan, try);
2221			if (try == -1)
2222				return -1;
2223			scan--;
2224			do {
2225				scan += OPND(s);
2226				s = *scan;
2227				if (OP(s) != O_QUEST &&
2228				    OP(s) != O_CH && OP(s) != OOR2)
2229					return -1;
2230			} while (OP(s) != O_QUEST && OP(s) != O_CH);
2231			/* We must skip to the next position, or we'll
2232			 * leave altoffset() too early.
2233			 */
2234			scan++;
2235			break;
2236		case OANYOF:
2237		case OCHAR:
2238		case OANY:
2239			try++;
2240			/*FALLTHROUGH*/
2241		case OBOW:
2242		case OEOW:
2243		case OWBND:
2244		case ONWBND:
2245		case OLPAREN:
2246		case ORPAREN:
2247		case OOR2:
2248			break;
2249		default:
2250			try = -1;
2251			break;
2252		}
2253		if (try == -1)
2254			return -1;
2255		s = *scan++;
2256	}
2257
2258	if (try > largest)
2259		largest = try;
2260
2261	return largest+offset;
2262}
2263
2264/*
2265 - computejumps - compute char jumps for BM scan
2266 == static void computejumps(struct parse *p, struct re_guts *g);
2267 *
2268 * This algorithm assumes g->must exists and is has size greater than
2269 * zero. It's based on the algorithm found on Computer Algorithms by
2270 * Sara Baase.
2271 *
2272 * A char jump is the number of characters one needs to jump based on
2273 * the value of the character from the text that was mismatched.
2274 */
2275static void
2276computejumps(struct parse *p, struct re_guts *g)
2277{
2278	int ch;
2279	size_t mindex;
2280
2281	_DIAGASSERT(p != NULL);
2282	_DIAGASSERT(g != NULL);
2283
2284	/* Avoid making errors worse */
2285	if (p->error != 0)
2286		return;
2287
2288	g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2289	if (g->charjump == NULL)	/* Not a fatal error */
2290		return;
2291	/* Adjust for signed chars, if necessary */
2292	g->charjump = &g->charjump[-(CHAR_MIN)];
2293
2294	/* If the character does not exist in the pattern, the jump
2295	 * is equal to the number of characters in the pattern.
2296	 */
2297	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2298		g->charjump[ch] = g->mlen;
2299
2300	/* If the character does exist, compute the jump that would
2301	 * take us to the last character in the pattern equal to it
2302	 * (notice that we match right to left, so that last character
2303	 * is the first one that would be matched).
2304	 */
2305	for (mindex = 0; mindex < g->mlen; mindex++)
2306		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2307}
2308
2309/*
2310 - computematchjumps - compute match jumps for BM scan
2311 == static void computematchjumps(struct parse *p, struct re_guts *g);
2312 *
2313 * This algorithm assumes g->must exists and is has size greater than
2314 * zero. It's based on the algorithm found on Computer Algorithms by
2315 * Sara Baase.
2316 *
2317 * A match jump is the number of characters one needs to advance based
2318 * on the already-matched suffix.
2319 * Notice that all values here are minus (g->mlen-1), because of the way
2320 * the search algorithm works.
2321 */
2322static void
2323computematchjumps(struct parse *p, struct re_guts *g)
2324{
2325	size_t mindex;		/* General "must" iterator */
2326	size_t suffix;		/* Keeps track of matching suffix */
2327	size_t ssuffix;		/* Keeps track of suffixes' suffix */
2328	size_t* pmatches;	/* pmatches[k] points to the next i
2329				 * such that i+1...mlen is a substring
2330				 * of k+1...k+mlen-i-1
2331				 */
2332
2333	_DIAGASSERT(p != NULL);
2334	_DIAGASSERT(g != NULL);
2335
2336	/* Avoid making errors worse */
2337	if (p->error != 0)
2338		return;
2339
2340	pmatches = calloc(g->mlen, sizeof(*pmatches));
2341	if (pmatches == NULL) {
2342		g->matchjump = NULL;
2343		return;
2344	}
2345
2346	g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2347	if (g->matchjump == NULL) {	/* Not a fatal error */
2348		free(pmatches);
2349		return;
2350	}
2351
2352	/* Set maximum possible jump for each character in the pattern */
2353	for (mindex = 0; mindex < g->mlen; mindex++)
2354		g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2355
2356	/* Compute pmatches[] */
2357	for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2358		pmatches[mindex] = suffix;
2359
2360		/* If a mismatch is found, interrupting the substring,
2361		 * compute the matchjump for that position. If no
2362		 * mismatch is found, then a text substring mismatched
2363		 * against the suffix will also mismatch against the
2364		 * substring.
2365		 */
2366		while (suffix < g->mlen
2367		    && g->must[mindex] != g->must[suffix]) {
2368			g->matchjump[suffix] = MIN(g->matchjump[suffix],
2369			    g->mlen - mindex - 1);
2370			suffix = pmatches[suffix];
2371		}
2372	}
2373
2374	/* Compute the matchjump up to the last substring found to jump
2375	 * to the beginning of the largest must pattern prefix matching
2376	 * it's own suffix.
2377	 */
2378	for (mindex = 0; mindex <= suffix; mindex++)
2379		g->matchjump[mindex] = MIN(g->matchjump[mindex],
2380		    g->mlen + suffix - mindex);
2381
2382        ssuffix = pmatches[suffix];
2383        while (suffix < g->mlen) {
2384                while (suffix <= ssuffix && suffix < g->mlen) {
2385                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
2386			    g->mlen + ssuffix - suffix);
2387                        suffix++;
2388                }
2389		if (suffix < g->mlen)
2390                	ssuffix = pmatches[ssuffix];
2391        }
2392
2393	free(pmatches);
2394}
2395
2396/*
2397 - pluscount - count + nesting
2398 == static sopno pluscount(struct parse *p, struct re_guts *g);
2399 */
2400static sopno			/* nesting depth */
2401pluscount(struct parse *p, struct re_guts *g)
2402{
2403	sop *scan;
2404	sop s;
2405	sopno plusnest = 0;
2406	sopno maxnest = 0;
2407
2408	_DIAGASSERT(p != NULL);
2409	_DIAGASSERT(g != NULL);
2410
2411	if (p->error != 0)
2412		return(0);	/* there may not be an OEND */
2413
2414	scan = g->strip + 1;
2415	do {
2416		s = *scan++;
2417		switch (OP(s)) {
2418		case OPLUS_:
2419			plusnest++;
2420			break;
2421		case O_PLUS:
2422			if (plusnest > maxnest)
2423				maxnest = plusnest;
2424			plusnest--;
2425			break;
2426		}
2427	} while (OP(s) != OEND);
2428	if (plusnest != 0)
2429		g->iflags |= BAD;
2430	return(maxnest);
2431}
2432