1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * Copyright (c) 2011 The FreeBSD Foundation
7 * All rights reserved.
8 * Portions of this software were developed by David Chisnall
9 * under sponsorship from the FreeBSD Foundation.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * Henry Spencer.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
39 */
40
41#if defined(LIBC_SCCS) && !defined(lint)
42static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
43#endif /* LIBC_SCCS and not lint */
44#include <sys/cdefs.h>
45__FBSDID("$FreeBSD: stable/11/lib/libc/regex/regcomp.c 328151 2018-01-18 22:10:00Z kevans $");
46
47#include <sys/types.h>
48#include <stdio.h>
49#include <string.h>
50#include <ctype.h>
51#include <limits.h>
52#include <stdlib.h>
53#include <regex.h>
54#include <stdbool.h>
55#include <wchar.h>
56#include <wctype.h>
57
58#include "collate.h"
59
60#include "utils.h"
61#include "regex2.h"
62
63#include "cname.h"
64
65/*
66 * Branching context, used to keep track of branch state for all of the branch-
67 * aware functions. In addition to keeping track of branch positions for the
68 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
69 * detection of whether ^ is acting as an anchor or being used erroneously and
70 * also for whether we're in a sub-expression or not.
71 */
72struct branchc {
73	sopno start;
74	sopno back;
75	sopno fwd;
76
77	int nbranch;
78	int nchain;
79	bool outer;
80	bool terminate;
81};
82
83/*
84 * parse structure, passed up and down to avoid global variables and
85 * other clumsinesses
86 */
87struct parse {
88	const char *next;	/* next character in RE */
89	const char *end;	/* end of string (-> NUL normally) */
90	int error;		/* has an error been seen? */
91	sop *strip;		/* malloced strip */
92	sopno ssize;		/* malloced strip size (allocated) */
93	sopno slen;		/* malloced strip length (used) */
94	int ncsalloc;		/* number of csets allocated */
95	struct re_guts *g;
96#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
97	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
98	sopno pend[NPAREN];	/* -> ) ([0] unused) */
99	bool allowbranch;	/* can this expression branch? */
100	bool bre;		/* convenience; is this a BRE? */
101	bool (*parse_expr)(struct parse *, struct branchc *);
102	void (*pre_parse)(struct parse *, struct branchc *);
103	void (*post_parse)(struct parse *, struct branchc *);
104};
105
106/* ========= begin header generated by ./mkh ========= */
107#ifdef __cplusplus
108extern "C" {
109#endif
110
111/* === regcomp.c === */
112static bool p_ere_exp(struct parse *p, struct branchc *bc);
113static void p_str(struct parse *p);
114static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
115static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
116static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
117static bool p_branch_empty(struct parse *p, struct branchc *bc);
118static bool p_branch_do(struct parse *p, struct branchc *bc);
119static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
120static void p_bre_post_parse(struct parse *p, struct branchc *bc);
121static void p_re(struct parse *p, int end1, int end2);
122static bool p_simp_re(struct parse *p, struct branchc *bc);
123static int p_count(struct parse *p);
124static void p_bracket(struct parse *p);
125static void p_b_term(struct parse *p, cset *cs);
126static void p_b_cclass(struct parse *p, cset *cs);
127static void p_b_eclass(struct parse *p, cset *cs);
128static wint_t p_b_symbol(struct parse *p);
129static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
130static wint_t othercase(wint_t ch);
131static void bothcases(struct parse *p, wint_t ch);
132static void ordinary(struct parse *p, wint_t ch);
133static void nonnewline(struct parse *p);
134static void repeat(struct parse *p, sopno start, int from, int to);
135static int seterr(struct parse *p, int e);
136static cset *allocset(struct parse *p);
137static void freeset(struct parse *p, cset *cs);
138static void CHadd(struct parse *p, cset *cs, wint_t ch);
139static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
140static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
141static wint_t singleton(cset *cs);
142static sopno dupl(struct parse *p, sopno start, sopno finish);
143static void doemit(struct parse *p, sop op, size_t opnd);
144static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
145static void dofwd(struct parse *p, sopno pos, sop value);
146static int enlarge(struct parse *p, sopno size);
147static void stripsnug(struct parse *p, struct re_guts *g);
148static void findmust(struct parse *p, struct re_guts *g);
149static int altoffset(sop *scan, int offset);
150static void computejumps(struct parse *p, struct re_guts *g);
151static void computematchjumps(struct parse *p, struct re_guts *g);
152static sopno pluscount(struct parse *p, struct re_guts *g);
153static wint_t wgetnext(struct parse *p);
154
155#ifdef __cplusplus
156}
157#endif
158/* ========= end header generated by ./mkh ========= */
159
160static char nuls[10];		/* place to point scanner in event of error */
161
162/*
163 * macros for use with parse structure
164 * BEWARE:  these know that the parse structure is named `p' !!!
165 */
166#define	PEEK()	(*p->next)
167#define	PEEK2()	(*(p->next+1))
168#define	MORE()	(p->next < p->end)
169#define	MORE2()	(p->next+1 < p->end)
170#define	SEE(c)	(MORE() && PEEK() == (c))
171#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
172#define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
173#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
174#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
175#define	NEXT()	(p->next++)
176#define	NEXT2()	(p->next += 2)
177#define	NEXTn(n)	(p->next += (n))
178#define	GETNEXT()	(*p->next++)
179#define	WGETNEXT()	wgetnext(p)
180#define	SETERROR(e)	seterr(p, (e))
181#define	REQUIRE(co, e)	((co) || SETERROR(e))
182#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
183#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
184#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
185#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
186#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
187#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
188#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
189#define	HERE()		(p->slen)
190#define	THERE()		(p->slen - 1)
191#define	THERETHERE()	(p->slen - 2)
192#define	DROP(n)	(p->slen -= (n))
193
194#ifndef NDEBUG
195static int never = 0;		/* for use in asserts; shuts lint up */
196#else
197#define	never	0		/* some <assert.h>s have bugs too */
198#endif
199
200/* Macro used by computejump()/computematchjump() */
201#define MIN(a,b)	((a)<(b)?(a):(b))
202
203/*
204 - regcomp - interface for parser and compilation
205 = extern int regcomp(regex_t *, const char *, int);
206 = #define	REG_BASIC	0000
207 = #define	REG_EXTENDED	0001
208 = #define	REG_ICASE	0002
209 = #define	REG_NOSUB	0004
210 = #define	REG_NEWLINE	0010
211 = #define	REG_NOSPEC	0020
212 = #define	REG_PEND	0040
213 = #define	REG_DUMP	0200
214 */
215int				/* 0 success, otherwise REG_something */
216regcomp(regex_t * __restrict preg,
217	const char * __restrict pattern,
218	int cflags)
219{
220	struct parse pa;
221	struct re_guts *g;
222	struct parse *p = &pa;
223	int i;
224	size_t len;
225	size_t maxlen;
226#ifdef REDEBUG
227#	define	GOODFLAGS(f)	(f)
228#else
229#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
230#endif
231
232	cflags = GOODFLAGS(cflags);
233	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
234		return(REG_INVARG);
235
236	if (cflags&REG_PEND) {
237		if (preg->re_endp < pattern)
238			return(REG_INVARG);
239		len = preg->re_endp - pattern;
240	} else
241		len = strlen(pattern);
242
243	/* do the mallocs early so failure handling is easy */
244	g = (struct re_guts *)malloc(sizeof(struct re_guts));
245	if (g == NULL)
246		return(REG_ESPACE);
247	/*
248	 * Limit the pattern space to avoid a 32-bit overflow on buffer
249	 * extension.  Also avoid any signed overflow in case of conversion
250	 * so make the real limit based on a 31-bit overflow.
251	 *
252	 * Likely not applicable on 64-bit systems but handle the case
253	 * generically (who are we to stop people from using ~715MB+
254	 * patterns?).
255	 */
256	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
257	if (len >= maxlen) {
258		free((char *)g);
259		return(REG_ESPACE);
260	}
261	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
262	assert(p->ssize >= len);
263
264	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
265	p->slen = 0;
266	if (p->strip == NULL) {
267		free((char *)g);
268		return(REG_ESPACE);
269	}
270
271	/* set things up */
272	p->g = g;
273	p->next = pattern;	/* convenience; we do not modify it */
274	p->end = p->next + len;
275	p->error = 0;
276	p->ncsalloc = 0;
277	for (i = 0; i < NPAREN; i++) {
278		p->pbegin[i] = 0;
279		p->pend[i] = 0;
280	}
281	if (cflags & REG_EXTENDED) {
282		p->allowbranch = true;
283		p->bre = false;
284		p->parse_expr = p_ere_exp;
285		p->pre_parse = NULL;
286		p->post_parse = NULL;
287	} else {
288		p->allowbranch = false;
289		p->bre = true;
290		p->parse_expr = p_simp_re;
291		p->pre_parse = p_bre_pre_parse;
292		p->post_parse = p_bre_post_parse;
293	}
294	g->sets = NULL;
295	g->ncsets = 0;
296	g->cflags = cflags;
297	g->iflags = 0;
298	g->nbol = 0;
299	g->neol = 0;
300	g->must = NULL;
301	g->moffset = -1;
302	g->charjump = NULL;
303	g->matchjump = NULL;
304	g->mlen = 0;
305	g->nsub = 0;
306	g->backrefs = 0;
307
308	/* do it */
309	EMIT(OEND, 0);
310	g->firststate = THERE();
311	if (cflags & REG_NOSPEC)
312		p_str(p);
313	else
314		p_re(p, OUT, OUT);
315	EMIT(OEND, 0);
316	g->laststate = THERE();
317
318	/* tidy up loose ends and fill things in */
319	stripsnug(p, g);
320	findmust(p, g);
321	/* only use Boyer-Moore algorithm if the pattern is bigger
322	 * than three characters
323	 */
324	if(g->mlen > 3) {
325		computejumps(p, g);
326		computematchjumps(p, g);
327		if(g->matchjump == NULL && g->charjump != NULL) {
328			free(g->charjump);
329			g->charjump = NULL;
330		}
331	}
332	g->nplus = pluscount(p, g);
333	g->magic = MAGIC2;
334	preg->re_nsub = g->nsub;
335	preg->re_g = g;
336	preg->re_magic = MAGIC1;
337#ifndef REDEBUG
338	/* not debugging, so can't rely on the assert() in regexec() */
339	if (g->iflags&BAD)
340		SETERROR(REG_ASSERT);
341#endif
342
343	/* win or lose, we're done */
344	if (p->error != 0)	/* lose */
345		regfree(preg);
346	return(p->error);
347}
348
349/*
350 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
351 - return whether we should terminate or not
352 == static bool p_ere_exp(struct parse *p);
353 */
354static bool
355p_ere_exp(struct parse *p, struct branchc *bc)
356{
357	char c;
358	wint_t wc;
359	sopno pos;
360	int count;
361	int count2;
362	sopno subno;
363	int wascaret = 0;
364
365	assert(MORE());		/* caller should have ensured this */
366	c = GETNEXT();
367
368	pos = HERE();
369	switch (c) {
370	case '(':
371		(void)REQUIRE(MORE(), REG_EPAREN);
372		p->g->nsub++;
373		subno = p->g->nsub;
374		if (subno < NPAREN)
375			p->pbegin[subno] = HERE();
376		EMIT(OLPAREN, subno);
377		if (!SEE(')'))
378			p_re(p, ')', IGN);
379		if (subno < NPAREN) {
380			p->pend[subno] = HERE();
381			assert(p->pend[subno] != 0);
382		}
383		EMIT(ORPAREN, subno);
384		(void)MUSTEAT(')', REG_EPAREN);
385		break;
386#ifndef POSIX_MISTAKE
387	case ')':		/* happens only if no current unmatched ( */
388		/*
389		 * You may ask, why the ifndef?  Because I didn't notice
390		 * this until slightly too late for 1003.2, and none of the
391		 * other 1003.2 regular-expression reviewers noticed it at
392		 * all.  So an unmatched ) is legal POSIX, at least until
393		 * we can get it fixed.
394		 */
395		SETERROR(REG_EPAREN);
396		break;
397#endif
398	case '^':
399		EMIT(OBOL, 0);
400		p->g->iflags |= USEBOL;
401		p->g->nbol++;
402		wascaret = 1;
403		break;
404	case '$':
405		EMIT(OEOL, 0);
406		p->g->iflags |= USEEOL;
407		p->g->neol++;
408		break;
409	case '|':
410		SETERROR(REG_EMPTY);
411		break;
412	case '*':
413	case '+':
414	case '?':
415		SETERROR(REG_BADRPT);
416		break;
417	case '.':
418		if (p->g->cflags&REG_NEWLINE)
419			nonnewline(p);
420		else
421			EMIT(OANY, 0);
422		break;
423	case '[':
424		p_bracket(p);
425		break;
426	case '\\':
427		(void)REQUIRE(MORE(), REG_EESCAPE);
428		wc = WGETNEXT();
429		switch (wc) {
430		case '<':
431			EMIT(OBOW, 0);
432			break;
433		case '>':
434			EMIT(OEOW, 0);
435			break;
436		default:
437			ordinary(p, wc);
438			break;
439		}
440		break;
441	case '{':		/* okay as ordinary except if digit follows */
442		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
443		/* FALLTHROUGH */
444	default:
445		if (p->error != 0)
446			return (false);
447		p->next--;
448		wc = WGETNEXT();
449		ordinary(p, wc);
450		break;
451	}
452
453	if (!MORE())
454		return (false);
455	c = PEEK();
456	/* we call { a repetition if followed by a digit */
457	if (!( c == '*' || c == '+' || c == '?' ||
458				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
459		return (false);		/* no repetition, we're done */
460	NEXT();
461
462	(void)REQUIRE(!wascaret, REG_BADRPT);
463	switch (c) {
464	case '*':	/* implemented as +? */
465		/* this case does not require the (y|) trick, noKLUDGE */
466		INSERT(OPLUS_, pos);
467		ASTERN(O_PLUS, pos);
468		INSERT(OQUEST_, pos);
469		ASTERN(O_QUEST, pos);
470		break;
471	case '+':
472		INSERT(OPLUS_, pos);
473		ASTERN(O_PLUS, pos);
474		break;
475	case '?':
476		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
477		INSERT(OCH_, pos);		/* offset slightly wrong */
478		ASTERN(OOR1, pos);		/* this one's right */
479		AHEAD(pos);			/* fix the OCH_ */
480		EMIT(OOR2, 0);			/* offset very wrong... */
481		AHEAD(THERE());			/* ...so fix it */
482		ASTERN(O_CH, THERETHERE());
483		break;
484	case '{':
485		count = p_count(p);
486		if (EAT(',')) {
487			if (isdigit((uch)PEEK())) {
488				count2 = p_count(p);
489				(void)REQUIRE(count <= count2, REG_BADBR);
490			} else		/* single number with comma */
491				count2 = INFINITY;
492		} else		/* just a single number */
493			count2 = count;
494		repeat(p, pos, count, count2);
495		if (!EAT('}')) {	/* error heuristics */
496			while (MORE() && PEEK() != '}')
497				NEXT();
498			(void)REQUIRE(MORE(), REG_EBRACE);
499			SETERROR(REG_BADBR);
500		}
501		break;
502	}
503
504	if (!MORE())
505		return (false);
506	c = PEEK();
507	if (!( c == '*' || c == '+' || c == '?' ||
508				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
509		return (false);
510	SETERROR(REG_BADRPT);
511	return (false);
512}
513
514/*
515 - p_str - string (no metacharacters) "parser"
516 == static void p_str(struct parse *p);
517 */
518static void
519p_str(struct parse *p)
520{
521	(void)REQUIRE(MORE(), REG_EMPTY);
522	while (MORE())
523		ordinary(p, WGETNEXT());
524}
525
526/*
527 * Eat consecutive branch delimiters for the kind of expression that we are
528 * parsing, return the number of delimiters that we ate.
529 */
530static int
531p_branch_eat_delim(struct parse *p, struct branchc *bc)
532{
533	int nskip;
534
535	nskip = 0;
536	while (EAT('|'))
537		++nskip;
538	return (nskip);
539}
540
541/*
542 * Insert necessary branch book-keeping operations. This emits a
543 * bogus 'next' offset, since we still have more to parse
544 */
545static void
546p_branch_ins_offset(struct parse *p, struct branchc *bc)
547{
548
549	if (bc->nbranch == 0) {
550		INSERT(OCH_, bc->start);	/* offset is wrong */
551		bc->fwd = bc->start;
552		bc->back = bc->start;
553	}
554
555	ASTERN(OOR1, bc->back);
556	bc->back = THERE();
557	AHEAD(bc->fwd);			/* fix previous offset */
558	bc->fwd = HERE();
559	EMIT(OOR2, 0);			/* offset is very wrong */
560	++bc->nbranch;
561}
562
563/*
564 * Fix the offset of the tail branch, if we actually had any branches.
565 * This is to correct the bogus placeholder offset that we use.
566 */
567static void
568p_branch_fix_tail(struct parse *p, struct branchc *bc)
569{
570
571	/* Fix bogus offset at the tail if we actually have branches */
572	if (bc->nbranch > 0) {
573		AHEAD(bc->fwd);
574		ASTERN(O_CH, bc->back);
575	}
576}
577
578/*
579 * Signal to the parser that an empty branch has been encountered; this will,
580 * in the future, be used to allow for more permissive behavior with empty
581 * branches. The return value should indicate whether parsing may continue
582 * or not.
583 */
584static bool
585p_branch_empty(struct parse *p, struct branchc *bc)
586{
587
588	SETERROR(REG_EMPTY);
589	return (false);
590}
591
592/*
593 * Take care of any branching requirements. This includes inserting the
594 * appropriate branching instructions as well as eating all of the branch
595 * delimiters until we either run out of pattern or need to parse more pattern.
596 */
597static bool
598p_branch_do(struct parse *p, struct branchc *bc)
599{
600	int ate = 0;
601
602	ate = p_branch_eat_delim(p, bc);
603	if (ate == 0)
604		return (false);
605	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
606		/*
607		 * Halt parsing only if we have an empty branch and p_branch_empty
608		 * indicates that we must not continue. In the future, this will not
609		 * necessarily be an error.
610		 */
611		return (false);
612	p_branch_ins_offset(p, bc);
613
614	return (true);
615}
616
617static void
618p_bre_pre_parse(struct parse *p, struct branchc *bc)
619{
620
621	(void) bc;
622	/*
623	 * Does not move cleanly into expression parser because of
624	 * ordinary interpration of * at the beginning position of
625	 * an expression.
626	 */
627	if (EAT('^')) {
628		EMIT(OBOL, 0);
629		p->g->iflags |= USEBOL;
630		p->g->nbol++;
631	}
632}
633
634static void
635p_bre_post_parse(struct parse *p, struct branchc *bc)
636{
637
638	/* Expression is terminating due to EOL token */
639	if (bc->terminate) {
640		DROP(1);
641		EMIT(OEOL, 0);
642		p->g->iflags |= USEEOL;
643		p->g->neol++;
644	}
645}
646
647/*
648 - p_re - Top level parser, concatenation and BRE anchoring
649 == static void p_re(struct parse *p, int end1, int end2);
650 * Giving end1 as OUT essentially eliminates the end1/end2 check.
651 *
652 * This implementation is a bit of a kludge, in that a trailing $ is first
653 * taken as an ordinary character and then revised to be an anchor.
654 * The amount of lookahead needed to avoid this kludge is excessive.
655 */
656static void
657p_re(struct parse *p,
658	int end1,	/* first terminating character */
659	int end2)	/* second terminating character; ignored for EREs */
660{
661	struct branchc bc;
662
663	bc.nbranch = 0;
664	if (end1 == OUT && end2 == OUT)
665		bc.outer = true;
666	else
667		bc.outer = false;
668#define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
669	for (;;) {
670		bc.start = HERE();
671		bc.nchain = 0;
672		bc.terminate = false;
673		if (p->pre_parse != NULL)
674			p->pre_parse(p, &bc);
675		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
676			bc.terminate = p->parse_expr(p, &bc);
677			++bc.nchain;
678		}
679		if (p->post_parse != NULL)
680			p->post_parse(p, &bc);
681		(void) REQUIRE(HERE() != bc.start, REG_EMPTY);
682		if (!p->allowbranch)
683			break;
684		/*
685		 * p_branch_do's return value indicates whether we should
686		 * continue parsing or not. This is both for correctness and
687		 * a slight optimization, because it will check if we've
688		 * encountered an empty branch or the end of the string
689		 * immediately following a branch delimiter.
690		 */
691		if (!p_branch_do(p, &bc))
692			break;
693	}
694#undef SEE_END
695	if (p->allowbranch)
696		p_branch_fix_tail(p, &bc);
697	assert(!MORE() || SEE(end1));
698}
699
700/*
701 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
702 == static bool p_simp_re(struct parse *p, struct branchc *bc);
703 */
704static bool			/* was the simple RE an unbackslashed $? */
705p_simp_re(struct parse *p, struct branchc *bc)
706{
707	int c;
708	int count;
709	int count2;
710	sopno pos;
711	int i;
712	wint_t wc;
713	sopno subno;
714#	define	BACKSL	(1<<CHAR_BIT)
715
716	pos = HERE();		/* repetition op, if any, covers from here */
717
718	assert(MORE());		/* caller should have ensured this */
719	c = GETNEXT();
720	if (c == '\\') {
721		(void)REQUIRE(MORE(), REG_EESCAPE);
722		c = BACKSL | GETNEXT();
723	}
724	switch (c) {
725	case '.':
726		if (p->g->cflags&REG_NEWLINE)
727			nonnewline(p);
728		else
729			EMIT(OANY, 0);
730		break;
731	case '[':
732		p_bracket(p);
733		break;
734	case BACKSL|'<':
735		EMIT(OBOW, 0);
736		break;
737	case BACKSL|'>':
738		EMIT(OEOW, 0);
739		break;
740	case BACKSL|'{':
741		SETERROR(REG_BADRPT);
742		break;
743	case BACKSL|'(':
744		p->g->nsub++;
745		subno = p->g->nsub;
746		if (subno < NPAREN)
747			p->pbegin[subno] = HERE();
748		EMIT(OLPAREN, subno);
749		/* the MORE here is an error heuristic */
750		if (MORE() && !SEETWO('\\', ')'))
751			p_re(p, '\\', ')');
752		if (subno < NPAREN) {
753			p->pend[subno] = HERE();
754			assert(p->pend[subno] != 0);
755		}
756		EMIT(ORPAREN, subno);
757		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
758		break;
759	case BACKSL|')':	/* should not get here -- must be user */
760	case BACKSL|'}':
761		SETERROR(REG_EPAREN);
762		break;
763	case BACKSL|'1':
764	case BACKSL|'2':
765	case BACKSL|'3':
766	case BACKSL|'4':
767	case BACKSL|'5':
768	case BACKSL|'6':
769	case BACKSL|'7':
770	case BACKSL|'8':
771	case BACKSL|'9':
772		i = (c&~BACKSL) - '0';
773		assert(i < NPAREN);
774		if (p->pend[i] != 0) {
775			assert(i <= p->g->nsub);
776			EMIT(OBACK_, i);
777			assert(p->pbegin[i] != 0);
778			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
779			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
780			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
781			EMIT(O_BACK, i);
782		} else
783			SETERROR(REG_ESUBREG);
784		p->g->backrefs = 1;
785		break;
786	case '*':
787		/*
788		 * Ordinary if used as the first character beyond BOL anchor of
789		 * a (sub-)expression, counts as a bad repetition operator if it
790		 * appears otherwise.
791		 */
792		(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
793		/* FALLTHROUGH */
794	default:
795		if (p->error != 0)
796			return (false);	/* Definitely not $... */
797		p->next--;
798		wc = WGETNEXT();
799		ordinary(p, wc);
800		break;
801	}
802
803	if (EAT('*')) {		/* implemented as +? */
804		/* this case does not require the (y|) trick, noKLUDGE */
805		INSERT(OPLUS_, pos);
806		ASTERN(O_PLUS, pos);
807		INSERT(OQUEST_, pos);
808		ASTERN(O_QUEST, pos);
809	} else if (EATTWO('\\', '{')) {
810		count = p_count(p);
811		if (EAT(',')) {
812			if (MORE() && isdigit((uch)PEEK())) {
813				count2 = p_count(p);
814				(void)REQUIRE(count <= count2, REG_BADBR);
815			} else		/* single number with comma */
816				count2 = INFINITY;
817		} else		/* just a single number */
818			count2 = count;
819		repeat(p, pos, count, count2);
820		if (!EATTWO('\\', '}')) {	/* error heuristics */
821			while (MORE() && !SEETWO('\\', '}'))
822				NEXT();
823			(void)REQUIRE(MORE(), REG_EBRACE);
824			SETERROR(REG_BADBR);
825		}
826	} else if (c == '$')     /* $ (but not \$) ends it */
827		return (true);
828
829	return (false);
830}
831
832/*
833 - p_count - parse a repetition count
834 == static int p_count(struct parse *p);
835 */
836static int			/* the value */
837p_count(struct parse *p)
838{
839	int count = 0;
840	int ndigits = 0;
841
842	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
843		count = count*10 + (GETNEXT() - '0');
844		ndigits++;
845	}
846
847	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
848	return(count);
849}
850
851/*
852 - p_bracket - parse a bracketed character list
853 == static void p_bracket(struct parse *p);
854 */
855static void
856p_bracket(struct parse *p)
857{
858	cset *cs;
859	wint_t ch;
860
861	/* Dept of Truly Sickening Special-Case Kludges */
862	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
863		EMIT(OBOW, 0);
864		NEXTn(6);
865		return;
866	}
867	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
868		EMIT(OEOW, 0);
869		NEXTn(6);
870		return;
871	}
872
873	if ((cs = allocset(p)) == NULL)
874		return;
875
876	if (p->g->cflags&REG_ICASE)
877		cs->icase = 1;
878	if (EAT('^'))
879		cs->invert = 1;
880	if (EAT(']'))
881		CHadd(p, cs, ']');
882	else if (EAT('-'))
883		CHadd(p, cs, '-');
884	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
885		p_b_term(p, cs);
886	if (EAT('-'))
887		CHadd(p, cs, '-');
888	(void)MUSTEAT(']', REG_EBRACK);
889
890	if (p->error != 0)	/* don't mess things up further */
891		return;
892
893	if (cs->invert && p->g->cflags&REG_NEWLINE)
894		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
895
896	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
897		ordinary(p, ch);
898		freeset(p, cs);
899	} else
900		EMIT(OANYOF, (int)(cs - p->g->sets));
901}
902
903/*
904 - p_b_term - parse one term of a bracketed character list
905 == static void p_b_term(struct parse *p, cset *cs);
906 */
907static void
908p_b_term(struct parse *p, cset *cs)
909{
910	char c;
911	wint_t start, finish;
912	wint_t i;
913	struct xlocale_collate *table =
914		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
915
916	/* classify what we've got */
917	switch ((MORE()) ? PEEK() : '\0') {
918	case '[':
919		c = (MORE2()) ? PEEK2() : '\0';
920		break;
921	case '-':
922		SETERROR(REG_ERANGE);
923		return;			/* NOTE RETURN */
924	default:
925		c = '\0';
926		break;
927	}
928
929	switch (c) {
930	case ':':		/* character class */
931		NEXT2();
932		(void)REQUIRE(MORE(), REG_EBRACK);
933		c = PEEK();
934		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
935		p_b_cclass(p, cs);
936		(void)REQUIRE(MORE(), REG_EBRACK);
937		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
938		break;
939	case '=':		/* equivalence class */
940		NEXT2();
941		(void)REQUIRE(MORE(), REG_EBRACK);
942		c = PEEK();
943		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
944		p_b_eclass(p, cs);
945		(void)REQUIRE(MORE(), REG_EBRACK);
946		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
947		break;
948	default:		/* symbol, ordinary character, or range */
949		start = p_b_symbol(p);
950		if (SEE('-') && MORE2() && PEEK2() != ']') {
951			/* range */
952			NEXT();
953			if (EAT('-'))
954				finish = '-';
955			else
956				finish = p_b_symbol(p);
957		} else
958			finish = start;
959		if (start == finish)
960			CHadd(p, cs, start);
961		else {
962			if (table->__collate_load_error || MB_CUR_MAX > 1) {
963				(void)REQUIRE(start <= finish, REG_ERANGE);
964				CHaddrange(p, cs, start, finish);
965			} else {
966				(void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE);
967				for (i = 0; i <= UCHAR_MAX; i++) {
968					if (   __wcollate_range_cmp(start, i) <= 0
969					    && __wcollate_range_cmp(i, finish) <= 0
970					   )
971						CHadd(p, cs, i);
972				}
973			}
974		}
975		break;
976	}
977}
978
979/*
980 - p_b_cclass - parse a character-class name and deal with it
981 == static void p_b_cclass(struct parse *p, cset *cs);
982 */
983static void
984p_b_cclass(struct parse *p, cset *cs)
985{
986	const char *sp = p->next;
987	size_t len;
988	wctype_t wct;
989	char clname[16];
990
991	while (MORE() && isalpha((uch)PEEK()))
992		NEXT();
993	len = p->next - sp;
994	if (len >= sizeof(clname) - 1) {
995		SETERROR(REG_ECTYPE);
996		return;
997	}
998	memcpy(clname, sp, len);
999	clname[len] = '\0';
1000	if ((wct = wctype(clname)) == 0) {
1001		SETERROR(REG_ECTYPE);
1002		return;
1003	}
1004	CHaddtype(p, cs, wct);
1005}
1006
1007/*
1008 - p_b_eclass - parse an equivalence-class name and deal with it
1009 == static void p_b_eclass(struct parse *p, cset *cs);
1010 *
1011 * This implementation is incomplete. xxx
1012 */
1013static void
1014p_b_eclass(struct parse *p, cset *cs)
1015{
1016	wint_t c;
1017
1018	c = p_b_coll_elem(p, '=');
1019	CHadd(p, cs, c);
1020}
1021
1022/*
1023 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1024 == static wint_t p_b_symbol(struct parse *p);
1025 */
1026static wint_t			/* value of symbol */
1027p_b_symbol(struct parse *p)
1028{
1029	wint_t value;
1030
1031	(void)REQUIRE(MORE(), REG_EBRACK);
1032	if (!EATTWO('[', '.'))
1033		return(WGETNEXT());
1034
1035	/* collating symbol */
1036	value = p_b_coll_elem(p, '.');
1037	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1038	return(value);
1039}
1040
1041/*
1042 - p_b_coll_elem - parse a collating-element name and look it up
1043 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1044 */
1045static wint_t			/* value of collating element */
1046p_b_coll_elem(struct parse *p,
1047	wint_t endc)		/* name ended by endc,']' */
1048{
1049	const char *sp = p->next;
1050	struct cname *cp;
1051	mbstate_t mbs;
1052	wchar_t wc;
1053	size_t clen, len;
1054
1055	while (MORE() && !SEETWO(endc, ']'))
1056		NEXT();
1057	if (!MORE()) {
1058		SETERROR(REG_EBRACK);
1059		return(0);
1060	}
1061	len = p->next - sp;
1062	for (cp = cnames; cp->name != NULL; cp++)
1063		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1064			return(cp->code);	/* known name */
1065	memset(&mbs, 0, sizeof(mbs));
1066	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1067		return (wc);			/* single character */
1068	else if (clen == (size_t)-1 || clen == (size_t)-2)
1069		SETERROR(REG_ILLSEQ);
1070	else
1071		SETERROR(REG_ECOLLATE);		/* neither */
1072	return(0);
1073}
1074
1075/*
1076 - othercase - return the case counterpart of an alphabetic
1077 == static wint_t othercase(wint_t ch);
1078 */
1079static wint_t			/* if no counterpart, return ch */
1080othercase(wint_t ch)
1081{
1082	assert(iswalpha(ch));
1083	if (iswupper(ch))
1084		return(towlower(ch));
1085	else if (iswlower(ch))
1086		return(towupper(ch));
1087	else			/* peculiar, but could happen */
1088		return(ch);
1089}
1090
1091/*
1092 - bothcases - emit a dualcase version of a two-case character
1093 == static void bothcases(struct parse *p, wint_t ch);
1094 *
1095 * Boy, is this implementation ever a kludge...
1096 */
1097static void
1098bothcases(struct parse *p, wint_t ch)
1099{
1100	const char *oldnext = p->next;
1101	const char *oldend = p->end;
1102	char bracket[3 + MB_LEN_MAX];
1103	size_t n;
1104	mbstate_t mbs;
1105
1106	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1107	p->next = bracket;
1108	memset(&mbs, 0, sizeof(mbs));
1109	n = wcrtomb(bracket, ch, &mbs);
1110	assert(n != (size_t)-1);
1111	bracket[n] = ']';
1112	bracket[n + 1] = '\0';
1113	p->end = bracket+n+1;
1114	p_bracket(p);
1115	assert(p->next == p->end);
1116	p->next = oldnext;
1117	p->end = oldend;
1118}
1119
1120/*
1121 - ordinary - emit an ordinary character
1122 == static void ordinary(struct parse *p, wint_t ch);
1123 */
1124static void
1125ordinary(struct parse *p, wint_t ch)
1126{
1127	cset *cs;
1128
1129	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1130		bothcases(p, ch);
1131	else if ((ch & OPDMASK) == ch)
1132		EMIT(OCHAR, ch);
1133	else {
1134		/*
1135		 * Kludge: character is too big to fit into an OCHAR operand.
1136		 * Emit a singleton set.
1137		 */
1138		if ((cs = allocset(p)) == NULL)
1139			return;
1140		CHadd(p, cs, ch);
1141		EMIT(OANYOF, (int)(cs - p->g->sets));
1142	}
1143}
1144
1145/*
1146 - nonnewline - emit REG_NEWLINE version of OANY
1147 == static void nonnewline(struct parse *p);
1148 *
1149 * Boy, is this implementation ever a kludge...
1150 */
1151static void
1152nonnewline(struct parse *p)
1153{
1154	const char *oldnext = p->next;
1155	const char *oldend = p->end;
1156	char bracket[4];
1157
1158	p->next = bracket;
1159	p->end = bracket+3;
1160	bracket[0] = '^';
1161	bracket[1] = '\n';
1162	bracket[2] = ']';
1163	bracket[3] = '\0';
1164	p_bracket(p);
1165	assert(p->next == bracket+3);
1166	p->next = oldnext;
1167	p->end = oldend;
1168}
1169
1170/*
1171 - repeat - generate code for a bounded repetition, recursively if needed
1172 == static void repeat(struct parse *p, sopno start, int from, int to);
1173 */
1174static void
1175repeat(struct parse *p,
1176	sopno start,		/* operand from here to end of strip */
1177	int from,		/* repeated from this number */
1178	int to)			/* to this number of times (maybe INFINITY) */
1179{
1180	sopno finish = HERE();
1181#	define	N	2
1182#	define	INF	3
1183#	define	REP(f, t)	((f)*8 + (t))
1184#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1185	sopno copy;
1186
1187	if (p->error != 0)	/* head off possible runaway recursion */
1188		return;
1189
1190	assert(from <= to);
1191
1192	switch (REP(MAP(from), MAP(to))) {
1193	case REP(0, 0):			/* must be user doing this */
1194		DROP(finish-start);	/* drop the operand */
1195		break;
1196	case REP(0, 1):			/* as x{1,1}? */
1197	case REP(0, N):			/* as x{1,n}? */
1198	case REP(0, INF):		/* as x{1,}? */
1199		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1200		INSERT(OCH_, start);		/* offset is wrong... */
1201		repeat(p, start+1, 1, to);
1202		ASTERN(OOR1, start);
1203		AHEAD(start);			/* ... fix it */
1204		EMIT(OOR2, 0);
1205		AHEAD(THERE());
1206		ASTERN(O_CH, THERETHERE());
1207		break;
1208	case REP(1, 1):			/* trivial case */
1209		/* done */
1210		break;
1211	case REP(1, N):			/* as x?x{1,n-1} */
1212		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1213		INSERT(OCH_, start);
1214		ASTERN(OOR1, start);
1215		AHEAD(start);
1216		EMIT(OOR2, 0);			/* offset very wrong... */
1217		AHEAD(THERE());			/* ...so fix it */
1218		ASTERN(O_CH, THERETHERE());
1219		copy = dupl(p, start+1, finish+1);
1220		assert(copy == finish+4);
1221		repeat(p, copy, 1, to-1);
1222		break;
1223	case REP(1, INF):		/* as x+ */
1224		INSERT(OPLUS_, start);
1225		ASTERN(O_PLUS, start);
1226		break;
1227	case REP(N, N):			/* as xx{m-1,n-1} */
1228		copy = dupl(p, start, finish);
1229		repeat(p, copy, from-1, to-1);
1230		break;
1231	case REP(N, INF):		/* as xx{n-1,INF} */
1232		copy = dupl(p, start, finish);
1233		repeat(p, copy, from-1, to);
1234		break;
1235	default:			/* "can't happen" */
1236		SETERROR(REG_ASSERT);	/* just in case */
1237		break;
1238	}
1239}
1240
1241/*
1242 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1243 - character from the parse struct, signals a REG_ILLSEQ error if the
1244 - character can't be converted. Returns the number of bytes consumed.
1245 */
1246static wint_t
1247wgetnext(struct parse *p)
1248{
1249	mbstate_t mbs;
1250	wchar_t wc;
1251	size_t n;
1252
1253	memset(&mbs, 0, sizeof(mbs));
1254	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1255	if (n == (size_t)-1 || n == (size_t)-2) {
1256		SETERROR(REG_ILLSEQ);
1257		return (0);
1258	}
1259	if (n == 0)
1260		n = 1;
1261	p->next += n;
1262	return (wc);
1263}
1264
1265/*
1266 - seterr - set an error condition
1267 == static int seterr(struct parse *p, int e);
1268 */
1269static int			/* useless but makes type checking happy */
1270seterr(struct parse *p, int e)
1271{
1272	if (p->error == 0)	/* keep earliest error condition */
1273		p->error = e;
1274	p->next = nuls;		/* try to bring things to a halt */
1275	p->end = nuls;
1276	return(0);		/* make the return value well-defined */
1277}
1278
1279/*
1280 - allocset - allocate a set of characters for []
1281 == static cset *allocset(struct parse *p);
1282 */
1283static cset *
1284allocset(struct parse *p)
1285{
1286	cset *cs, *ncs;
1287
1288	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1289	if (ncs == NULL) {
1290		SETERROR(REG_ESPACE);
1291		return (NULL);
1292	}
1293	p->g->sets = ncs;
1294	cs = &p->g->sets[p->g->ncsets++];
1295	memset(cs, 0, sizeof(*cs));
1296
1297	return(cs);
1298}
1299
1300/*
1301 - freeset - free a now-unused set
1302 == static void freeset(struct parse *p, cset *cs);
1303 */
1304static void
1305freeset(struct parse *p, cset *cs)
1306{
1307	cset *top = &p->g->sets[p->g->ncsets];
1308
1309	free(cs->wides);
1310	free(cs->ranges);
1311	free(cs->types);
1312	memset(cs, 0, sizeof(*cs));
1313	if (cs == top-1)	/* recover only the easy case */
1314		p->g->ncsets--;
1315}
1316
1317/*
1318 - singleton - Determine whether a set contains only one character,
1319 - returning it if so, otherwise returning OUT.
1320 */
1321static wint_t
1322singleton(cset *cs)
1323{
1324	wint_t i, s, n;
1325
1326	for (i = n = 0; i < NC; i++)
1327		if (CHIN(cs, i)) {
1328			n++;
1329			s = i;
1330		}
1331	if (n == 1)
1332		return (s);
1333	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1334	    cs->icase == 0)
1335		return (cs->wides[0]);
1336	/* Don't bother handling the other cases. */
1337	return (OUT);
1338}
1339
1340/*
1341 - CHadd - add character to character set.
1342 */
1343static void
1344CHadd(struct parse *p, cset *cs, wint_t ch)
1345{
1346	wint_t nch, *newwides;
1347	assert(ch >= 0);
1348	if (ch < NC)
1349		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1350	else {
1351		newwides = reallocarray(cs->wides, cs->nwides + 1,
1352		    sizeof(*cs->wides));
1353		if (newwides == NULL) {
1354			SETERROR(REG_ESPACE);
1355			return;
1356		}
1357		cs->wides = newwides;
1358		cs->wides[cs->nwides++] = ch;
1359	}
1360	if (cs->icase) {
1361		if ((nch = towlower(ch)) < NC)
1362			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1363		if ((nch = towupper(ch)) < NC)
1364			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1365	}
1366}
1367
1368/*
1369 - CHaddrange - add all characters in the range [min,max] to a character set.
1370 */
1371static void
1372CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1373{
1374	crange *newranges;
1375
1376	for (; min < NC && min <= max; min++)
1377		CHadd(p, cs, min);
1378	if (min >= max)
1379		return;
1380	newranges = reallocarray(cs->ranges, cs->nranges + 1,
1381	    sizeof(*cs->ranges));
1382	if (newranges == NULL) {
1383		SETERROR(REG_ESPACE);
1384		return;
1385	}
1386	cs->ranges = newranges;
1387	cs->ranges[cs->nranges].min = min;
1388	cs->ranges[cs->nranges].max = max;
1389	cs->nranges++;
1390}
1391
1392/*
1393 - CHaddtype - add all characters of a certain type to a character set.
1394 */
1395static void
1396CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1397{
1398	wint_t i;
1399	wctype_t *newtypes;
1400
1401	for (i = 0; i < NC; i++)
1402		if (iswctype(i, wct))
1403			CHadd(p, cs, i);
1404	newtypes = reallocarray(cs->types, cs->ntypes + 1,
1405	    sizeof(*cs->types));
1406	if (newtypes == NULL) {
1407		SETERROR(REG_ESPACE);
1408		return;
1409	}
1410	cs->types = newtypes;
1411	cs->types[cs->ntypes++] = wct;
1412}
1413
1414/*
1415 - dupl - emit a duplicate of a bunch of sops
1416 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1417 */
1418static sopno			/* start of duplicate */
1419dupl(struct parse *p,
1420	sopno start,		/* from here */
1421	sopno finish)		/* to this less one */
1422{
1423	sopno ret = HERE();
1424	sopno len = finish - start;
1425
1426	assert(finish >= start);
1427	if (len == 0)
1428		return(ret);
1429	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1430		return(ret);
1431	(void) memcpy((char *)(p->strip + p->slen),
1432		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1433	p->slen += len;
1434	return(ret);
1435}
1436
1437/*
1438 - doemit - emit a strip operator
1439 == static void doemit(struct parse *p, sop op, size_t opnd);
1440 *
1441 * It might seem better to implement this as a macro with a function as
1442 * hard-case backup, but it's just too big and messy unless there are
1443 * some changes to the data structures.  Maybe later.
1444 */
1445static void
1446doemit(struct parse *p, sop op, size_t opnd)
1447{
1448	/* avoid making error situations worse */
1449	if (p->error != 0)
1450		return;
1451
1452	/* deal with oversize operands ("can't happen", more or less) */
1453	assert(opnd < 1<<OPSHIFT);
1454
1455	/* deal with undersized strip */
1456	if (p->slen >= p->ssize)
1457		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1458			return;
1459
1460	/* finally, it's all reduced to the easy case */
1461	p->strip[p->slen++] = SOP(op, opnd);
1462}
1463
1464/*
1465 - doinsert - insert a sop into the strip
1466 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1467 */
1468static void
1469doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1470{
1471	sopno sn;
1472	sop s;
1473	int i;
1474
1475	/* avoid making error situations worse */
1476	if (p->error != 0)
1477		return;
1478
1479	sn = HERE();
1480	EMIT(op, opnd);		/* do checks, ensure space */
1481	assert(HERE() == sn+1);
1482	s = p->strip[sn];
1483
1484	/* adjust paren pointers */
1485	assert(pos > 0);
1486	for (i = 1; i < NPAREN; i++) {
1487		if (p->pbegin[i] >= pos) {
1488			p->pbegin[i]++;
1489		}
1490		if (p->pend[i] >= pos) {
1491			p->pend[i]++;
1492		}
1493	}
1494
1495	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1496						(HERE()-pos-1)*sizeof(sop));
1497	p->strip[pos] = s;
1498}
1499
1500/*
1501 - dofwd - complete a forward reference
1502 == static void dofwd(struct parse *p, sopno pos, sop value);
1503 */
1504static void
1505dofwd(struct parse *p, sopno pos, sop value)
1506{
1507	/* avoid making error situations worse */
1508	if (p->error != 0)
1509		return;
1510
1511	assert(value < 1<<OPSHIFT);
1512	p->strip[pos] = OP(p->strip[pos]) | value;
1513}
1514
1515/*
1516 - enlarge - enlarge the strip
1517 == static int enlarge(struct parse *p, sopno size);
1518 */
1519static int
1520enlarge(struct parse *p, sopno size)
1521{
1522	sop *sp;
1523
1524	if (p->ssize >= size)
1525		return 1;
1526
1527	sp = reallocarray(p->strip, size, sizeof(sop));
1528	if (sp == NULL) {
1529		SETERROR(REG_ESPACE);
1530		return 0;
1531	}
1532	p->strip = sp;
1533	p->ssize = size;
1534	return 1;
1535}
1536
1537/*
1538 - stripsnug - compact the strip
1539 == static void stripsnug(struct parse *p, struct re_guts *g);
1540 */
1541static void
1542stripsnug(struct parse *p, struct re_guts *g)
1543{
1544	g->nstates = p->slen;
1545	g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop));
1546	if (g->strip == NULL) {
1547		SETERROR(REG_ESPACE);
1548		g->strip = p->strip;
1549	}
1550}
1551
1552/*
1553 - findmust - fill in must and mlen with longest mandatory literal string
1554 == static void findmust(struct parse *p, struct re_guts *g);
1555 *
1556 * This algorithm could do fancy things like analyzing the operands of |
1557 * for common subsequences.  Someday.  This code is simple and finds most
1558 * of the interesting cases.
1559 *
1560 * Note that must and mlen got initialized during setup.
1561 */
1562static void
1563findmust(struct parse *p, struct re_guts *g)
1564{
1565	sop *scan;
1566	sop *start = NULL;
1567	sop *newstart = NULL;
1568	sopno newlen;
1569	sop s;
1570	char *cp;
1571	int offset;
1572	char buf[MB_LEN_MAX];
1573	size_t clen;
1574	mbstate_t mbs;
1575
1576	/* avoid making error situations worse */
1577	if (p->error != 0)
1578		return;
1579
1580	/*
1581	 * It's not generally safe to do a ``char'' substring search on
1582	 * multibyte character strings, but it's safe for at least
1583	 * UTF-8 (see RFC 3629).
1584	 */
1585	if (MB_CUR_MAX > 1 &&
1586	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1587		return;
1588
1589	/* find the longest OCHAR sequence in strip */
1590	newlen = 0;
1591	offset = 0;
1592	g->moffset = 0;
1593	scan = g->strip + 1;
1594	do {
1595		s = *scan++;
1596		switch (OP(s)) {
1597		case OCHAR:		/* sequence member */
1598			if (newlen == 0) {		/* new sequence */
1599				memset(&mbs, 0, sizeof(mbs));
1600				newstart = scan - 1;
1601			}
1602			clen = wcrtomb(buf, OPND(s), &mbs);
1603			if (clen == (size_t)-1)
1604				goto toohard;
1605			newlen += clen;
1606			break;
1607		case OPLUS_:		/* things that don't break one */
1608		case OLPAREN:
1609		case ORPAREN:
1610			break;
1611		case OQUEST_:		/* things that must be skipped */
1612		case OCH_:
1613			offset = altoffset(scan, offset);
1614			scan--;
1615			do {
1616				scan += OPND(s);
1617				s = *scan;
1618				/* assert() interferes w debug printouts */
1619				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1620							OP(s) != OOR2) {
1621					g->iflags |= BAD;
1622					return;
1623				}
1624			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1625			/* FALLTHROUGH */
1626		case OBOW:		/* things that break a sequence */
1627		case OEOW:
1628		case OBOL:
1629		case OEOL:
1630		case O_QUEST:
1631		case O_CH:
1632		case OEND:
1633			if (newlen > g->mlen) {		/* ends one */
1634				start = newstart;
1635				g->mlen = newlen;
1636				if (offset > -1) {
1637					g->moffset += offset;
1638					offset = newlen;
1639				} else
1640					g->moffset = offset;
1641			} else {
1642				if (offset > -1)
1643					offset += newlen;
1644			}
1645			newlen = 0;
1646			break;
1647		case OANY:
1648			if (newlen > g->mlen) {		/* ends one */
1649				start = newstart;
1650				g->mlen = newlen;
1651				if (offset > -1) {
1652					g->moffset += offset;
1653					offset = newlen;
1654				} else
1655					g->moffset = offset;
1656			} else {
1657				if (offset > -1)
1658					offset += newlen;
1659			}
1660			if (offset > -1)
1661				offset++;
1662			newlen = 0;
1663			break;
1664		case OANYOF:		/* may or may not invalidate offset */
1665			/* First, everything as OANY */
1666			if (newlen > g->mlen) {		/* ends one */
1667				start = newstart;
1668				g->mlen = newlen;
1669				if (offset > -1) {
1670					g->moffset += offset;
1671					offset = newlen;
1672				} else
1673					g->moffset = offset;
1674			} else {
1675				if (offset > -1)
1676					offset += newlen;
1677			}
1678			if (offset > -1)
1679				offset++;
1680			newlen = 0;
1681			break;
1682		toohard:
1683		default:
1684			/* Anything here makes it impossible or too hard
1685			 * to calculate the offset -- so we give up;
1686			 * save the last known good offset, in case the
1687			 * must sequence doesn't occur later.
1688			 */
1689			if (newlen > g->mlen) {		/* ends one */
1690				start = newstart;
1691				g->mlen = newlen;
1692				if (offset > -1)
1693					g->moffset += offset;
1694				else
1695					g->moffset = offset;
1696			}
1697			offset = -1;
1698			newlen = 0;
1699			break;
1700		}
1701	} while (OP(s) != OEND);
1702
1703	if (g->mlen == 0) {		/* there isn't one */
1704		g->moffset = -1;
1705		return;
1706	}
1707
1708	/* turn it into a character string */
1709	g->must = malloc((size_t)g->mlen + 1);
1710	if (g->must == NULL) {		/* argh; just forget it */
1711		g->mlen = 0;
1712		g->moffset = -1;
1713		return;
1714	}
1715	cp = g->must;
1716	scan = start;
1717	memset(&mbs, 0, sizeof(mbs));
1718	while (cp < g->must + g->mlen) {
1719		while (OP(s = *scan++) != OCHAR)
1720			continue;
1721		clen = wcrtomb(cp, OPND(s), &mbs);
1722		assert(clen != (size_t)-1);
1723		cp += clen;
1724	}
1725	assert(cp == g->must + g->mlen);
1726	*cp++ = '\0';		/* just on general principles */
1727}
1728
1729/*
1730 - altoffset - choose biggest offset among multiple choices
1731 == static int altoffset(sop *scan, int offset);
1732 *
1733 * Compute, recursively if necessary, the largest offset among multiple
1734 * re paths.
1735 */
1736static int
1737altoffset(sop *scan, int offset)
1738{
1739	int largest;
1740	int try;
1741	sop s;
1742
1743	/* If we gave up already on offsets, return */
1744	if (offset == -1)
1745		return -1;
1746
1747	largest = 0;
1748	try = 0;
1749	s = *scan++;
1750	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1751		switch (OP(s)) {
1752		case OOR1:
1753			if (try > largest)
1754				largest = try;
1755			try = 0;
1756			break;
1757		case OQUEST_:
1758		case OCH_:
1759			try = altoffset(scan, try);
1760			if (try == -1)
1761				return -1;
1762			scan--;
1763			do {
1764				scan += OPND(s);
1765				s = *scan;
1766				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1767							OP(s) != OOR2)
1768					return -1;
1769			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1770			/* We must skip to the next position, or we'll
1771			 * leave altoffset() too early.
1772			 */
1773			scan++;
1774			break;
1775		case OANYOF:
1776		case OCHAR:
1777		case OANY:
1778			try++;
1779		case OBOW:
1780		case OEOW:
1781		case OLPAREN:
1782		case ORPAREN:
1783		case OOR2:
1784			break;
1785		default:
1786			try = -1;
1787			break;
1788		}
1789		if (try == -1)
1790			return -1;
1791		s = *scan++;
1792	}
1793
1794	if (try > largest)
1795		largest = try;
1796
1797	return largest+offset;
1798}
1799
1800/*
1801 - computejumps - compute char jumps for BM scan
1802 == static void computejumps(struct parse *p, struct re_guts *g);
1803 *
1804 * This algorithm assumes g->must exists and is has size greater than
1805 * zero. It's based on the algorithm found on Computer Algorithms by
1806 * Sara Baase.
1807 *
1808 * A char jump is the number of characters one needs to jump based on
1809 * the value of the character from the text that was mismatched.
1810 */
1811static void
1812computejumps(struct parse *p, struct re_guts *g)
1813{
1814	int ch;
1815	int mindex;
1816
1817	/* Avoid making errors worse */
1818	if (p->error != 0)
1819		return;
1820
1821	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1822	if (g->charjump == NULL)	/* Not a fatal error */
1823		return;
1824	/* Adjust for signed chars, if necessary */
1825	g->charjump = &g->charjump[-(CHAR_MIN)];
1826
1827	/* If the character does not exist in the pattern, the jump
1828	 * is equal to the number of characters in the pattern.
1829	 */
1830	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1831		g->charjump[ch] = g->mlen;
1832
1833	/* If the character does exist, compute the jump that would
1834	 * take us to the last character in the pattern equal to it
1835	 * (notice that we match right to left, so that last character
1836	 * is the first one that would be matched).
1837	 */
1838	for (mindex = 0; mindex < g->mlen; mindex++)
1839		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1840}
1841
1842/*
1843 - computematchjumps - compute match jumps for BM scan
1844 == static void computematchjumps(struct parse *p, struct re_guts *g);
1845 *
1846 * This algorithm assumes g->must exists and is has size greater than
1847 * zero. It's based on the algorithm found on Computer Algorithms by
1848 * Sara Baase.
1849 *
1850 * A match jump is the number of characters one needs to advance based
1851 * on the already-matched suffix.
1852 * Notice that all values here are minus (g->mlen-1), because of the way
1853 * the search algorithm works.
1854 */
1855static void
1856computematchjumps(struct parse *p, struct re_guts *g)
1857{
1858	int mindex;		/* General "must" iterator */
1859	int suffix;		/* Keeps track of matching suffix */
1860	int ssuffix;		/* Keeps track of suffixes' suffix */
1861	int* pmatches;		/* pmatches[k] points to the next i
1862				 * such that i+1...mlen is a substring
1863				 * of k+1...k+mlen-i-1
1864				 */
1865
1866	/* Avoid making errors worse */
1867	if (p->error != 0)
1868		return;
1869
1870	pmatches = (int*) malloc(g->mlen * sizeof(int));
1871	if (pmatches == NULL) {
1872		g->matchjump = NULL;
1873		return;
1874	}
1875
1876	g->matchjump = (int*) malloc(g->mlen * sizeof(int));
1877	if (g->matchjump == NULL) {	/* Not a fatal error */
1878		free(pmatches);
1879		return;
1880	}
1881
1882	/* Set maximum possible jump for each character in the pattern */
1883	for (mindex = 0; mindex < g->mlen; mindex++)
1884		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1885
1886	/* Compute pmatches[] */
1887	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1888	    mindex--, suffix--) {
1889		pmatches[mindex] = suffix;
1890
1891		/* If a mismatch is found, interrupting the substring,
1892		 * compute the matchjump for that position. If no
1893		 * mismatch is found, then a text substring mismatched
1894		 * against the suffix will also mismatch against the
1895		 * substring.
1896		 */
1897		while (suffix < g->mlen
1898		    && g->must[mindex] != g->must[suffix]) {
1899			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1900			    g->mlen - mindex - 1);
1901			suffix = pmatches[suffix];
1902		}
1903	}
1904
1905	/* Compute the matchjump up to the last substring found to jump
1906	 * to the beginning of the largest must pattern prefix matching
1907	 * it's own suffix.
1908	 */
1909	for (mindex = 0; mindex <= suffix; mindex++)
1910		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1911		    g->mlen + suffix - mindex);
1912
1913        ssuffix = pmatches[suffix];
1914        while (suffix < g->mlen) {
1915                while (suffix <= ssuffix && suffix < g->mlen) {
1916                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
1917			    g->mlen + ssuffix - suffix);
1918                        suffix++;
1919                }
1920		if (suffix < g->mlen)
1921                	ssuffix = pmatches[ssuffix];
1922        }
1923
1924	free(pmatches);
1925}
1926
1927/*
1928 - pluscount - count + nesting
1929 == static sopno pluscount(struct parse *p, struct re_guts *g);
1930 */
1931static sopno			/* nesting depth */
1932pluscount(struct parse *p, struct re_guts *g)
1933{
1934	sop *scan;
1935	sop s;
1936	sopno plusnest = 0;
1937	sopno maxnest = 0;
1938
1939	if (p->error != 0)
1940		return(0);	/* there may not be an OEND */
1941
1942	scan = g->strip + 1;
1943	do {
1944		s = *scan++;
1945		switch (OP(s)) {
1946		case OPLUS_:
1947			plusnest++;
1948			break;
1949		case O_PLUS:
1950			if (plusnest > maxnest)
1951				maxnest = plusnest;
1952			plusnest--;
1953			break;
1954		}
1955	} while (OP(s) != OEND);
1956	if (plusnest != 0)
1957		g->iflags |= BAD;
1958	return(maxnest);
1959}
1960