regcomp.c revision 132019
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
38 */
39
40#if defined(LIBC_SCCS) && !defined(lint)
41static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
42#endif /* LIBC_SCCS and not lint */
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 132019 2004-07-12 07:35:59Z tjr $");
45
46#include <sys/types.h>
47#include <stdio.h>
48#include <string.h>
49#include <ctype.h>
50#include <limits.h>
51#include <stdlib.h>
52#include <regex.h>
53#include <wchar.h>
54#include <wctype.h>
55
56#include "collate.h"
57
58#include "utils.h"
59#include "regex2.h"
60
61#include "cname.h"
62
63/*
64 * parse structure, passed up and down to avoid global variables and
65 * other clumsinesses
66 */
67struct parse {
68	char *next;		/* next character in RE */
69	char *end;		/* end of string (-> NUL normally) */
70	int error;		/* has an error been seen? */
71	sop *strip;		/* malloced strip */
72	sopno ssize;		/* malloced strip size (allocated) */
73	sopno slen;		/* malloced strip length (used) */
74	int ncsalloc;		/* number of csets allocated */
75	struct re_guts *g;
76#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
77	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
78	sopno pend[NPAREN];	/* -> ) ([0] unused) */
79};
80
81/* ========= begin header generated by ./mkh ========= */
82#ifdef __cplusplus
83extern "C" {
84#endif
85
86/* === regcomp.c === */
87static void p_ere(struct parse *p, wint_t stop);
88static void p_ere_exp(struct parse *p);
89static void p_str(struct parse *p);
90static void p_bre(struct parse *p, wint_t end1, wint_t end2);
91static int p_simp_re(struct parse *p, int starordinary);
92static int p_count(struct parse *p);
93static void p_bracket(struct parse *p);
94static void p_b_term(struct parse *p, cset *cs);
95static void p_b_cclass(struct parse *p, cset *cs);
96static void p_b_eclass(struct parse *p, cset *cs);
97static wint_t p_b_symbol(struct parse *p);
98static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
99static wint_t othercase(wint_t ch);
100static void bothcases(struct parse *p, wint_t ch);
101static void ordinary(struct parse *p, wint_t ch);
102static void nonnewline(struct parse *p);
103static void repeat(struct parse *p, sopno start, int from, int to);
104static int seterr(struct parse *p, int e);
105static cset *allocset(struct parse *p);
106static void freeset(struct parse *p, cset *cs);
107static void CHadd(struct parse *p, cset *cs, wint_t ch);
108static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
109static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
110static wint_t singleton(cset *cs);
111static sopno dupl(struct parse *p, sopno start, sopno finish);
112static void doemit(struct parse *p, sop op, size_t opnd);
113static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
114static void dofwd(struct parse *p, sopno pos, sop value);
115static void enlarge(struct parse *p, sopno size);
116static void stripsnug(struct parse *p, struct re_guts *g);
117static void findmust(struct parse *p, struct re_guts *g);
118static int altoffset(sop *scan, int offset);
119static void computejumps(struct parse *p, struct re_guts *g);
120static void computematchjumps(struct parse *p, struct re_guts *g);
121static sopno pluscount(struct parse *p, struct re_guts *g);
122static wint_t wgetnext(struct parse *p);
123
124#ifdef __cplusplus
125}
126#endif
127/* ========= end header generated by ./mkh ========= */
128
129static char nuls[10];		/* place to point scanner in event of error */
130
131/*
132 * macros for use with parse structure
133 * BEWARE:  these know that the parse structure is named `p' !!!
134 */
135#define	PEEK()	(*p->next)
136#define	PEEK2()	(*(p->next+1))
137#define	MORE()	(p->next < p->end)
138#define	MORE2()	(p->next+1 < p->end)
139#define	SEE(c)	(MORE() && PEEK() == (c))
140#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
141#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
142#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
143#define	NEXT()	(p->next++)
144#define	NEXT2()	(p->next += 2)
145#define	NEXTn(n)	(p->next += (n))
146#define	GETNEXT()	(*p->next++)
147#define	WGETNEXT()	wgetnext(p)
148#define	SETERROR(e)	seterr(p, (e))
149#define	REQUIRE(co, e)	((co) || SETERROR(e))
150#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
151#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
152#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
153#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
154#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
155#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
156#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
157#define	HERE()		(p->slen)
158#define	THERE()		(p->slen - 1)
159#define	THERETHERE()	(p->slen - 2)
160#define	DROP(n)	(p->slen -= (n))
161
162#ifndef NDEBUG
163static int never = 0;		/* for use in asserts; shuts lint up */
164#else
165#define	never	0		/* some <assert.h>s have bugs too */
166#endif
167
168/* Macro used by computejump()/computematchjump() */
169#define MIN(a,b)	((a)<(b)?(a):(b))
170
171/*
172 - regcomp - interface for parser and compilation
173 = extern int regcomp(regex_t *, const char *, int);
174 = #define	REG_BASIC	0000
175 = #define	REG_EXTENDED	0001
176 = #define	REG_ICASE	0002
177 = #define	REG_NOSUB	0004
178 = #define	REG_NEWLINE	0010
179 = #define	REG_NOSPEC	0020
180 = #define	REG_PEND	0040
181 = #define	REG_DUMP	0200
182 */
183int				/* 0 success, otherwise REG_something */
184regcomp(preg, pattern, cflags)
185regex_t * __restrict preg;
186const char * __restrict pattern;
187int cflags;
188{
189	struct parse pa;
190	struct re_guts *g;
191	struct parse *p = &pa;
192	int i;
193	size_t len;
194#ifdef REDEBUG
195#	define	GOODFLAGS(f)	(f)
196#else
197#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
198#endif
199
200	cflags = GOODFLAGS(cflags);
201	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
202		return(REG_INVARG);
203
204	if (cflags&REG_PEND) {
205		if (preg->re_endp < pattern)
206			return(REG_INVARG);
207		len = preg->re_endp - pattern;
208	} else
209		len = strlen((char *)pattern);
210
211	/* do the mallocs early so failure handling is easy */
212	g = (struct re_guts *)malloc(sizeof(struct re_guts));
213	if (g == NULL)
214		return(REG_ESPACE);
215	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
216	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
217	p->slen = 0;
218	if (p->strip == NULL) {
219		free((char *)g);
220		return(REG_ESPACE);
221	}
222
223	/* set things up */
224	p->g = g;
225	p->next = (char *)pattern;	/* convenience; we do not modify it */
226	p->end = p->next + len;
227	p->error = 0;
228	p->ncsalloc = 0;
229	for (i = 0; i < NPAREN; i++) {
230		p->pbegin[i] = 0;
231		p->pend[i] = 0;
232	}
233	g->sets = NULL;
234	g->ncsets = 0;
235	g->cflags = cflags;
236	g->iflags = 0;
237	g->nbol = 0;
238	g->neol = 0;
239	g->must = NULL;
240	g->moffset = -1;
241	g->charjump = NULL;
242	g->matchjump = NULL;
243	g->mlen = 0;
244	g->nsub = 0;
245	g->backrefs = 0;
246
247	/* do it */
248	EMIT(OEND, 0);
249	g->firststate = THERE();
250	if (cflags&REG_EXTENDED)
251		p_ere(p, OUT);
252	else if (cflags&REG_NOSPEC)
253		p_str(p);
254	else
255		p_bre(p, OUT, OUT);
256	EMIT(OEND, 0);
257	g->laststate = THERE();
258
259	/* tidy up loose ends and fill things in */
260	stripsnug(p, g);
261	findmust(p, g);
262	/* only use Boyer-Moore algorithm if the pattern is bigger
263	 * than three characters
264	 */
265	if(g->mlen > 3) {
266		computejumps(p, g);
267		computematchjumps(p, g);
268		if(g->matchjump == NULL && g->charjump != NULL) {
269			free(g->charjump);
270			g->charjump = NULL;
271		}
272	}
273	g->nplus = pluscount(p, g);
274	g->magic = MAGIC2;
275	preg->re_nsub = g->nsub;
276	preg->re_g = g;
277	preg->re_magic = MAGIC1;
278#ifndef REDEBUG
279	/* not debugging, so can't rely on the assert() in regexec() */
280	if (g->iflags&BAD)
281		SETERROR(REG_ASSERT);
282#endif
283
284	/* win or lose, we're done */
285	if (p->error != 0)	/* lose */
286		regfree(preg);
287	return(p->error);
288}
289
290/*
291 - p_ere - ERE parser top level, concatenation and alternation
292 == static void p_ere(struct parse *p, int stop);
293 */
294static void
295p_ere(p, stop)
296struct parse *p;
297int stop;			/* character this ERE should end at */
298{
299	char c;
300	sopno prevback;
301	sopno prevfwd;
302	sopno conc;
303	int first = 1;		/* is this the first alternative? */
304
305	for (;;) {
306		/* do a bunch of concatenated expressions */
307		conc = HERE();
308		while (MORE() && (c = PEEK()) != '|' && c != stop)
309			p_ere_exp(p);
310		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
311
312		if (!EAT('|'))
313			break;		/* NOTE BREAK OUT */
314
315		if (first) {
316			INSERT(OCH_, conc);	/* offset is wrong */
317			prevfwd = conc;
318			prevback = conc;
319			first = 0;
320		}
321		ASTERN(OOR1, prevback);
322		prevback = THERE();
323		AHEAD(prevfwd);			/* fix previous offset */
324		prevfwd = HERE();
325		EMIT(OOR2, 0);			/* offset is very wrong */
326	}
327
328	if (!first) {		/* tail-end fixups */
329		AHEAD(prevfwd);
330		ASTERN(O_CH, prevback);
331	}
332
333	assert(!MORE() || SEE(stop));
334}
335
336/*
337 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
338 == static void p_ere_exp(struct parse *p);
339 */
340static void
341p_ere_exp(p)
342struct parse *p;
343{
344	char c;
345	wint_t wc;
346	sopno pos;
347	int count;
348	int count2;
349	sopno subno;
350	int wascaret = 0;
351
352	assert(MORE());		/* caller should have ensured this */
353	c = GETNEXT();
354
355	pos = HERE();
356	switch (c) {
357	case '(':
358		(void)REQUIRE(MORE(), REG_EPAREN);
359		p->g->nsub++;
360		subno = p->g->nsub;
361		if (subno < NPAREN)
362			p->pbegin[subno] = HERE();
363		EMIT(OLPAREN, subno);
364		if (!SEE(')'))
365			p_ere(p, ')');
366		if (subno < NPAREN) {
367			p->pend[subno] = HERE();
368			assert(p->pend[subno] != 0);
369		}
370		EMIT(ORPAREN, subno);
371		(void)MUSTEAT(')', REG_EPAREN);
372		break;
373#ifndef POSIX_MISTAKE
374	case ')':		/* happens only if no current unmatched ( */
375		/*
376		 * You may ask, why the ifndef?  Because I didn't notice
377		 * this until slightly too late for 1003.2, and none of the
378		 * other 1003.2 regular-expression reviewers noticed it at
379		 * all.  So an unmatched ) is legal POSIX, at least until
380		 * we can get it fixed.
381		 */
382		SETERROR(REG_EPAREN);
383		break;
384#endif
385	case '^':
386		EMIT(OBOL, 0);
387		p->g->iflags |= USEBOL;
388		p->g->nbol++;
389		wascaret = 1;
390		break;
391	case '$':
392		EMIT(OEOL, 0);
393		p->g->iflags |= USEEOL;
394		p->g->neol++;
395		break;
396	case '|':
397		SETERROR(REG_EMPTY);
398		break;
399	case '*':
400	case '+':
401	case '?':
402		SETERROR(REG_BADRPT);
403		break;
404	case '.':
405		if (p->g->cflags&REG_NEWLINE)
406			nonnewline(p);
407		else
408			EMIT(OANY, 0);
409		break;
410	case '[':
411		p_bracket(p);
412		break;
413	case '\\':
414		(void)REQUIRE(MORE(), REG_EESCAPE);
415		wc = WGETNEXT();
416		ordinary(p, wc);
417		break;
418	case '{':		/* okay as ordinary except if digit follows */
419		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
420		/* FALLTHROUGH */
421	default:
422		p->next--;
423		wc = WGETNEXT();
424		ordinary(p, wc);
425		break;
426	}
427
428	if (!MORE())
429		return;
430	c = PEEK();
431	/* we call { a repetition if followed by a digit */
432	if (!( c == '*' || c == '+' || c == '?' ||
433				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
434		return;		/* no repetition, we're done */
435	NEXT();
436
437	(void)REQUIRE(!wascaret, REG_BADRPT);
438	switch (c) {
439	case '*':	/* implemented as +? */
440		/* this case does not require the (y|) trick, noKLUDGE */
441		INSERT(OPLUS_, pos);
442		ASTERN(O_PLUS, pos);
443		INSERT(OQUEST_, pos);
444		ASTERN(O_QUEST, pos);
445		break;
446	case '+':
447		INSERT(OPLUS_, pos);
448		ASTERN(O_PLUS, pos);
449		break;
450	case '?':
451		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
452		INSERT(OCH_, pos);		/* offset slightly wrong */
453		ASTERN(OOR1, pos);		/* this one's right */
454		AHEAD(pos);			/* fix the OCH_ */
455		EMIT(OOR2, 0);			/* offset very wrong... */
456		AHEAD(THERE());			/* ...so fix it */
457		ASTERN(O_CH, THERETHERE());
458		break;
459	case '{':
460		count = p_count(p);
461		if (EAT(',')) {
462			if (isdigit((uch)PEEK())) {
463				count2 = p_count(p);
464				(void)REQUIRE(count <= count2, REG_BADBR);
465			} else		/* single number with comma */
466				count2 = INFINITY;
467		} else		/* just a single number */
468			count2 = count;
469		repeat(p, pos, count, count2);
470		if (!EAT('}')) {	/* error heuristics */
471			while (MORE() && PEEK() != '}')
472				NEXT();
473			(void)REQUIRE(MORE(), REG_EBRACE);
474			SETERROR(REG_BADBR);
475		}
476		break;
477	}
478
479	if (!MORE())
480		return;
481	c = PEEK();
482	if (!( c == '*' || c == '+' || c == '?' ||
483				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
484		return;
485	SETERROR(REG_BADRPT);
486}
487
488/*
489 - p_str - string (no metacharacters) "parser"
490 == static void p_str(struct parse *p);
491 */
492static void
493p_str(p)
494struct parse *p;
495{
496	(void)REQUIRE(MORE(), REG_EMPTY);
497	while (MORE())
498		ordinary(p, WGETNEXT());
499}
500
501/*
502 - p_bre - BRE parser top level, anchoring and concatenation
503 == static void p_bre(struct parse *p, int end1, \
504 ==	int end2);
505 * Giving end1 as OUT essentially eliminates the end1/end2 check.
506 *
507 * This implementation is a bit of a kludge, in that a trailing $ is first
508 * taken as an ordinary character and then revised to be an anchor.
509 * The amount of lookahead needed to avoid this kludge is excessive.
510 */
511static void
512p_bre(p, end1, end2)
513struct parse *p;
514int end1;			/* first terminating character */
515int end2;			/* second terminating character */
516{
517	sopno start = HERE();
518	int first = 1;			/* first subexpression? */
519	int wasdollar = 0;
520
521	if (EAT('^')) {
522		EMIT(OBOL, 0);
523		p->g->iflags |= USEBOL;
524		p->g->nbol++;
525	}
526	while (MORE() && !SEETWO(end1, end2)) {
527		wasdollar = p_simp_re(p, first);
528		first = 0;
529	}
530	if (wasdollar) {	/* oops, that was a trailing anchor */
531		DROP(1);
532		EMIT(OEOL, 0);
533		p->g->iflags |= USEEOL;
534		p->g->neol++;
535	}
536
537	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
538}
539
540/*
541 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
542 == static int p_simp_re(struct parse *p, int starordinary);
543 */
544static int			/* was the simple RE an unbackslashed $? */
545p_simp_re(p, starordinary)
546struct parse *p;
547int starordinary;		/* is a leading * an ordinary character? */
548{
549	int c;
550	int count;
551	int count2;
552	sopno pos;
553	int i;
554	wint_t wc;
555	sopno subno;
556#	define	BACKSL	(1<<CHAR_BIT)
557
558	pos = HERE();		/* repetion op, if any, covers from here */
559
560	assert(MORE());		/* caller should have ensured this */
561	c = GETNEXT();
562	if (c == '\\') {
563		(void)REQUIRE(MORE(), REG_EESCAPE);
564		c = BACKSL | GETNEXT();
565	}
566	switch (c) {
567	case '.':
568		if (p->g->cflags&REG_NEWLINE)
569			nonnewline(p);
570		else
571			EMIT(OANY, 0);
572		break;
573	case '[':
574		p_bracket(p);
575		break;
576	case BACKSL|'{':
577		SETERROR(REG_BADRPT);
578		break;
579	case BACKSL|'(':
580		p->g->nsub++;
581		subno = p->g->nsub;
582		if (subno < NPAREN)
583			p->pbegin[subno] = HERE();
584		EMIT(OLPAREN, subno);
585		/* the MORE here is an error heuristic */
586		if (MORE() && !SEETWO('\\', ')'))
587			p_bre(p, '\\', ')');
588		if (subno < NPAREN) {
589			p->pend[subno] = HERE();
590			assert(p->pend[subno] != 0);
591		}
592		EMIT(ORPAREN, subno);
593		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
594		break;
595	case BACKSL|')':	/* should not get here -- must be user */
596	case BACKSL|'}':
597		SETERROR(REG_EPAREN);
598		break;
599	case BACKSL|'1':
600	case BACKSL|'2':
601	case BACKSL|'3':
602	case BACKSL|'4':
603	case BACKSL|'5':
604	case BACKSL|'6':
605	case BACKSL|'7':
606	case BACKSL|'8':
607	case BACKSL|'9':
608		i = (c&~BACKSL) - '0';
609		assert(i < NPAREN);
610		if (p->pend[i] != 0) {
611			assert(i <= p->g->nsub);
612			EMIT(OBACK_, i);
613			assert(p->pbegin[i] != 0);
614			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
615			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
616			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
617			EMIT(O_BACK, i);
618		} else
619			SETERROR(REG_ESUBREG);
620		p->g->backrefs = 1;
621		break;
622	case '*':
623		(void)REQUIRE(starordinary, REG_BADRPT);
624		/* FALLTHROUGH */
625	default:
626		p->next--;
627		wc = WGETNEXT();
628		ordinary(p, wc);
629		break;
630	}
631
632	if (EAT('*')) {		/* implemented as +? */
633		/* this case does not require the (y|) trick, noKLUDGE */
634		INSERT(OPLUS_, pos);
635		ASTERN(O_PLUS, pos);
636		INSERT(OQUEST_, pos);
637		ASTERN(O_QUEST, pos);
638	} else if (EATTWO('\\', '{')) {
639		count = p_count(p);
640		if (EAT(',')) {
641			if (MORE() && isdigit((uch)PEEK())) {
642				count2 = p_count(p);
643				(void)REQUIRE(count <= count2, REG_BADBR);
644			} else		/* single number with comma */
645				count2 = INFINITY;
646		} else		/* just a single number */
647			count2 = count;
648		repeat(p, pos, count, count2);
649		if (!EATTWO('\\', '}')) {	/* error heuristics */
650			while (MORE() && !SEETWO('\\', '}'))
651				NEXT();
652			(void)REQUIRE(MORE(), REG_EBRACE);
653			SETERROR(REG_BADBR);
654		}
655	} else if (c == '$')     /* $ (but not \$) ends it */
656		return(1);
657
658	return(0);
659}
660
661/*
662 - p_count - parse a repetition count
663 == static int p_count(struct parse *p);
664 */
665static int			/* the value */
666p_count(p)
667struct parse *p;
668{
669	int count = 0;
670	int ndigits = 0;
671
672	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
673		count = count*10 + (GETNEXT() - '0');
674		ndigits++;
675	}
676
677	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
678	return(count);
679}
680
681/*
682 - p_bracket - parse a bracketed character list
683 == static void p_bracket(struct parse *p);
684 */
685static void
686p_bracket(p)
687struct parse *p;
688{
689	cset *cs;
690	wint_t ch;
691
692	/* Dept of Truly Sickening Special-Case Kludges */
693	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
694		EMIT(OBOW, 0);
695		NEXTn(6);
696		return;
697	}
698	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
699		EMIT(OEOW, 0);
700		NEXTn(6);
701		return;
702	}
703
704	if ((cs = allocset(p)) == NULL)
705		return;
706
707	if (p->g->cflags&REG_ICASE)
708		cs->icase = 1;
709	if (EAT('^'))
710		cs->invert = 1;
711	if (EAT(']'))
712		CHadd(p, cs, ']');
713	else if (EAT('-'))
714		CHadd(p, cs, '-');
715	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
716		p_b_term(p, cs);
717	if (EAT('-'))
718		CHadd(p, cs, '-');
719	(void)MUSTEAT(']', REG_EBRACK);
720
721	if (p->error != 0)	/* don't mess things up further */
722		return;
723
724	if (cs->invert && p->g->cflags&REG_NEWLINE)
725		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
726
727	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
728		ordinary(p, ch);
729		freeset(p, cs);
730	} else
731		EMIT(OANYOF, (int)(cs - p->g->sets));
732}
733
734/*
735 - p_b_term - parse one term of a bracketed character list
736 == static void p_b_term(struct parse *p, cset *cs);
737 */
738static void
739p_b_term(p, cs)
740struct parse *p;
741cset *cs;
742{
743	char c;
744	wint_t start, finish;
745	wint_t i;
746
747	/* classify what we've got */
748	switch ((MORE()) ? PEEK() : '\0') {
749	case '[':
750		c = (MORE2()) ? PEEK2() : '\0';
751		break;
752	case '-':
753		SETERROR(REG_ERANGE);
754		return;			/* NOTE RETURN */
755		break;
756	default:
757		c = '\0';
758		break;
759	}
760
761	switch (c) {
762	case ':':		/* character class */
763		NEXT2();
764		(void)REQUIRE(MORE(), REG_EBRACK);
765		c = PEEK();
766		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
767		p_b_cclass(p, cs);
768		(void)REQUIRE(MORE(), REG_EBRACK);
769		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
770		break;
771	case '=':		/* equivalence class */
772		NEXT2();
773		(void)REQUIRE(MORE(), REG_EBRACK);
774		c = PEEK();
775		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
776		p_b_eclass(p, cs);
777		(void)REQUIRE(MORE(), REG_EBRACK);
778		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
779		break;
780	default:		/* symbol, ordinary character, or range */
781		start = p_b_symbol(p);
782		if (SEE('-') && MORE2() && PEEK2() != ']') {
783			/* range */
784			NEXT();
785			if (EAT('-'))
786				finish = '-';
787			else
788				finish = p_b_symbol(p);
789		} else
790			finish = start;
791		if (start == finish)
792			CHadd(p, cs, start);
793		else {
794			if (__collate_load_error) {
795				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
796				CHaddrange(p, cs, start, finish);
797			} else {
798				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
799				for (i = 0; i <= UCHAR_MAX; i++) {
800					if (   __collate_range_cmp(start, i) <= 0
801					    && __collate_range_cmp(i, finish) <= 0
802					   )
803						CHadd(p, cs, i);
804				}
805			}
806		}
807		break;
808	}
809}
810
811/*
812 - p_b_cclass - parse a character-class name and deal with it
813 == static void p_b_cclass(struct parse *p, cset *cs);
814 */
815static void
816p_b_cclass(p, cs)
817struct parse *p;
818cset *cs;
819{
820	char *sp = p->next;
821	size_t len;
822	wctype_t wct;
823	char clname[16];
824
825	while (MORE() && isalpha((uch)PEEK()))
826		NEXT();
827	len = p->next - sp;
828	if (len >= sizeof(clname) - 1) {
829		SETERROR(REG_ECTYPE);
830		return;
831	}
832	memcpy(clname, sp, len);
833	clname[len] = '\0';
834	if ((wct = wctype(clname)) == 0) {
835		SETERROR(REG_ECTYPE);
836		return;
837	}
838	CHaddtype(p, cs, wct);
839}
840
841/*
842 - p_b_eclass - parse an equivalence-class name and deal with it
843 == static void p_b_eclass(struct parse *p, cset *cs);
844 *
845 * This implementation is incomplete. xxx
846 */
847static void
848p_b_eclass(p, cs)
849struct parse *p;
850cset *cs;
851{
852	wint_t c;
853
854	c = p_b_coll_elem(p, '=');
855	CHadd(p, cs, c);
856}
857
858/*
859 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
860 == static char p_b_symbol(struct parse *p);
861 */
862static wint_t			/* value of symbol */
863p_b_symbol(p)
864struct parse *p;
865{
866	wint_t value;
867
868	(void)REQUIRE(MORE(), REG_EBRACK);
869	if (!EATTWO('[', '.'))
870		return(WGETNEXT());
871
872	/* collating symbol */
873	value = p_b_coll_elem(p, '.');
874	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
875	return(value);
876}
877
878/*
879 - p_b_coll_elem - parse a collating-element name and look it up
880 == static char p_b_coll_elem(struct parse *p, int endc);
881 */
882static wint_t			/* value of collating element */
883p_b_coll_elem(p, endc)
884struct parse *p;
885wint_t endc;			/* name ended by endc,']' */
886{
887	char *sp = p->next;
888	struct cname *cp;
889	int len;
890	mbstate_t mbs;
891	wchar_t wc;
892	size_t clen;
893
894	while (MORE() && !SEETWO(endc, ']'))
895		NEXT();
896	if (!MORE()) {
897		SETERROR(REG_EBRACK);
898		return(0);
899	}
900	len = p->next - sp;
901	for (cp = cnames; cp->name != NULL; cp++)
902		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
903			return(cp->code);	/* known name */
904	memset(&mbs, 0, sizeof(mbs));
905	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
906		return (wc);			/* single character */
907	else if (clen == (size_t)-1 || clen == (size_t)-2)
908		SETERROR(REG_ILLSEQ);
909	else
910		SETERROR(REG_ECOLLATE);		/* neither */
911	return(0);
912}
913
914/*
915 - othercase - return the case counterpart of an alphabetic
916 == static char othercase(int ch);
917 */
918static wint_t			/* if no counterpart, return ch */
919othercase(ch)
920wint_t ch;
921{
922	assert(iswalpha(ch));
923	if (iswupper(ch))
924		return(towlower(ch));
925	else if (iswlower(ch))
926		return(towupper(ch));
927	else			/* peculiar, but could happen */
928		return(ch);
929}
930
931/*
932 - bothcases - emit a dualcase version of a two-case character
933 == static void bothcases(struct parse *p, int ch);
934 *
935 * Boy, is this implementation ever a kludge...
936 */
937static void
938bothcases(p, ch)
939struct parse *p;
940wint_t ch;
941{
942	char *oldnext = p->next;
943	char *oldend = p->end;
944	char bracket[3 + MB_LEN_MAX];
945	size_t n;
946	mbstate_t mbs;
947
948	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
949	p->next = bracket;
950	memset(&mbs, 0, sizeof(mbs));
951	n = wcrtomb(bracket, ch, &mbs);
952	assert(n != (size_t)-1);
953	bracket[n] = ']';
954	bracket[n + 1] = '\0';
955	p->end = bracket+n+1;
956	p_bracket(p);
957	assert(p->next == p->end);
958	p->next = oldnext;
959	p->end = oldend;
960}
961
962/*
963 - ordinary - emit an ordinary character
964 == static void ordinary(struct parse *p, int ch);
965 */
966static void
967ordinary(p, ch)
968struct parse *p;
969wint_t ch;
970{
971	cset *cs;
972
973	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
974		bothcases(p, ch);
975	else if ((ch & OPDMASK) == ch)
976		EMIT(OCHAR, ch);
977	else {
978		/*
979		 * Kludge: character is too big to fit into an OCHAR operand.
980		 * Emit a singleton set.
981		 */
982		if ((cs = allocset(p)) == NULL)
983			return;
984		CHadd(p, cs, ch);
985		EMIT(OANYOF, (int)(cs - p->g->sets));
986	}
987}
988
989/*
990 - nonnewline - emit REG_NEWLINE version of OANY
991 == static void nonnewline(struct parse *p);
992 *
993 * Boy, is this implementation ever a kludge...
994 */
995static void
996nonnewline(p)
997struct parse *p;
998{
999	char *oldnext = p->next;
1000	char *oldend = p->end;
1001	char bracket[4];
1002
1003	p->next = bracket;
1004	p->end = bracket+3;
1005	bracket[0] = '^';
1006	bracket[1] = '\n';
1007	bracket[2] = ']';
1008	bracket[3] = '\0';
1009	p_bracket(p);
1010	assert(p->next == bracket+3);
1011	p->next = oldnext;
1012	p->end = oldend;
1013}
1014
1015/*
1016 - repeat - generate code for a bounded repetition, recursively if needed
1017 == static void repeat(struct parse *p, sopno start, int from, int to);
1018 */
1019static void
1020repeat(p, start, from, to)
1021struct parse *p;
1022sopno start;			/* operand from here to end of strip */
1023int from;			/* repeated from this number */
1024int to;				/* to this number of times (maybe INFINITY) */
1025{
1026	sopno finish = HERE();
1027#	define	N	2
1028#	define	INF	3
1029#	define	REP(f, t)	((f)*8 + (t))
1030#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1031	sopno copy;
1032
1033	if (p->error != 0)	/* head off possible runaway recursion */
1034		return;
1035
1036	assert(from <= to);
1037
1038	switch (REP(MAP(from), MAP(to))) {
1039	case REP(0, 0):			/* must be user doing this */
1040		DROP(finish-start);	/* drop the operand */
1041		break;
1042	case REP(0, 1):			/* as x{1,1}? */
1043	case REP(0, N):			/* as x{1,n}? */
1044	case REP(0, INF):		/* as x{1,}? */
1045		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1046		INSERT(OCH_, start);		/* offset is wrong... */
1047		repeat(p, start+1, 1, to);
1048		ASTERN(OOR1, start);
1049		AHEAD(start);			/* ... fix it */
1050		EMIT(OOR2, 0);
1051		AHEAD(THERE());
1052		ASTERN(O_CH, THERETHERE());
1053		break;
1054	case REP(1, 1):			/* trivial case */
1055		/* done */
1056		break;
1057	case REP(1, N):			/* as x?x{1,n-1} */
1058		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1059		INSERT(OCH_, start);
1060		ASTERN(OOR1, start);
1061		AHEAD(start);
1062		EMIT(OOR2, 0);			/* offset very wrong... */
1063		AHEAD(THERE());			/* ...so fix it */
1064		ASTERN(O_CH, THERETHERE());
1065		copy = dupl(p, start+1, finish+1);
1066		assert(copy == finish+4);
1067		repeat(p, copy, 1, to-1);
1068		break;
1069	case REP(1, INF):		/* as x+ */
1070		INSERT(OPLUS_, start);
1071		ASTERN(O_PLUS, start);
1072		break;
1073	case REP(N, N):			/* as xx{m-1,n-1} */
1074		copy = dupl(p, start, finish);
1075		repeat(p, copy, from-1, to-1);
1076		break;
1077	case REP(N, INF):		/* as xx{n-1,INF} */
1078		copy = dupl(p, start, finish);
1079		repeat(p, copy, from-1, to);
1080		break;
1081	default:			/* "can't happen" */
1082		SETERROR(REG_ASSERT);	/* just in case */
1083		break;
1084	}
1085}
1086
1087/*
1088 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1089 - character from the parse struct, signals a REG_ILLSEQ error if the
1090 - character can't be converted. Returns the number of bytes consumed.
1091 */
1092static wint_t
1093wgetnext(p)
1094struct parse *p;
1095{
1096	mbstate_t mbs;
1097	wchar_t wc;
1098	size_t n;
1099
1100	memset(&mbs, 0, sizeof(mbs));
1101	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1102	if (n == (size_t)-1 || n == (size_t)-2) {
1103		SETERROR(REG_ILLSEQ);
1104		return (0);
1105	}
1106	if (n == 0)
1107		n = 1;
1108	p->next += n;
1109	return (wc);
1110}
1111
1112/*
1113 - seterr - set an error condition
1114 == static int seterr(struct parse *p, int e);
1115 */
1116static int			/* useless but makes type checking happy */
1117seterr(p, e)
1118struct parse *p;
1119int e;
1120{
1121	if (p->error == 0)	/* keep earliest error condition */
1122		p->error = e;
1123	p->next = nuls;		/* try to bring things to a halt */
1124	p->end = nuls;
1125	return(0);		/* make the return value well-defined */
1126}
1127
1128/*
1129 - allocset - allocate a set of characters for []
1130 == static cset *allocset(struct parse *p);
1131 */
1132static cset *
1133allocset(p)
1134struct parse *p;
1135{
1136	cset *cs, *ncs;
1137
1138	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1139	if (ncs == NULL) {
1140		SETERROR(REG_ESPACE);
1141		return (NULL);
1142	}
1143	p->g->sets = ncs;
1144	cs = &p->g->sets[p->g->ncsets++];
1145	memset(cs, 0, sizeof(*cs));
1146
1147	return(cs);
1148}
1149
1150/*
1151 - freeset - free a now-unused set
1152 == static void freeset(struct parse *p, cset *cs);
1153 */
1154static void
1155freeset(p, cs)
1156struct parse *p;
1157cset *cs;
1158{
1159	cset *top = &p->g->sets[p->g->ncsets];
1160
1161	free(cs->wides);
1162	free(cs->ranges);
1163	free(cs->types);
1164	memset(cs, 0, sizeof(*cs));
1165	if (cs == top-1)	/* recover only the easy case */
1166		p->g->ncsets--;
1167}
1168
1169/*
1170 - singleton - Determine whether a set contains only one character,
1171 - returning it if so, otherwise returning OUT.
1172 */
1173static wint_t
1174singleton(cs)
1175cset *cs;
1176{
1177	wint_t i, s, n;
1178
1179	for (i = n = 0; i < NC; i++)
1180		if (CHIN(cs, i)) {
1181			n++;
1182			s = i;
1183		}
1184	if (n == 1)
1185		return (s);
1186	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1187	    cs->icase == 0)
1188		return (cs->wides[0]);
1189	/* Don't bother handling the other cases. */
1190	return (OUT);
1191}
1192
1193/*
1194 - CHadd - add character to character set.
1195 */
1196static void
1197CHadd(p, cs, ch)
1198struct parse *p;
1199cset *cs;
1200wint_t ch;
1201{
1202	wint_t *newwides;
1203	assert(ch >= 0);
1204	if (ch < NC) {
1205		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1206		if (cs->icase) {
1207			cs->bmp[towlower(ch) >> 3] |= 1 << (towlower(ch) & 7);
1208			cs->bmp[towupper(ch) >> 3] |= 1 << (towupper(ch) & 7);
1209		}
1210	} else {
1211		newwides = realloc(cs->wides, (cs->nwides + 1) *
1212		    sizeof(*cs->wides));
1213		if (newwides == NULL) {
1214			SETERROR(REG_ESPACE);
1215			return;
1216		}
1217		cs->wides = newwides;
1218		cs->wides[cs->nwides++] = ch;
1219	}
1220}
1221
1222/*
1223 - CHaddrange - add all characters in the range [min,max] to a character set.
1224 */
1225static void
1226CHaddrange(p, cs, min, max)
1227struct parse *p;
1228cset *cs;
1229wint_t min, max;
1230{
1231	crange *newranges;
1232
1233	for (; min < NC && min <= max; min++)
1234		CHadd(p, cs, min);
1235	if (min >= max)
1236		return;
1237	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1238	    sizeof(*cs->ranges));
1239	if (newranges == NULL) {
1240		SETERROR(REG_ESPACE);
1241		return;
1242	}
1243	cs->ranges = newranges;
1244	cs->ranges[cs->nranges].min = min;
1245	cs->ranges[cs->nranges].min = max;
1246	cs->nranges++;
1247}
1248
1249/*
1250 - CHaddtype - add all characters of a certain type to a character set.
1251 */
1252static void
1253CHaddtype(p, cs, wct)
1254struct parse *p;
1255cset *cs;
1256wctype_t wct;
1257{
1258	wint_t i;
1259	wctype_t *newtypes;
1260
1261	for (i = 0; i < NC; i++) {
1262		if (iswctype(i, wct))
1263			CHadd(p, cs, i);
1264		if (cs->icase && i != towlower(i))
1265			CHadd(p, cs, towlower(i));
1266		if (cs->icase && i != towupper(i))
1267			CHadd(p, cs, towupper(i));
1268	}
1269	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1270	    sizeof(*cs->types));
1271	if (newtypes == NULL) {
1272		SETERROR(REG_ESPACE);
1273		return;
1274	}
1275	cs->types = newtypes;
1276	cs->types[cs->ntypes++] = wct;
1277}
1278
1279/*
1280 - dupl - emit a duplicate of a bunch of sops
1281 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1282 */
1283static sopno			/* start of duplicate */
1284dupl(p, start, finish)
1285struct parse *p;
1286sopno start;			/* from here */
1287sopno finish;			/* to this less one */
1288{
1289	sopno ret = HERE();
1290	sopno len = finish - start;
1291
1292	assert(finish >= start);
1293	if (len == 0)
1294		return(ret);
1295	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1296	assert(p->ssize >= p->slen + len);
1297	(void) memcpy((char *)(p->strip + p->slen),
1298		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1299	p->slen += len;
1300	return(ret);
1301}
1302
1303/*
1304 - doemit - emit a strip operator
1305 == static void doemit(struct parse *p, sop op, size_t opnd);
1306 *
1307 * It might seem better to implement this as a macro with a function as
1308 * hard-case backup, but it's just too big and messy unless there are
1309 * some changes to the data structures.  Maybe later.
1310 */
1311static void
1312doemit(p, op, opnd)
1313struct parse *p;
1314sop op;
1315size_t opnd;
1316{
1317	/* avoid making error situations worse */
1318	if (p->error != 0)
1319		return;
1320
1321	/* deal with oversize operands ("can't happen", more or less) */
1322	assert(opnd < 1<<OPSHIFT);
1323
1324	/* deal with undersized strip */
1325	if (p->slen >= p->ssize)
1326		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1327	assert(p->slen < p->ssize);
1328
1329	/* finally, it's all reduced to the easy case */
1330	p->strip[p->slen++] = SOP(op, opnd);
1331}
1332
1333/*
1334 - doinsert - insert a sop into the strip
1335 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1336 */
1337static void
1338doinsert(p, op, opnd, pos)
1339struct parse *p;
1340sop op;
1341size_t opnd;
1342sopno pos;
1343{
1344	sopno sn;
1345	sop s;
1346	int i;
1347
1348	/* avoid making error situations worse */
1349	if (p->error != 0)
1350		return;
1351
1352	sn = HERE();
1353	EMIT(op, opnd);		/* do checks, ensure space */
1354	assert(HERE() == sn+1);
1355	s = p->strip[sn];
1356
1357	/* adjust paren pointers */
1358	assert(pos > 0);
1359	for (i = 1; i < NPAREN; i++) {
1360		if (p->pbegin[i] >= pos) {
1361			p->pbegin[i]++;
1362		}
1363		if (p->pend[i] >= pos) {
1364			p->pend[i]++;
1365		}
1366	}
1367
1368	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1369						(HERE()-pos-1)*sizeof(sop));
1370	p->strip[pos] = s;
1371}
1372
1373/*
1374 - dofwd - complete a forward reference
1375 == static void dofwd(struct parse *p, sopno pos, sop value);
1376 */
1377static void
1378dofwd(p, pos, value)
1379struct parse *p;
1380sopno pos;
1381sop value;
1382{
1383	/* avoid making error situations worse */
1384	if (p->error != 0)
1385		return;
1386
1387	assert(value < 1<<OPSHIFT);
1388	p->strip[pos] = OP(p->strip[pos]) | value;
1389}
1390
1391/*
1392 - enlarge - enlarge the strip
1393 == static void enlarge(struct parse *p, sopno size);
1394 */
1395static void
1396enlarge(p, size)
1397struct parse *p;
1398sopno size;
1399{
1400	sop *sp;
1401
1402	if (p->ssize >= size)
1403		return;
1404
1405	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1406	if (sp == NULL) {
1407		SETERROR(REG_ESPACE);
1408		return;
1409	}
1410	p->strip = sp;
1411	p->ssize = size;
1412}
1413
1414/*
1415 - stripsnug - compact the strip
1416 == static void stripsnug(struct parse *p, struct re_guts *g);
1417 */
1418static void
1419stripsnug(p, g)
1420struct parse *p;
1421struct re_guts *g;
1422{
1423	g->nstates = p->slen;
1424	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1425	if (g->strip == NULL) {
1426		SETERROR(REG_ESPACE);
1427		g->strip = p->strip;
1428	}
1429}
1430
1431/*
1432 - findmust - fill in must and mlen with longest mandatory literal string
1433 == static void findmust(struct parse *p, struct re_guts *g);
1434 *
1435 * This algorithm could do fancy things like analyzing the operands of |
1436 * for common subsequences.  Someday.  This code is simple and finds most
1437 * of the interesting cases.
1438 *
1439 * Note that must and mlen got initialized during setup.
1440 */
1441static void
1442findmust(p, g)
1443struct parse *p;
1444struct re_guts *g;
1445{
1446	sop *scan;
1447	sop *start;
1448	sop *newstart;
1449	sopno newlen;
1450	sop s;
1451	char *cp;
1452	int offset;
1453	char buf[MB_LEN_MAX];
1454	size_t clen;
1455	mbstate_t mbs;
1456
1457	/* avoid making error situations worse */
1458	if (p->error != 0)
1459		return;
1460
1461	/*
1462	 * It's not generally safe to do a ``char'' substring search on
1463	 * multibyte character strings, but it's safe for at least
1464	 * UTF-8 (see RFC 3629).
1465	 */
1466	if (MB_CUR_MAX > 1 &&
1467	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1468		return;
1469
1470	/* find the longest OCHAR sequence in strip */
1471	newlen = 0;
1472	offset = 0;
1473	g->moffset = 0;
1474	scan = g->strip + 1;
1475	do {
1476		s = *scan++;
1477		switch (OP(s)) {
1478		case OCHAR:		/* sequence member */
1479			if (newlen == 0) {		/* new sequence */
1480				memset(&mbs, 0, sizeof(mbs));
1481				newstart = scan - 1;
1482			}
1483			clen = wcrtomb(buf, OPND(s), &mbs);
1484			if (clen == (size_t)-1)
1485				goto toohard;
1486			newlen += clen;
1487			break;
1488		case OPLUS_:		/* things that don't break one */
1489		case OLPAREN:
1490		case ORPAREN:
1491			break;
1492		case OQUEST_:		/* things that must be skipped */
1493		case OCH_:
1494			offset = altoffset(scan, offset);
1495			scan--;
1496			do {
1497				scan += OPND(s);
1498				s = *scan;
1499				/* assert() interferes w debug printouts */
1500				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1501							OP(s) != OOR2) {
1502					g->iflags |= BAD;
1503					return;
1504				}
1505			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1506			/* FALLTHROUGH */
1507		case OBOW:		/* things that break a sequence */
1508		case OEOW:
1509		case OBOL:
1510		case OEOL:
1511		case O_QUEST:
1512		case O_CH:
1513		case OEND:
1514			if (newlen > g->mlen) {		/* ends one */
1515				start = newstart;
1516				g->mlen = newlen;
1517				if (offset > -1) {
1518					g->moffset += offset;
1519					offset = newlen;
1520				} else
1521					g->moffset = offset;
1522			} else {
1523				if (offset > -1)
1524					offset += newlen;
1525			}
1526			newlen = 0;
1527			break;
1528		case OANY:
1529			if (newlen > g->mlen) {		/* ends one */
1530				start = newstart;
1531				g->mlen = newlen;
1532				if (offset > -1) {
1533					g->moffset += offset;
1534					offset = newlen;
1535				} else
1536					g->moffset = offset;
1537			} else {
1538				if (offset > -1)
1539					offset += newlen;
1540			}
1541			if (offset > -1)
1542				offset++;
1543			newlen = 0;
1544			break;
1545		case OANYOF:		/* may or may not invalidate offset */
1546			/* First, everything as OANY */
1547			if (newlen > g->mlen) {		/* ends one */
1548				start = newstart;
1549				g->mlen = newlen;
1550				if (offset > -1) {
1551					g->moffset += offset;
1552					offset = newlen;
1553				} else
1554					g->moffset = offset;
1555			} else {
1556				if (offset > -1)
1557					offset += newlen;
1558			}
1559			if (offset > -1)
1560				offset++;
1561			newlen = 0;
1562			break;
1563		toohard:
1564		default:
1565			/* Anything here makes it impossible or too hard
1566			 * to calculate the offset -- so we give up;
1567			 * save the last known good offset, in case the
1568			 * must sequence doesn't occur later.
1569			 */
1570			if (newlen > g->mlen) {		/* ends one */
1571				start = newstart;
1572				g->mlen = newlen;
1573				if (offset > -1)
1574					g->moffset += offset;
1575				else
1576					g->moffset = offset;
1577			}
1578			offset = -1;
1579			newlen = 0;
1580			break;
1581		}
1582	} while (OP(s) != OEND);
1583
1584	if (g->mlen == 0) {		/* there isn't one */
1585		g->moffset = -1;
1586		return;
1587	}
1588
1589	/* turn it into a character string */
1590	g->must = malloc((size_t)g->mlen + 1);
1591	if (g->must == NULL) {		/* argh; just forget it */
1592		g->mlen = 0;
1593		g->moffset = -1;
1594		return;
1595	}
1596	cp = g->must;
1597	scan = start;
1598	memset(&mbs, 0, sizeof(mbs));
1599	while (cp < g->must + g->mlen) {
1600		while (OP(s = *scan++) != OCHAR)
1601			continue;
1602		clen = wcrtomb(cp, OPND(s), &mbs);
1603		assert(clen != (size_t)-1);
1604		cp += clen;
1605	}
1606	assert(cp == g->must + g->mlen);
1607	*cp++ = '\0';		/* just on general principles */
1608}
1609
1610/*
1611 - altoffset - choose biggest offset among multiple choices
1612 == static int altoffset(sop *scan, int offset);
1613 *
1614 * Compute, recursively if necessary, the largest offset among multiple
1615 * re paths.
1616 */
1617static int
1618altoffset(scan, offset)
1619sop *scan;
1620int offset;
1621{
1622	int largest;
1623	int try;
1624	sop s;
1625
1626	/* If we gave up already on offsets, return */
1627	if (offset == -1)
1628		return -1;
1629
1630	largest = 0;
1631	try = 0;
1632	s = *scan++;
1633	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1634		switch (OP(s)) {
1635		case OOR1:
1636			if (try > largest)
1637				largest = try;
1638			try = 0;
1639			break;
1640		case OQUEST_:
1641		case OCH_:
1642			try = altoffset(scan, try);
1643			if (try == -1)
1644				return -1;
1645			scan--;
1646			do {
1647				scan += OPND(s);
1648				s = *scan;
1649				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1650							OP(s) != OOR2)
1651					return -1;
1652			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1653			/* We must skip to the next position, or we'll
1654			 * leave altoffset() too early.
1655			 */
1656			scan++;
1657			break;
1658		case OANYOF:
1659		case OCHAR:
1660		case OANY:
1661			try++;
1662		case OBOW:
1663		case OEOW:
1664		case OLPAREN:
1665		case ORPAREN:
1666		case OOR2:
1667			break;
1668		default:
1669			try = -1;
1670			break;
1671		}
1672		if (try == -1)
1673			return -1;
1674		s = *scan++;
1675	}
1676
1677	if (try > largest)
1678		largest = try;
1679
1680	return largest+offset;
1681}
1682
1683/*
1684 - computejumps - compute char jumps for BM scan
1685 == static void computejumps(struct parse *p, struct re_guts *g);
1686 *
1687 * This algorithm assumes g->must exists and is has size greater than
1688 * zero. It's based on the algorithm found on Computer Algorithms by
1689 * Sara Baase.
1690 *
1691 * A char jump is the number of characters one needs to jump based on
1692 * the value of the character from the text that was mismatched.
1693 */
1694static void
1695computejumps(p, g)
1696struct parse *p;
1697struct re_guts *g;
1698{
1699	int ch;
1700	int mindex;
1701
1702	/* Avoid making errors worse */
1703	if (p->error != 0)
1704		return;
1705
1706	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1707	if (g->charjump == NULL)	/* Not a fatal error */
1708		return;
1709	/* Adjust for signed chars, if necessary */
1710	g->charjump = &g->charjump[-(CHAR_MIN)];
1711
1712	/* If the character does not exist in the pattern, the jump
1713	 * is equal to the number of characters in the pattern.
1714	 */
1715	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1716		g->charjump[ch] = g->mlen;
1717
1718	/* If the character does exist, compute the jump that would
1719	 * take us to the last character in the pattern equal to it
1720	 * (notice that we match right to left, so that last character
1721	 * is the first one that would be matched).
1722	 */
1723	for (mindex = 0; mindex < g->mlen; mindex++)
1724		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1725}
1726
1727/*
1728 - computematchjumps - compute match jumps for BM scan
1729 == static void computematchjumps(struct parse *p, struct re_guts *g);
1730 *
1731 * This algorithm assumes g->must exists and is has size greater than
1732 * zero. It's based on the algorithm found on Computer Algorithms by
1733 * Sara Baase.
1734 *
1735 * A match jump is the number of characters one needs to advance based
1736 * on the already-matched suffix.
1737 * Notice that all values here are minus (g->mlen-1), because of the way
1738 * the search algorithm works.
1739 */
1740static void
1741computematchjumps(p, g)
1742struct parse *p;
1743struct re_guts *g;
1744{
1745	int mindex;		/* General "must" iterator */
1746	int suffix;		/* Keeps track of matching suffix */
1747	int ssuffix;		/* Keeps track of suffixes' suffix */
1748	int* pmatches;		/* pmatches[k] points to the next i
1749				 * such that i+1...mlen is a substring
1750				 * of k+1...k+mlen-i-1
1751				 */
1752
1753	/* Avoid making errors worse */
1754	if (p->error != 0)
1755		return;
1756
1757	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
1758	if (pmatches == NULL) {
1759		g->matchjump = NULL;
1760		return;
1761	}
1762
1763	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
1764	if (g->matchjump == NULL)	/* Not a fatal error */
1765		return;
1766
1767	/* Set maximum possible jump for each character in the pattern */
1768	for (mindex = 0; mindex < g->mlen; mindex++)
1769		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1770
1771	/* Compute pmatches[] */
1772	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1773	    mindex--, suffix--) {
1774		pmatches[mindex] = suffix;
1775
1776		/* If a mismatch is found, interrupting the substring,
1777		 * compute the matchjump for that position. If no
1778		 * mismatch is found, then a text substring mismatched
1779		 * against the suffix will also mismatch against the
1780		 * substring.
1781		 */
1782		while (suffix < g->mlen
1783		    && g->must[mindex] != g->must[suffix]) {
1784			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1785			    g->mlen - mindex - 1);
1786			suffix = pmatches[suffix];
1787		}
1788	}
1789
1790	/* Compute the matchjump up to the last substring found to jump
1791	 * to the beginning of the largest must pattern prefix matching
1792	 * it's own suffix.
1793	 */
1794	for (mindex = 0; mindex <= suffix; mindex++)
1795		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1796		    g->mlen + suffix - mindex);
1797
1798        ssuffix = pmatches[suffix];
1799        while (suffix < g->mlen) {
1800                while (suffix <= ssuffix && suffix < g->mlen) {
1801                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
1802			    g->mlen + ssuffix - suffix);
1803                        suffix++;
1804                }
1805		if (suffix < g->mlen)
1806                	ssuffix = pmatches[ssuffix];
1807        }
1808
1809	free(pmatches);
1810}
1811
1812/*
1813 - pluscount - count + nesting
1814 == static sopno pluscount(struct parse *p, struct re_guts *g);
1815 */
1816static sopno			/* nesting depth */
1817pluscount(p, g)
1818struct parse *p;
1819struct re_guts *g;
1820{
1821	sop *scan;
1822	sop s;
1823	sopno plusnest = 0;
1824	sopno maxnest = 0;
1825
1826	if (p->error != 0)
1827		return(0);	/* there may not be an OEND */
1828
1829	scan = g->strip + 1;
1830	do {
1831		s = *scan++;
1832		switch (OP(s)) {
1833		case OPLUS_:
1834			plusnest++;
1835			break;
1836		case O_PLUS:
1837			if (plusnest > maxnest)
1838				maxnest = plusnest;
1839			plusnest--;
1840			break;
1841		}
1842	} while (OP(s) != OEND);
1843	if (plusnest != 0)
1844		g->iflags |= BAD;
1845	return(maxnest);
1846}
1847