118334Speter/*	$NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $ */
218334Speter/*-
318334Speter * Copyright (c) 1992, 1993, 1994 Henry Spencer.
418334Speter * Copyright (c) 1992, 1993, 1994
518334Speter *	The Regents of the University of California.  All rights reserved.
618334Speter *
718334Speter * This code is derived from software contributed to Berkeley by
818334Speter * Henry Spencer of the University of Toronto.
918334Speter *
1018334Speter * Redistribution and use in source and binary forms, with or without
1118334Speter * modification, are permitted provided that the following conditions
1218334Speter * are met:
1318334Speter * 1. Redistributions of source code must retain the above copyright
1418334Speter *    notice, this list of conditions and the following disclaimer.
1518334Speter * 2. Redistributions in binary form must reproduce the above copyright
1618334Speter *    notice, this list of conditions and the following disclaimer in the
1718334Speter *    documentation and/or other materials provided with the distribution.
1818334Speter * 3. All advertising materials mentioning features or use of this software
1918334Speter *    must display the following acknowledgement:
2018334Speter *	This product includes software developed by the University of
2118334Speter *	California, Berkeley and its contributors.
2218334Speter * 4. Neither the name of the University nor the names of its contributors
2318334Speter *    may be used to endorse or promote products derived from this software
2418334Speter *    without specific prior written permission.
2518334Speter *
2618334Speter * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2718334Speter * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2818334Speter * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2918334Speter * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
3018334Speter * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
3118334Speter * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
3218334Speter * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
3318334Speter * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3418334Speter * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3518334Speter * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3618334Speter * SUCH DAMAGE.
3718334Speter *
3818334Speter *	@(#)regcomp.c	8.4 (Berkeley) 3/19/94
3918334Speter */
4018334Speter
4118334Speter#include <sys/cdefs.h>
4218334Speter#if 0
4318334Speter#if defined(LIBC_SCCS) && !defined(lint)
4418334Speterstatic char sccsid[] = "@(#)regcomp.c	8.4 (Berkeley) 3/19/94";
4518334Speter#endif /* LIBC_SCCS and not lint */
4618334Speter#else
4718334Speter__RCSID("$NetBSD: regcomp.c,v 1.8 2021/05/17 04:01:57 rin Exp $");
4818334Speter#endif
4918334Speter
5018334Speter#include <sys/types.h>
5118334Speter#include <stdio.h>
5218334Speter#include <string.h>
5318334Speter#include <ctype.h>
5418334Speter#include <limits.h>
5518334Speter#include <stdlib.h>
5618334Speter#include <regex.h>
5718334Speter
5818334Speter#include "utils.h"
5918334Speter#include "regex2.h"
6018334Speter
6118334Speter#include "cclass.h"
6218334Speter#include "cname.h"
6318334Speter
6418334Speter/*
6518334Speter * parse structure, passed up and down to avoid global variables and
6618334Speter * other clumsinesses
6718334Speter */
6818334Speterstruct parse {
6918334Speter	RCHAR_T *next;		/* next character in RE */
7018334Speter	RCHAR_T *end;		/* end of string (-> NUL normally) */
7118334Speter	int error;		/* has an error been seen? */
7218334Speter	sop *strip;		/* malloced strip */
7318334Speter	RCHAR_T *stripdata;	/* malloced stripdata */
7418334Speter	sopno ssize;		/* malloced strip size (allocated) */
7518334Speter	sopno slen;		/* malloced strip length (used) */
7618334Speter	int ncsalloc;		/* number of csets allocated */
7718334Speter	struct re_guts *g;
7818334Speter#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
7918334Speter	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
8018334Speter	sopno pend[NPAREN];	/* -> ) ([0] unused) */
8118334Speter};
8218334Speter
8318334Speter/* ========= begin header generated by ./mkh ========= */
8418334Speter#ifdef __cplusplus
8518334Speterextern "C" {
8618334Speter#endif
8718334Speter
8818334Speter/* === regcomp.c === */
8918334Speterstatic void p_ere __P((struct parse *p, int stop, size_t reclimit));
9018334Speterstatic void p_ere_exp __P((struct parse *p, size_t reclimit));
9118334Speterstatic void p_str __P((struct parse *p));
9218334Speterstatic void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
9318334Speterstatic int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
9418334Speterstatic int p_count __P((struct parse *p));
9518334Speterstatic void p_bracket __P((struct parse *p));
9618334Speterstatic void p_b_term __P((struct parse *p, cset *cs));
9718334Speterstatic void p_b_cclass __P((struct parse *p, cset *cs));
9818334Speterstatic void p_b_eclass __P((struct parse *p, cset *cs));
9918334Speterstatic char p_b_symbol __P((struct parse *p));
10018334Speterstatic char p_b_coll_elem __P((struct parse *p, int endc));
10118334Speterstatic RCHAR_T othercase __P((int ch));
10218334Speterstatic void bothcases __P((struct parse *p, int ch));
10318334Speterstatic void ordinary __P((struct parse *p, int ch));
10418334Speterstatic void nonnewline __P((struct parse *p));
10518334Speterstatic void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
10618334Speterstatic int seterr __P((struct parse *p, int e));
10718334Speterstatic cset *allocset __P((struct parse *p));
10818334Speterstatic void freeset __P((struct parse *p, cset *cs));
10918334Speterstatic int freezeset __P((struct parse *p, cset *cs));
11018334Speterstatic int firstch __P((struct parse *p, cset *cs));
11118334Speterstatic int nch __P((struct parse *p, cset *cs));
11218334Speterstatic void mcadd __P((struct parse *p, cset *cs, const char *cp));
11318334Speter#ifdef notdef
11418334Speterstatic void mcsub __P((cset *cs, char *cp));
11518334Speterstatic int mcin __P((cset *cs, char *cp));
11618334Speterstatic char *mcfind __P((cset *cs, char *cp));
11718334Speter#endif
11818334Speterstatic void mcinvert __P((struct parse *p, cset *cs));
11918334Speterstatic void mccase __P((struct parse *p, cset *cs));
12018334Speter#ifdef notdef
12118334Speterstatic int isinsets __P((struct re_guts *g, int c));
12218334Speterstatic int samesets __P((struct re_guts *g, int c1, int c2));
12318334Speter#endif
12418334Speterstatic void categorize __P((struct parse *p, struct re_guts *g));
12518334Speterstatic sopno dupl __P((struct parse *p, sopno start, sopno finish));
12618334Speterstatic void doemit __P((struct parse *p, sop op, size_t opnd));
12718334Speterstatic void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
12818334Speterstatic void dofwd __P((struct parse *p, sopno pos, sop value));
12918334Speterstatic int enlarge __P((struct parse *p, sopno size));
13018334Speterstatic void stripsnug __P((struct parse *p, struct re_guts *g));
13118334Speterstatic void findmust __P((struct parse *p, struct re_guts *g));
13218334Speterstatic sopno pluscount __P((struct parse *p, struct re_guts *g));
13318334Speter
13418334Speter#ifdef __cplusplus
13518334Speter}
13618334Speter#endif
13718334Speter/* ========= end header generated by ./mkh ========= */
13818334Speter
13918334Speterstatic RCHAR_T nuls[10];		/* place to point scanner in event of error */
14018334Speter
14118334Speter/*
14218334Speter * macros for use with parse structure
14318334Speter * BEWARE:  these know that the parse structure is named `p' !!!
14418334Speter */
14518334Speter#define	PEEK()	(*p->next)
14618334Speter#define	PEEK2()	(*(p->next+1))
14718334Speter#define	MORE()	(p->next < p->end)
14818334Speter#define	MORE2()	(p->next+1 < p->end)
14918334Speter#define	SEE(c)	(MORE() && PEEK() == (c))
15018334Speter#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
15118334Speter#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
15218334Speter#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
15318334Speter#define	NEXT()	(p->next++)
15418334Speter#define	NEXT2()	(p->next += 2)
15518334Speter#define	NEXTn(n)	(p->next += (n))
15618334Speter#define	GETNEXT()	(*p->next++)
15718334Speter#define	SETERROR(e)	seterr(p, (e))
15818334Speter#define	REQUIRE(co, e)	((co) || SETERROR(e))
15918334Speter#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
16018334Speter#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
16118334Speter#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
16218334Speter#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
16318334Speter#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
16418334Speter#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
16518334Speter#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
16618334Speter#define	HERE()		(p->slen)
16718334Speter#define	THERE()		(p->slen - 1)
16818334Speter#define	THERETHERE()	(p->slen - 2)
16918334Speter#define	DROP(n)	(p->slen -= (n))
17018334Speter
17118334Speter#ifndef NDEBUG
17218334Speterstatic int never = 0;		/* for use in asserts; shuts lint up */
17318334Speter#else
17418334Speter#define	never	0		/* some <assert.h>s have bugs too */
17518334Speter#endif
17618334Speter
17718334Speter#define	MEMLIMIT	0x8000000
17818334Speter#define MEMSIZE(p) \
17918334Speter	((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
18018334Speter	(p)->ncsalloc * sizeof(cset) + \
18118334Speter	(p)->ssize * sizeof(sop))
18218334Speter#define	RECLIMIT	256
18318334Speter
18418334Speter/*
18518334Speter - regcomp - interface for parser and compilation
18618334Speter = extern int regcomp(regex_t *, const RCHAR_T *, int);
18718334Speter = #define	REG_BASIC	0000
18818334Speter = #define	REG_EXTENDED	0001
18918334Speter = #define	REG_ICASE	0002
19018334Speter = #define	REG_NOSUB	0004
19118334Speter = #define	REG_NEWLINE	0010
19218334Speter = #define	REG_NOSPEC	0020
19318334Speter = #define	REG_PEND	0040
19418334Speter = #define	REG_DUMP	0200
19518334Speter */
19618334Speterint				/* 0 success, otherwise REG_something */
19718334Speterregcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
19818334Speter{
19918334Speter	struct parse pa;
20018334Speter	struct re_guts *g;
20118334Speter	struct parse *p = &pa;
20218334Speter	int i;
20318334Speter	size_t len;
20418334Speter#ifdef REDEBUG
20518334Speter#	define	GOODFLAGS(f)	(f)
20618334Speter#else
20718334Speter#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
20818334Speter#endif
20918334Speter
21018334Speter	cflags = GOODFLAGS(cflags);
21118334Speter	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
21218334Speter		return(REG_INVARG);
21318334Speter
21418334Speter	if (cflags&REG_PEND) {
21518334Speter		if (preg->re_endp < pattern)
21618334Speter			return(REG_INVARG);
21718334Speter		len = preg->re_endp - pattern;
21818334Speter	} else
21918334Speter		len = STRLEN(pattern);
22018334Speter
22118334Speter	/* do the mallocs early so failure handling is easy */
22218334Speter	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
22318334Speter							(NC-1)*sizeof(cat_t));
22418334Speter	if (g == NULL)
22518334Speter		return(REG_ESPACE);
22618334Speter	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
22718334Speter	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
22818334Speter	if (p->strip == NULL) {
22918334Speter		free((char *)g);
23018334Speter		return(REG_ESPACE);
23118334Speter	}
23218334Speter	p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
23318334Speter	if (p->stripdata == NULL) {
23418334Speter		free((char *)p->strip);
23518334Speter		free((char *)g);
23618334Speter		return(REG_ESPACE);
23718334Speter	}
23818334Speter	p->slen = 0;
23918334Speter
24018334Speter	/* set things up */
24118334Speter	p->g = g;
24218334Speter	p->next = (RCHAR_T *)__UNCONST(pattern);	/* convenience; we do not modify it */
24318334Speter	p->end = p->next + len;
24418334Speter	p->error = 0;
24518334Speter	p->ncsalloc = 0;
24618334Speter	for (i = 0; i < NPAREN; i++) {
24718334Speter		p->pbegin[i] = 0;
24818334Speter		p->pend[i] = 0;
24918334Speter	}
25018334Speter	g->csetsize = NC;
25118334Speter	g->sets = NULL;
25218334Speter	g->setbits = NULL;
25318334Speter	g->ncsets = 0;
25418334Speter	g->cflags = cflags;
25518334Speter	g->iflags = 0;
25618334Speter	g->nbol = 0;
25718334Speter	g->neol = 0;
25818334Speter	g->must = NULL;
25918334Speter	g->mlen = 0;
26018334Speter	g->nsub = 0;
26118334Speter#if 0
26218334Speter	g->ncategories = 1;	/* category 0 is "everything else" */
26318334Speter	g->categories = &g->catspace[-(CHAR_MIN)];
26418334Speter	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
26518334Speter#endif
26618334Speter	g->backrefs = 0;
26718334Speter
26818334Speter	/* do it */
26918334Speter	EMIT(OEND, 0);
27018334Speter	g->firststate = THERE();
27118334Speter	if (cflags&REG_EXTENDED)
27218334Speter		p_ere(p, OUT, 0);
27318334Speter	else if (cflags&REG_NOSPEC)
27418334Speter		p_str(p);
27518334Speter	else
27618334Speter		p_bre(p, OUT, OUT, 0);
27718334Speter	EMIT(OEND, 0);
27818334Speter	g->laststate = THERE();
27918334Speter
28018334Speter	/* tidy up loose ends and fill things in */
28118334Speter	categorize(p, g);
28218334Speter	stripsnug(p, g);
28318334Speter	findmust(p, g);
28418334Speter	g->nplus = pluscount(p, g);
28518334Speter	g->magic = MAGIC2;
28618334Speter	preg->re_nsub = g->nsub;
28718334Speter	preg->re_g = g;
28818334Speter	preg->re_magic = MAGIC1;
28918334Speter#ifndef REDEBUG
29018334Speter	/* not debugging, so can't rely on the assert() in regexec() */
29118334Speter	if (g->iflags&BAD)
29218334Speter		SETERROR(REG_ASSERT);
29318334Speter#endif
29418334Speter
29518334Speter	/* win or lose, we're done */
29618334Speter	if (p->error != 0)	/* lose */
29718334Speter		regfree(preg);
29818334Speter	return(p->error);
29918334Speter}
30018334Speter
30118334Speter/*
30218334Speter - p_ere - ERE parser top level, concatenation and alternation
30318334Speter == static void p_ere(struct parse *p, int stop, size_t reclimit);
30418334Speter */
30518334Speterstatic void
30618334Speterp_ere(struct parse *p, int stop, size_t reclimit)
30718334Speter
30818334Speter         			/* character this ERE should end at */
30918334Speter{
31018334Speter	char c;
31118334Speter	sopno prevback = 0;
31218334Speter	sopno prevfwd = 0;
31318334Speter	sopno conc;
31418334Speter	int first = 1;		/* is this the first alternative? */
31518334Speter
31618334Speter	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
31718334Speter		p->error = REG_ESPACE;
31818334Speter		return;
31918334Speter	}
32018334Speter
32118334Speter	for (;;) {
32218334Speter		/* do a bunch of concatenated expressions */
32318334Speter		conc = HERE();
32418334Speter		while (MORE() && (c = PEEK()) != '|' && c != stop)
32518334Speter			p_ere_exp(p, reclimit);
32618334Speter		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
32718334Speter
32818334Speter		if (!EAT('|'))
32918334Speter			break;		/* NOTE BREAK OUT */
330
331		if (first) {
332			INSERT(OCH_, conc);	/* offset is wrong */
333			prevfwd = conc;
334			prevback = conc;
335			first = 0;
336		}
337		ASTERN(OOR1, prevback);
338		prevback = THERE();
339		AHEAD(prevfwd);			/* fix previous offset */
340		prevfwd = HERE();
341		EMIT(OOR2, 0);			/* offset is very wrong */
342	}
343
344	if (!first) {		/* tail-end fixups */
345		AHEAD(prevfwd);
346		ASTERN(O_CH, prevback);
347	}
348
349	assert(!MORE() || SEE(stop));
350}
351
352/*
353 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
354 == static void p_ere_exp(struct parse *p);
355 */
356static void
357p_ere_exp(struct parse *p, size_t reclimit)
358{
359	char c;
360	sopno pos;
361	int count;
362	int count2;
363	sopno subno;
364	int wascaret = 0;
365
366	assert(MORE());		/* caller should have ensured this */
367	c = GETNEXT();
368
369	pos = HERE();
370	switch (c) {
371	case '(':
372		(void)REQUIRE(MORE(), REG_EPAREN);
373		p->g->nsub++;
374		subno = p->g->nsub;
375		if (subno < NPAREN)
376			p->pbegin[subno] = HERE();
377		EMIT(OLPAREN, subno);
378		if (!SEE(')'))
379			p_ere(p, ')', reclimit);
380		if (subno < NPAREN) {
381			p->pend[subno] = HERE();
382			assert(p->pend[subno] != 0);
383		}
384		EMIT(ORPAREN, subno);
385		(void)MUSTEAT(')', REG_EPAREN);
386		break;
387#ifndef POSIX_MISTAKE
388	case ')':		/* happens only if no current unmatched ( */
389		/*
390		 * You may ask, why the ifndef?  Because I didn't notice
391		 * this until slightly too late for 1003.2, and none of the
392		 * other 1003.2 regular-expression reviewers noticed it at
393		 * all.  So an unmatched ) is legal POSIX, at least until
394		 * we can get it fixed.
395		 */
396		SETERROR(REG_EPAREN);
397		break;
398#endif
399	case '^':
400		EMIT(OBOL, 0);
401		p->g->iflags |= USEBOL;
402		p->g->nbol++;
403		wascaret = 1;
404		break;
405	case '$':
406		EMIT(OEOL, 0);
407		p->g->iflags |= USEEOL;
408		p->g->neol++;
409		break;
410	case '|':
411		SETERROR(REG_EMPTY);
412		break;
413	case '*':
414	case '+':
415	case '?':
416		SETERROR(REG_BADRPT);
417		break;
418	case '.':
419		if (p->g->cflags&REG_NEWLINE)
420			nonnewline(p);
421		else
422			EMIT(OANY, 0);
423		break;
424	case '[':
425		p_bracket(p);
426		break;
427	case '\\':
428		(void)REQUIRE(MORE(), REG_EESCAPE);
429		c = GETNEXT();
430		ordinary(p, c);
431		break;
432	case '{':		/* okay as ordinary except if digit follows */
433		(void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
434		/* FALLTHROUGH */
435	default:
436		ordinary(p, c);
437		break;
438	}
439
440	if (!MORE())
441		return;
442	c = PEEK();
443	/* we call { a repetition if followed by a digit */
444	if (!( c == '*' || c == '+' || c == '?' ||
445				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
446		return;		/* no repetition, we're done */
447	NEXT();
448
449	(void)REQUIRE(!wascaret, REG_BADRPT);
450	switch (c) {
451	case '*':	/* implemented as +? */
452		/* this case does not require the (y|) trick, noKLUDGE */
453		INSERT(OPLUS_, pos);
454		ASTERN(O_PLUS, pos);
455		INSERT(OQUEST_, pos);
456		ASTERN(O_QUEST, pos);
457		break;
458	case '+':
459		INSERT(OPLUS_, pos);
460		ASTERN(O_PLUS, pos);
461		break;
462	case '?':
463		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
464		INSERT(OCH_, pos);		/* offset slightly wrong */
465		ASTERN(OOR1, pos);		/* this one's right */
466		AHEAD(pos);			/* fix the OCH_ */
467		EMIT(OOR2, 0);			/* offset very wrong... */
468		AHEAD(THERE());			/* ...so fix it */
469		ASTERN(O_CH, THERETHERE());
470		break;
471	case '{':
472		count = p_count(p);
473		if (EAT(',')) {
474			if (ISDIGIT((UCHAR_T)PEEK())) {
475				count2 = p_count(p);
476				(void)REQUIRE(count <= count2, REG_BADBR);
477			} else		/* single number with comma */
478				count2 = INFINITY;
479		} else		/* just a single number */
480			count2 = count;
481		repeat(p, pos, count, count2, 0);
482		if (!EAT('}')) {	/* error heuristics */
483			while (MORE() && PEEK() != '}')
484				NEXT();
485			(void)REQUIRE(MORE(), REG_EBRACE);
486			SETERROR(REG_BADBR);
487		}
488		break;
489	}
490
491	if (!MORE())
492		return;
493	c = PEEK();
494	if (!( c == '*' || c == '+' || c == '?' ||
495				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
496		return;
497	SETERROR(REG_BADRPT);
498}
499
500/*
501 - p_str - string (no metacharacters) "parser"
502 == static void p_str(struct parse *p);
503 */
504static void
505p_str(struct parse *p)
506{
507	(void)REQUIRE(MORE(), REG_EMPTY);
508	while (MORE())
509		ordinary(p, GETNEXT());
510}
511
512/*
513 - p_bre - BRE parser top level, anchoring and concatenation
514 == static void p_bre(struct parse *p, int end1, \
515 ==	int end2, size_t reclimit);
516 * Giving end1 as OUT essentially eliminates the end1/end2 check.
517 *
518 * This implementation is a bit of a kludge, in that a trailing $ is first
519 * taken as an ordinary character and then revised to be an anchor.  The
520 * only undesirable side effect is that '$' gets included as a character
521 * category in such cases.  This is fairly harmless; not worth fixing.
522 * The amount of lookahead needed to avoid this kludge is excessive.
523 */
524static void
525p_bre(struct parse *p, int end1, int end2, size_t reclimit)
526
527                  		/* first terminating character */
528                  		/* second terminating character */
529{
530	sopno start;
531	int first = 1;			/* first subexpression? */
532	int wasdollar = 0;
533
534	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
535		p->error = REG_ESPACE;
536		return;
537	}
538
539	start = HERE();
540
541	if (EAT('^')) {
542		EMIT(OBOL, 0);
543		p->g->iflags |= USEBOL;
544		p->g->nbol++;
545	}
546	while (MORE() && !SEETWO(end1, end2)) {
547		wasdollar = p_simp_re(p, first, reclimit);
548		first = 0;
549	}
550	if (wasdollar) {	/* oops, that was a trailing anchor */
551		DROP(1);
552		EMIT(OEOL, 0);
553		p->g->iflags |= USEEOL;
554		p->g->neol++;
555	}
556
557	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
558}
559
560/*
561 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
562 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
563 */
564static int			/* was the simple RE an unbackslashed $? */
565p_simp_re(struct parse *p, int starordinary, size_t reclimit)
566
567                 		/* is a leading * an ordinary character? */
568{
569	int c;
570	int count;
571	int count2;
572	sopno pos;
573	int i;
574	sopno subno;
575	int backsl;
576
577	pos = HERE();		/* repetion op, if any, covers from here */
578
579	assert(MORE());		/* caller should have ensured this */
580	c = GETNEXT();
581	backsl = c == '\\';
582	if (backsl) {
583		(void)REQUIRE(MORE(), REG_EESCAPE);
584		c = (unsigned char)GETNEXT();
585		switch (c) {
586		case '{':
587			SETERROR(REG_BADRPT);
588			break;
589		case '(':
590			p->g->nsub++;
591			subno = p->g->nsub;
592			if (subno < NPAREN)
593				p->pbegin[subno] = HERE();
594			EMIT(OLPAREN, subno);
595			/* the MORE here is an error heuristic */
596			if (MORE() && !SEETWO('\\', ')'))
597				p_bre(p, '\\', ')', reclimit);
598			if (subno < NPAREN) {
599				p->pend[subno] = HERE();
600				assert(p->pend[subno] != 0);
601			}
602			EMIT(ORPAREN, subno);
603			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
604			break;
605		case ')':	/* should not get here -- must be user */
606		case '}':
607			SETERROR(REG_EPAREN);
608			break;
609		case '1':
610		case '2':
611		case '3':
612		case '4':
613		case '5':
614		case '6':
615		case '7':
616		case '8':
617		case '9':
618			i = c - '0';
619			assert(i < NPAREN);
620			if (p->pend[i] != 0) {
621				assert(i <= p->g->nsub);
622				EMIT(OBACK_, i);
623				assert(p->pbegin[i] != 0);
624				assert(p->strip[p->pbegin[i]] == OLPAREN);
625				assert(p->strip[p->pend[i]] == ORPAREN);
626				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
627				EMIT(O_BACK, i);
628			} else
629				SETERROR(REG_ESUBREG);
630			p->g->backrefs = 1;
631			break;
632		default:
633			ordinary(p, c);
634			break;
635		}
636	} else {
637		switch (c) {
638		case '.':
639			if (p->g->cflags&REG_NEWLINE)
640				nonnewline(p);
641			else
642				EMIT(OANY, 0);
643			break;
644		case '[':
645			p_bracket(p);
646			break;
647		case '*':
648			(void)REQUIRE(starordinary, REG_BADRPT);
649			/* FALLTHROUGH */
650		default:
651			ordinary(p, c);
652			break;
653		}
654	}
655
656	if (EAT('*')) {		/* implemented as +? */
657		/* this case does not require the (y|) trick, noKLUDGE */
658		INSERT(OPLUS_, pos);
659		ASTERN(O_PLUS, pos);
660		INSERT(OQUEST_, pos);
661		ASTERN(O_QUEST, pos);
662	} else if (EATTWO('\\', '{')) {
663		count = p_count(p);
664		if (EAT(',')) {
665			if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
666				count2 = p_count(p);
667				(void)REQUIRE(count <= count2, REG_BADBR);
668			} else		/* single number with comma */
669				count2 = INFINITY;
670		} else		/* just a single number */
671			count2 = count;
672		repeat(p, pos, count, count2, reclimit);
673		if (!EATTWO('\\', '}')) {	/* error heuristics */
674			while (MORE() && !SEETWO('\\', '}'))
675				NEXT();
676			(void)REQUIRE(MORE(), REG_EBRACE);
677			SETERROR(REG_BADBR);
678		}
679	} else if (!backsl && c == (unsigned char)'$')	/* $ (but not \$) ends it */
680		return(1);
681
682	return(0);
683}
684
685/*
686 - p_count - parse a repetition count
687 == static int p_count(struct parse *p);
688 */
689static int			/* the value */
690p_count(struct parse *p)
691{
692	int count = 0;
693	int ndigits = 0;
694
695	while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
696		count = count*10 + (GETNEXT() - '0');
697		ndigits++;
698	}
699
700	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
701	return(count);
702}
703
704/*
705 - p_bracket - parse a bracketed character list
706 == static void p_bracket(struct parse *p);
707 *
708 * Note a significant property of this code:  if the allocset() did SETERROR,
709 * no set operations are done.
710 */
711static void
712p_bracket(struct parse *p)
713{
714	cset *cs;
715	int invert = 0;
716	static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
717	static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
718
719	cs = allocset(p);
720	if (cs == NULL)
721		return;
722
723	/* Dept of Truly Sickening Special-Case Kludges */
724	if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
725		EMIT(OBOW, 0);
726		NEXTn(6);
727		return;
728	}
729	if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
730		EMIT(OEOW, 0);
731		NEXTn(6);
732		return;
733	}
734
735	if (EAT('^'))
736		invert++;	/* make note to invert set at end */
737	if (EAT(']'))
738		CHadd(cs, ']');
739	else if (EAT('-'))
740		CHadd(cs, '-');
741	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
742		p_b_term(p, cs);
743	if (EAT('-'))
744		CHadd(cs, '-');
745	(void)MUSTEAT(']', REG_EBRACK);
746
747	if (p->error != 0)	/* don't mess things up further */
748		return;
749
750	if (p->g->cflags&REG_ICASE) {
751		int i;
752		int ci;
753
754		for (i = p->g->csetsize - 1; i >= 0; i--)
755			if (CHIN(cs, i) && ISALPHA2(i)) {
756				ci = othercase(i);
757				if (ci != i)
758					CHadd(cs, ci);
759			}
760		if (cs->multis != NULL)
761			mccase(p, cs);
762	}
763	if (invert) {
764		int i;
765
766		for (i = p->g->csetsize - 1; i >= 0; i--)
767			if (CHIN(cs, i))
768				CHsub(cs, i);
769			else
770				CHadd(cs, i);
771		if (p->g->cflags&REG_NEWLINE)
772			CHsub(cs, '\n');
773		if (cs->multis != NULL)
774			mcinvert(p, cs);
775	}
776
777	assert(cs->multis == NULL);		/* xxx */
778
779	if (nch(p, cs) == 1) {		/* optimize singleton sets */
780		ordinary(p, firstch(p, cs));
781		freeset(p, cs);
782	} else
783		EMIT(OANYOF, freezeset(p, cs));
784}
785
786/*
787 - p_b_term - parse one term of a bracketed character list
788 == static void p_b_term(struct parse *p, cset *cs);
789 */
790static void
791p_b_term(struct parse *p, cset *cs)
792{
793	char c;
794	char start, finish;
795	int i;
796
797	/* classify what we've got */
798	switch ((MORE()) ? PEEK() : '\0') {
799	case '[':
800		c = (MORE2()) ? PEEK2() : '\0';
801		break;
802	case '-':
803		SETERROR(REG_ERANGE);
804		return;			/* NOTE RETURN */
805		break;
806	default:
807		c = '\0';
808		break;
809	}
810
811	switch (c) {
812	case ':':		/* character class */
813		NEXT2();
814		(void)REQUIRE(MORE(), REG_EBRACK);
815		c = PEEK();
816		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
817		p_b_cclass(p, cs);
818		(void)REQUIRE(MORE(), REG_EBRACK);
819		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
820		break;
821	case '=':		/* equivalence class */
822		NEXT2();
823		(void)REQUIRE(MORE(), REG_EBRACK);
824		c = PEEK();
825		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
826		p_b_eclass(p, cs);
827		(void)REQUIRE(MORE(), REG_EBRACK);
828		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
829		break;
830	default:		/* symbol, ordinary character, or range */
831/* xxx revision needed for multichar stuff */
832		start = p_b_symbol(p);
833		if (SEE('-') && MORE2() && PEEK2() != ']') {
834			/* range */
835			NEXT();
836			if (EAT('-'))
837				finish = '-';
838			else
839				finish = p_b_symbol(p);
840		} else
841			finish = start;
842/* xxx what about signed chars here... */
843		(void)REQUIRE(start <= finish, REG_ERANGE);
844		for (i = start; i <= finish; i++)
845			CHadd(cs, i);
846		break;
847	}
848}
849
850/*
851 - p_b_cclass - parse a character-class name and deal with it
852 == static void p_b_cclass(struct parse *p, cset *cs);
853 */
854static void
855p_b_cclass(struct parse *p, cset *cs)
856{
857	RCHAR_T *sp = p->next;
858	struct cclass *cp;
859	size_t len;
860	const char *u;
861	char c;
862
863	while (MORE() && ISALPHA2(PEEK()))
864		NEXT();
865	len = p->next - sp;
866	for (cp = cclasses; cp->name != NULL; cp++)
867		if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
868			break;
869	if (cp->name == NULL) {
870		/* oops, didn't find it */
871		SETERROR(REG_ECTYPE);
872		return;
873	}
874
875	u = cp->chars;
876	while ((c = *u++) != '\0')
877		CHadd(cs, c);
878	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
879		MCadd(p, cs, u);
880}
881
882/*
883 - p_b_eclass - parse an equivalence-class name and deal with it
884 == static void p_b_eclass(struct parse *p, cset *cs);
885 *
886 * This implementation is incomplete. xxx
887 */
888static void
889p_b_eclass(struct parse *p, cset *cs)
890{
891	char c;
892
893	c = p_b_coll_elem(p, '=');
894	CHadd(cs, c);
895}
896
897/*
898 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
899 == static char p_b_symbol(struct parse *p);
900 */
901static char			/* value of symbol */
902p_b_symbol(struct parse *p)
903{
904	char value;
905
906	(void)REQUIRE(MORE(), REG_EBRACK);
907	if (!EATTWO('[', '.'))
908		return(GETNEXT());
909
910	/* collating symbol */
911	value = p_b_coll_elem(p, '.');
912	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
913	return(value);
914}
915
916/*
917 - p_b_coll_elem - parse a collating-element name and look it up
918 == static char p_b_coll_elem(struct parse *p, int endc);
919 */
920static char			/* value of collating element */
921p_b_coll_elem(struct parse *p, int endc)
922
923         			/* name ended by endc,']' */
924{
925	RCHAR_T *sp = p->next;
926	struct cname *cp;
927	size_t len;
928
929	while (MORE() && !SEETWO(endc, ']'))
930		NEXT();
931	if (!MORE()) {
932		SETERROR(REG_EBRACK);
933		return(0);
934	}
935	len = p->next - sp;
936	for (cp = cnames; cp->name != NULL; cp++)
937		if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
938			return(cp->code);	/* known name */
939	if (len == 1)
940		return(*sp);	/* single character */
941	SETERROR(REG_ECOLLATE);			/* neither */
942	return(0);
943}
944
945/*
946 - othercase - return the case counterpart of an alphabetic
947 == static char othercase(int ch);
948 */
949static RCHAR_T			/* if no counterpart, return ch */
950othercase(int ch)
951{
952	assert(ISALPHA2(ch));
953	if (ISUPPER(ch))
954		return(TOLOWER(ch));
955	else if (ISLOWER(ch))
956		return(TOUPPER(ch));
957	else			/* peculiar, but could happen */
958		return(ch);
959}
960
961/*
962 - bothcases - emit a dualcase version of a two-case character
963 == static void bothcases(struct parse *p, int ch);
964 *
965 * Boy, is this implementation ever a kludge...
966 */
967static void
968bothcases(struct parse *p, int ch)
969{
970	RCHAR_T *oldnext = p->next;
971	RCHAR_T *oldend = p->end;
972	RCHAR_T bracket[3];
973
974	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
975	p->next = bracket;
976	p->end = bracket+2;
977	bracket[0] = ch;
978	bracket[1] = ']';
979	bracket[2] = '\0';
980	p_bracket(p);
981	assert(p->next == bracket+2);
982	p->next = oldnext;
983	p->end = oldend;
984}
985
986/*
987 - ordinary - emit an ordinary character
988 == static void ordinary(struct parse *p, int ch);
989 */
990static void
991ordinary(struct parse *p, int ch)
992{
993/*
994	cat_t *cap = p->g->categories;
995*/
996
997	if ((p->g->cflags&REG_ICASE) && ISALPHA2(ch) && othercase(ch) != ch)
998		bothcases(p, ch);
999	else {
1000		EMIT(OCHAR, (UCHAR_T)ch);
1001/*
1002		if (cap[ch] == 0)
1003			cap[ch] = p->g->ncategories++;
1004*/
1005	}
1006}
1007
1008/*
1009 - nonnewline - emit REG_NEWLINE version of OANY
1010 == static void nonnewline(struct parse *p);
1011 *
1012 * Boy, is this implementation ever a kludge...
1013 */
1014static void
1015nonnewline(struct parse *p)
1016{
1017	RCHAR_T *oldnext = p->next;
1018	RCHAR_T *oldend = p->end;
1019	RCHAR_T bracket[4];
1020
1021	p->next = bracket;
1022	p->end = bracket+3;
1023	bracket[0] = '^';
1024	bracket[1] = '\n';
1025	bracket[2] = ']';
1026	bracket[3] = '\0';
1027	p_bracket(p);
1028	assert(p->next == bracket+3);
1029	p->next = oldnext;
1030	p->end = oldend;
1031}
1032
1033/*
1034 - repeat - generate code for a bounded repetition, recursively if needed
1035 == static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit);
1036 */
1037static void
1038repeat(struct parse *p, sopno start, int from, int to, size_t reclimit)
1039
1040            			/* operand from here to end of strip */
1041         			/* repeated from this number */
1042       				/* to this number of times (maybe INFINITY) */
1043{
1044	sopno finish;
1045#	define	N	2
1046#	define	INF	3
1047#	define	REP(f, t)	((f)*8 + (t))
1048#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1049	sopno copy;
1050
1051	if (reclimit++ > RECLIMIT)
1052		p->error = REG_ESPACE;
1053	if (p->error)
1054		return;
1055
1056	finish = HERE();
1057
1058	assert(from <= to);
1059
1060	switch (REP(MAP(from), MAP(to))) {
1061	case REP(0, 0):			/* must be user doing this */
1062		DROP(finish-start);	/* drop the operand */
1063		break;
1064	case REP(0, 1):			/* as x{1,1}? */
1065	case REP(0, N):			/* as x{1,n}? */
1066	case REP(0, INF):		/* as x{1,}? */
1067		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1068		INSERT(OCH_, start);		/* offset is wrong... */
1069		repeat(p, start+1, 1, to, reclimit);
1070		ASTERN(OOR1, start);
1071		AHEAD(start);			/* ... fix it */
1072		EMIT(OOR2, 0);
1073		AHEAD(THERE());
1074		ASTERN(O_CH, THERETHERE());
1075		break;
1076	case REP(1, 1):			/* trivial case */
1077		/* done */
1078		break;
1079	case REP(1, N):			/* as x?x{1,n-1} */
1080		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1081		INSERT(OCH_, start);
1082		ASTERN(OOR1, start);
1083		AHEAD(start);
1084		EMIT(OOR2, 0);			/* offset very wrong... */
1085		AHEAD(THERE());			/* ...so fix it */
1086		ASTERN(O_CH, THERETHERE());
1087		copy = dupl(p, start+1, finish+1);
1088		assert(copy == finish+4);
1089		repeat(p, copy, 1, to-1, reclimit);
1090		break;
1091	case REP(1, INF):		/* as x+ */
1092		INSERT(OPLUS_, start);
1093		ASTERN(O_PLUS, start);
1094		break;
1095	case REP(N, N):			/* as xx{m-1,n-1} */
1096		copy = dupl(p, start, finish);
1097		repeat(p, copy, from-1, to-1, reclimit);
1098		break;
1099	case REP(N, INF):		/* as xx{n-1,INF} */
1100		copy = dupl(p, start, finish);
1101		repeat(p, copy, from-1, to, reclimit);
1102		break;
1103	default:			/* "can't happen" */
1104		SETERROR(REG_ASSERT);	/* just in case */
1105		break;
1106	}
1107}
1108
1109/*
1110 - seterr - set an error condition
1111 == static int seterr(struct parse *p, int e);
1112 */
1113static int			/* useless but makes type checking happy */
1114seterr(struct parse *p, int e)
1115{
1116	if (p->error == 0)	/* keep earliest error condition */
1117		p->error = e;
1118	p->next = nuls;		/* try to bring things to a halt */
1119	p->end = nuls;
1120	return(0);		/* make the return value well-defined */
1121}
1122
1123/*
1124 - allocset - allocate a set of characters for []
1125 == static cset *allocset(struct parse *p);
1126 */
1127static cset *
1128allocset(struct parse *p)
1129{
1130	int no = p->g->ncsets++;
1131	size_t nc;
1132	size_t nbytes;
1133	cset *cs;
1134	size_t css = (size_t)p->g->csetsize;
1135	int i;
1136
1137	if (no >= p->ncsalloc) {	/* need another column of space */
1138		p->ncsalloc += CHAR_BIT;
1139		nc = p->ncsalloc;
1140		assert(nc % CHAR_BIT == 0);
1141		nbytes = nc / CHAR_BIT * css;
1142		if (MEMSIZE(p) > MEMLIMIT)
1143			goto oomem;
1144		if (p->g->sets == NULL)
1145			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1146		else
1147			p->g->sets = (cset *)realloc((char *)p->g->sets,
1148							nc * sizeof(cset));
1149		if (p->g->setbits == NULL)
1150			p->g->setbits = (uch *)malloc(nbytes);
1151		else {
1152			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1153								nbytes);
1154			/* xxx this isn't right if setbits is now NULL */
1155			for (i = 0; i < no; i++)
1156				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1157		}
1158		if (p->g->sets != NULL && p->g->setbits != NULL)
1159			(void) memset((char *)p->g->setbits + (nbytes - css),
1160								0, css);
1161		else {
1162oomem:
1163			no = 0;
1164			SETERROR(REG_ESPACE);
1165			/* caller's responsibility not to do set ops */
1166			return NULL;
1167		}
1168	}
1169
1170	cs = &p->g->sets[no];
1171	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1172	cs->mask = 1 << ((no) % CHAR_BIT);
1173	cs->hash = 0;
1174	cs->smultis = 0;
1175	cs->multis = NULL;
1176
1177	return(cs);
1178}
1179
1180/*
1181 - freeset - free a now-unused set
1182 == static void freeset(struct parse *p, cset *cs);
1183 */
1184static void
1185freeset(struct parse *p, cset *cs)
1186{
1187	size_t i;
1188	cset *top = &p->g->sets[p->g->ncsets];
1189	size_t css = (size_t)p->g->csetsize;
1190
1191	for (i = 0; i < css; i++)
1192		CHsub(cs, i);
1193	if (cs == top-1)	/* recover only the easy case */
1194		p->g->ncsets--;
1195}
1196
1197/*
1198 - freezeset - final processing on a set of characters
1199 == static int freezeset(struct parse *p, cset *cs);
1200 *
1201 * The main task here is merging identical sets.  This is usually a waste
1202 * of time (although the hash code minimizes the overhead), but can win
1203 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1204 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1205 * the same value!
1206 */
1207static int			/* set number */
1208freezeset(struct parse *p, cset *cs)
1209{
1210	uch h = cs->hash;
1211	size_t i;
1212	cset *top = &p->g->sets[p->g->ncsets];
1213	cset *cs2;
1214	size_t css = (size_t)p->g->csetsize;
1215
1216	/* look for an earlier one which is the same */
1217	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1218		if (cs2->hash == h && cs2 != cs) {
1219			/* maybe */
1220			for (i = 0; i < css; i++)
1221				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1222					break;		/* no */
1223			if (i == css)
1224				break;			/* yes */
1225		}
1226
1227	if (cs2 < top) {	/* found one */
1228		freeset(p, cs);
1229		cs = cs2;
1230	}
1231
1232	return((int)(cs - p->g->sets));
1233}
1234
1235/*
1236 - firstch - return first character in a set (which must have at least one)
1237 == static int firstch(struct parse *p, cset *cs);
1238 */
1239static int			/* character; there is no "none" value */
1240firstch(struct parse *p, cset *cs)
1241{
1242	size_t i;
1243	size_t css = (size_t)p->g->csetsize;
1244
1245	for (i = 0; i < css; i++)
1246		if (CHIN(cs, i))
1247			return((char)i);
1248	assert(never);
1249	return(0);		/* arbitrary */
1250}
1251
1252/*
1253 - nch - number of characters in a set
1254 == static int nch(struct parse *p, cset *cs);
1255 */
1256static int
1257nch(struct parse *p, cset *cs)
1258{
1259	size_t i;
1260	size_t css = (size_t)p->g->csetsize;
1261	int n = 0;
1262
1263	for (i = 0; i < css; i++)
1264		if (CHIN(cs, i))
1265			n++;
1266	return(n);
1267}
1268
1269/*
1270 - mcadd - add a collating element to a cset
1271 == static void mcadd(struct parse *p, cset *cs, \
1272 ==	char *cp);
1273 */
1274static void
1275mcadd(struct parse *p, cset *cs, const char *cp)
1276{
1277	size_t oldend = cs->smultis;
1278
1279	cs->smultis += strlen(cp) + 1;
1280	if (cs->multis == NULL)
1281		cs->multis = malloc(cs->smultis);
1282	else
1283		cs->multis = realloc(cs->multis, cs->smultis);
1284	if (cs->multis == NULL) {
1285		SETERROR(REG_ESPACE);
1286		return;
1287	}
1288
1289	(void) strcpy(cs->multis + oldend - 1, cp);
1290	cs->multis[cs->smultis - 1] = '\0';
1291}
1292
1293#ifdef notdef
1294/*
1295 - mcsub - subtract a collating element from a cset
1296 == static void mcsub(cset *cs, char *cp);
1297 */
1298static void
1299mcsub(cset *cs, char *cp)
1300{
1301	char *fp = mcfind(cs, cp);
1302	size_t len = strlen(fp);
1303
1304	assert(fp != NULL);
1305	(void) memmove(fp, fp + len + 1,
1306				cs->smultis - (fp + len + 1 - cs->multis));
1307	cs->smultis -= len;
1308
1309	if (cs->smultis == 0) {
1310		free(cs->multis);
1311		cs->multis = NULL;
1312		return;
1313	}
1314
1315	cs->multis = realloc(cs->multis, cs->smultis);
1316	assert(cs->multis != NULL);
1317}
1318
1319/*
1320 - mcin - is a collating element in a cset?
1321 == static int mcin(cset *cs, char *cp);
1322 */
1323static int
1324mcin(cset *cs, char *cp)
1325{
1326	return(mcfind(cs, cp) != NULL);
1327}
1328
1329/*
1330 - mcfind - find a collating element in a cset
1331 == static char *mcfind(cset *cs, char *cp);
1332 */
1333static char *
1334mcfind(cset *cs, char *cp)
1335{
1336	char *p;
1337
1338	if (cs->multis == NULL)
1339		return(NULL);
1340	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1341		if (strcmp(cp, p) == 0)
1342			return(p);
1343	return(NULL);
1344}
1345#endif
1346
1347/*
1348 - mcinvert - invert the list of collating elements in a cset
1349 == static void mcinvert(struct parse *p, cset *cs);
1350 *
1351 * This would have to know the set of possibilities.  Implementation
1352 * is deferred.
1353 */
1354static void
1355mcinvert(struct parse *p, cset *cs)
1356{
1357	assert(cs->multis == NULL);	/* xxx */
1358}
1359
1360/*
1361 - mccase - add case counterparts of the list of collating elements in a cset
1362 == static void mccase(struct parse *p, cset *cs);
1363 *
1364 * This would have to know the set of possibilities.  Implementation
1365 * is deferred.
1366 */
1367static void
1368mccase(struct parse *p, cset *cs)
1369{
1370	assert(cs->multis == NULL);	/* xxx */
1371}
1372
1373#ifdef notdef
1374/*
1375 - isinsets - is this character in any sets?
1376 == static int isinsets(struct re_guts *g, int c);
1377 */
1378static int			/* predicate */
1379isinsets(struct re_guts *g, int c)
1380{
1381	uch *col;
1382	int i;
1383	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1384	unsigned uc = (unsigned char)c;
1385
1386	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1387		if (col[uc] != 0)
1388			return(1);
1389	return(0);
1390}
1391
1392/*
1393 - samesets - are these two characters in exactly the same sets?
1394 == static int samesets(struct re_guts *g, int c1, int c2);
1395 */
1396static int			/* predicate */
1397samesets(struct re_guts *g, int c1, int c2)
1398{
1399	uch *col;
1400	int i;
1401	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1402	unsigned uc1 = (unsigned char)c1;
1403	unsigned uc2 = (unsigned char)c2;
1404
1405	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1406		if (col[uc1] != col[uc2])
1407			return(0);
1408	return(1);
1409}
1410#endif
1411
1412/*
1413 - categorize - sort out character categories
1414 == static void categorize(struct parse *p, struct re_guts *g);
1415 */
1416static void
1417categorize(struct parse *p, struct re_guts *g)
1418{
1419#ifdef notdef
1420	cat_t *cats = g->categories;
1421	int c;
1422	int c2;
1423	cat_t cat;
1424
1425	/* avoid making error situations worse */
1426	if (p->error != 0)
1427		return;
1428
1429	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1430		if (cats[c] == 0 && isinsets(g, c)) {
1431			cat = g->ncategories++;
1432			cats[c] = cat;
1433			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1434				if (cats[c2] == 0 && samesets(g, c, c2))
1435					cats[c2] = cat;
1436		}
1437#endif
1438}
1439
1440/*
1441 - dupl - emit a duplicate of a bunch of sops
1442 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1443 */
1444static sopno			/* start of duplicate */
1445dupl(struct parse *p, sopno start, sopno finish)
1446
1447            			/* from here */
1448             			/* to this less one */
1449{
1450	sopno ret = HERE();
1451	sopno len = finish - start;
1452
1453	assert(finish >= start);
1454	if (len == 0)
1455		return(ret);
1456	if (!enlarge(p, p->ssize + len))	/* this many unexpected additions */
1457		return ret;
1458	assert(p->ssize >= p->slen + len);
1459	(void) memcpy((char *)(p->strip + p->slen),
1460		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1461	(void) memcpy((char *)(p->stripdata + p->slen),
1462		(char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1463	p->slen += len;
1464	return(ret);
1465}
1466
1467/*
1468 - doemit - emit a strip operator
1469 == static void doemit(struct parse *p, sop op, size_t opnd);
1470 *
1471 * It might seem better to implement this as a macro with a function as
1472 * hard-case backup, but it's just too big and messy unless there are
1473 * some changes to the data structures.  Maybe later.
1474 */
1475static void
1476doemit(struct parse *p, sop op, size_t opnd)
1477{
1478	/* avoid making error situations worse */
1479	if (p->error != 0)
1480		return;
1481
1482	/* deal with oversize operands ("can't happen", more or less) */
1483	assert(opnd < 1);
1484
1485	/* deal with undersized strip */
1486	if (p->slen >= p->ssize)
1487		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1488			return;
1489
1490	/* finally, it's all reduced to the easy case */
1491	p->strip[p->slen] = op;
1492	p->stripdata[p->slen] = opnd;
1493	p->slen++;
1494}
1495
1496/*
1497 - doinsert - insert a sop into the strip
1498 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1499 */
1500static void
1501doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1502{
1503	sopno sn;
1504	sop s;
1505	RCHAR_T d;
1506	int i;
1507
1508	/* avoid making error situations worse */
1509	if (p->error != 0)
1510		return;
1511
1512	sn = HERE();
1513	EMIT(op, opnd);		/* do checks, ensure space */
1514	assert(HERE() == sn+1);
1515	s = p->strip[sn];
1516	d = p->stripdata[sn];
1517
1518	/* adjust paren pointers */
1519	assert(pos > 0);
1520	for (i = 1; i < NPAREN; i++) {
1521		if (p->pbegin[i] >= pos) {
1522			p->pbegin[i]++;
1523		}
1524		if (p->pend[i] >= pos) {
1525			p->pend[i]++;
1526		}
1527	}
1528
1529	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1530						(HERE()-pos-1)*sizeof(sop));
1531	memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1532						(HERE()-pos-1)*sizeof(RCHAR_T));
1533	p->strip[pos] = s;
1534	p->stripdata[pos] = d;
1535}
1536
1537/*
1538 - dofwd - complete a forward reference
1539 == static void dofwd(struct parse *p, sopno pos, sop value);
1540 */
1541static void
1542dofwd(struct parse *p, sopno pos, sop value)
1543{
1544	/* avoid making error situations worse */
1545	if (p->error != 0)
1546		return;
1547
1548	assert(value < 1);
1549	p->stripdata[pos] = value;
1550}
1551
1552/*
1553 - enlarge - enlarge the strip
1554 == static int enlarge(struct parse *p, sopno size);
1555 */
1556static int
1557enlarge(struct parse *p, sopno size)
1558{
1559	sop *sp;
1560	RCHAR_T *dp;
1561	sopno osize;
1562
1563	if (p->ssize >= size)
1564		return 1;
1565
1566	osize = p->ssize;
1567	p->ssize = size;
1568	if (MEMSIZE(p) > MEMLIMIT)
1569		goto oomem;
1570	sp = realloc(p->strip, p->ssize * sizeof(sop));
1571	if (sp == NULL)
1572		goto oomem;
1573	p->strip = sp;
1574	dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1575	if (dp == NULL) {
1576oomem:
1577		p->ssize = osize;
1578		SETERROR(REG_ESPACE);
1579		return 0;
1580	}
1581	p->stripdata = dp;
1582	return 1;
1583}
1584
1585/*
1586 - stripsnug - compact the strip
1587 == static void stripsnug(struct parse *p, struct re_guts *g);
1588 */
1589static void
1590stripsnug(struct parse *p, struct re_guts *g)
1591{
1592	g->nstates = p->slen;
1593	g->strip = (sop *)realloc((char *)p->strip,
1594	    p->slen * sizeof(sop));
1595	if (g->strip == NULL) {
1596		SETERROR(REG_ESPACE);
1597		g->strip = p->strip;
1598	}
1599	g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1600	    p->slen * sizeof(RCHAR_T));
1601	if (g->stripdata == NULL) {
1602		SETERROR(REG_ESPACE);
1603		g->stripdata = p->stripdata;
1604	}
1605}
1606
1607/*
1608 - findmust - fill in must and mlen with longest mandatory literal string
1609 == static void findmust(struct parse *p, struct re_guts *g);
1610 *
1611 * This algorithm could do fancy things like analyzing the operands of |
1612 * for common subsequences.  Someday.  This code is simple and finds most
1613 * of the interesting cases.
1614 *
1615 * Note that must and mlen got initialized during setup.
1616 */
1617static void
1618findmust(struct parse *p, struct re_guts *g)
1619{
1620	sop *scans;
1621	RCHAR_T *scand;
1622	sop *starts = 0;
1623	RCHAR_T *startd = NULL;
1624	sop *newstarts = 0;
1625	RCHAR_T *newstartd = NULL;
1626	sopno newlen;
1627	sop s;
1628	RCHAR_T d;
1629	RCHAR_T *cp;
1630	sopno i;
1631
1632	/* avoid making error situations worse */
1633	if (p->error != 0)
1634		return;
1635
1636	/* find the longest OCHAR sequence in strip */
1637	newlen = 0;
1638	scans = g->strip + 1;
1639	scand = g->stripdata + 1;
1640	do {
1641		s = *scans++;
1642		d = *scand++;
1643		switch (s) {
1644		case OCHAR:		/* sequence member */
1645			if (newlen == 0) {		/* new sequence */
1646				newstarts = scans - 1;
1647				newstartd = scand - 1;
1648			}
1649			newlen++;
1650			break;
1651		case OPLUS_:		/* things that don't break one */
1652		case OLPAREN:
1653		case ORPAREN:
1654			break;
1655		case OQUEST_:		/* things that must be skipped */
1656		case OCH_:
1657			scans--;
1658			scand--;
1659			do {
1660				scans += d;
1661				scand += d;
1662				s = *scans;
1663				d = *scand;
1664				/* assert() interferes w debug printouts */
1665				if (s != O_QUEST && s != O_CH && s != OOR2) {
1666					g->iflags |= BAD;
1667					return;
1668				}
1669			} while (s != O_QUEST && s != O_CH);
1670			/* fallthrough */
1671		default:		/* things that break a sequence */
1672			if (newlen > g->mlen) {		/* ends one */
1673				starts = newstarts;
1674				startd = newstartd;
1675				g->mlen = newlen;
1676			}
1677			newlen = 0;
1678			break;
1679		}
1680	} while (s != OEND);
1681
1682	if (g->mlen == 0)		/* there isn't one */
1683		return;
1684
1685	/* turn it into a character string */
1686	g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1687	if (g->must == NULL) {		/* argh; just forget it */
1688		g->mlen = 0;
1689		return;
1690	}
1691	cp = g->must;
1692	scans = starts;
1693	scand = startd;
1694	for (i = g->mlen; i > 0; i--) {
1695		for (;;) {
1696			s = *scans++;
1697			d = *scand++;
1698			if (s == OCHAR)
1699				break;
1700		}
1701		assert(cp < g->must + g->mlen);
1702		*cp++ = d;
1703	}
1704	assert(cp == g->must + g->mlen);
1705	*cp++ = '\0';		/* just on general principles */
1706}
1707
1708/*
1709 - pluscount - count + nesting
1710 == static sopno pluscount(struct parse *p, struct re_guts *g);
1711 */
1712static sopno			/* nesting depth */
1713pluscount(struct parse *p, struct re_guts *g)
1714{
1715	sop *scan;
1716	sop s;
1717	sopno plusnest = 0;
1718	sopno maxnest = 0;
1719
1720	if (p->error != 0)
1721		return(0);	/* there may not be an OEND */
1722
1723	scan = g->strip + 1;
1724	do {
1725		s = *scan++;
1726		switch (s) {
1727		case OPLUS_:
1728			plusnest++;
1729			break;
1730		case O_PLUS:
1731			if (plusnest > maxnest)
1732				maxnest = plusnest;
1733			plusnest--;
1734			break;
1735		}
1736	} while (s != OEND);
1737	if (plusnest != 0)
1738		g->iflags |= BAD;
1739	return(maxnest);
1740}
1741