1/*	$NetBSD: regcomp.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */
2
3/*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer of the University of Toronto.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	@(#)regcomp.c	8.4 (Berkeley) 3/19/94
40 */
41
42#if defined(LIBC_SCCS) && !defined(lint)
43static char sccsid[] = "@(#)regcomp.c	8.4 (Berkeley) 3/19/94";
44#endif /* LIBC_SCCS and not lint */
45
46#include <sys/types.h>
47#include <stdio.h>
48#include <string.h>
49#include <ctype.h>
50#include <limits.h>
51#include <stdlib.h>
52#include <regex.h>
53
54#include "utils.h"
55#include "regex2.h"
56
57#include "cclass.h"
58#include "cname.h"
59
60/*
61 * parse structure, passed up and down to avoid global variables and
62 * other clumsinesses
63 */
64struct parse {
65	RCHAR_T *next;		/* next character in RE */
66	RCHAR_T *end;		/* end of string (-> NUL normally) */
67	int error;		/* has an error been seen? */
68	sop *strip;		/* malloced strip */
69	RCHAR_T *stripdata;	/* malloced stripdata */
70	sopno ssize;		/* malloced strip size (allocated) */
71	sopno slen;		/* malloced strip length (used) */
72	int ncsalloc;		/* number of csets allocated */
73	struct re_guts *g;
74#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
75	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
76	sopno pend[NPAREN];	/* -> ) ([0] unused) */
77};
78
79/* ========= begin header generated by ./mkh ========= */
80#ifdef __cplusplus
81extern "C" {
82#endif
83
84/* === regcomp.c === */
85static void p_ere __P((struct parse *p, int stop, size_t reclimit));
86static void p_ere_exp __P((struct parse *p, size_t reclimit));
87static void p_str __P((struct parse *p));
88static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
89static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
90static int p_count __P((struct parse *p));
91static void p_bracket __P((struct parse *p));
92static void p_b_term __P((struct parse *p, cset *cs));
93static void p_b_cclass __P((struct parse *p, cset *cs));
94static void p_b_eclass __P((struct parse *p, cset *cs));
95static char p_b_symbol __P((struct parse *p));
96static char p_b_coll_elem __P((struct parse *p, int endc));
97static char othercase __P((int ch));
98static void bothcases __P((struct parse *p, int ch));
99static void ordinary __P((struct parse *p, int ch));
100static void nonnewline __P((struct parse *p));
101static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
102static int seterr __P((struct parse *p, int e));
103static cset *allocset __P((struct parse *p));
104static void freeset __P((struct parse *p, cset *cs));
105static int freezeset __P((struct parse *p, cset *cs));
106static int firstch __P((struct parse *p, cset *cs));
107static int nch __P((struct parse *p, cset *cs));
108static void mcadd __P((struct parse *p, cset *cs, const char *cp));
109#ifdef notdef
110static void mcsub __P((cset *cs, char *cp));
111static int mcin __P((cset *cs, char *cp));
112static char *mcfind __P((cset *cs, char *cp));
113#endif
114static void mcinvert __P((struct parse *p, cset *cs));
115static void mccase __P((struct parse *p, cset *cs));
116#ifdef notdef
117static int isinsets __P((struct re_guts *g, int c));
118static int samesets __P((struct re_guts *g, int c1, int c2));
119#endif
120static void categorize __P((struct parse *p, struct re_guts *g));
121static sopno dupl __P((struct parse *p, sopno start, sopno finish));
122static void doemit __P((struct parse *p, sop op, size_t opnd));
123static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
124static void dofwd __P((struct parse *p, sopno pos, sop value));
125static int enlarge __P((struct parse *p, sopno size));
126static void stripsnug __P((struct parse *p, struct re_guts *g));
127static void findmust __P((struct parse *p, struct re_guts *g));
128static sopno pluscount __P((struct parse *p, struct re_guts *g));
129
130#ifdef __cplusplus
131}
132#endif
133/* ========= end header generated by ./mkh ========= */
134
135static RCHAR_T nuls[10];		/* place to point scanner in event of error */
136
137/*
138 * macros for use with parse structure
139 * BEWARE:  these know that the parse structure is named `p' !!!
140 */
141#define	PEEK()	(*p->next)
142#define	PEEK2()	(*(p->next+1))
143#define	MORE()	(p->next < p->end)
144#define	MORE2()	(p->next+1 < p->end)
145#define	SEE(c)	(MORE() && PEEK() == (c))
146#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
147#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
148#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
149#define	NEXT()	(p->next++)
150#define	NEXT2()	(p->next += 2)
151#define	NEXTn(n)	(p->next += (n))
152#define	GETNEXT()	(*p->next++)
153#define	SETERROR(e)	seterr(p, (e))
154#define	REQUIRE(co, e)	((co) || SETERROR(e))
155#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
156#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
157#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
158#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
159#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
160#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
161#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
162#define	HERE()		(p->slen)
163#define	THERE()		(p->slen - 1)
164#define	THERETHERE()	(p->slen - 2)
165#define	DROP(n)	(p->slen -= (n))
166
167#ifndef NDEBUG
168static int never = 0;		/* for use in asserts; shuts lint up */
169#else
170#define	never	0		/* some <assert.h>s have bugs too */
171#endif
172
173#define	MEMLIMIT	0x8000000
174#define MEMSIZE(p) \
175	((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
176	(p)->ncsalloc * sizeof(cset) + \
177	(p)->ssize * sizeof(sop))
178#define	RECLIMIT	256
179
180/*
181 - regcomp - interface for parser and compilation
182 = extern int regcomp(regex_t *, const RCHAR_T *, int);
183 = #define	REG_BASIC	0000
184 = #define	REG_EXTENDED	0001
185 = #define	REG_ICASE	0002
186 = #define	REG_NOSUB	0004
187 = #define	REG_NEWLINE	0010
188 = #define	REG_NOSPEC	0020
189 = #define	REG_PEND	0040
190 = #define	REG_DUMP	0200
191 */
192int				/* 0 success, otherwise REG_something */
193regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
194{
195	struct parse pa;
196	register struct re_guts *g;
197	register struct parse *p = &pa;
198	register int i;
199	register size_t len;
200#ifdef REDEBUG
201#	define	GOODFLAGS(f)	(f)
202#else
203#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
204#endif
205
206	cflags = GOODFLAGS(cflags);
207	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
208		return(REG_INVARG);
209
210	if (cflags&REG_PEND) {
211		if (preg->re_endp < pattern)
212			return(REG_INVARG);
213		len = preg->re_endp - pattern;
214	} else
215		len = STRLEN(pattern);
216
217	/* do the mallocs early so failure handling is easy */
218	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
219							(NC-1)*sizeof(cat_t));
220	if (g == NULL)
221		return(REG_ESPACE);
222	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
223	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
224	if (p->strip == NULL) {
225		free((char *)g);
226		return(REG_ESPACE);
227	}
228	p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
229	if (p->stripdata == NULL) {
230		free((char *)p->strip);
231		free((char *)g);
232		return(REG_ESPACE);
233	}
234	p->slen = 0;
235
236	/* set things up */
237	p->g = g;
238	p->next = (RCHAR_T *)pattern;	/* convenience; we do not modify it */
239	p->end = p->next + len;
240	p->error = 0;
241	p->ncsalloc = 0;
242	for (i = 0; i < NPAREN; i++) {
243		p->pbegin[i] = 0;
244		p->pend[i] = 0;
245	}
246	g->csetsize = NC;
247	g->sets = NULL;
248	g->setbits = NULL;
249	g->ncsets = 0;
250	g->cflags = cflags;
251	g->iflags = 0;
252	g->nbol = 0;
253	g->neol = 0;
254	g->must = NULL;
255	g->mlen = 0;
256	g->nsub = 0;
257#if 0
258	g->ncategories = 1;	/* category 0 is "everything else" */
259	g->categories = &g->catspace[-(CHAR_MIN)];
260	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
261#endif
262	g->backrefs = 0;
263
264	/* do it */
265	EMIT(OEND, 0);
266	g->firststate = THERE();
267	if (cflags&REG_EXTENDED)
268		p_ere(p, OUT, 0);
269	else if (cflags&REG_NOSPEC)
270		p_str(p);
271	else
272		p_bre(p, OUT, OUT, 0);
273	EMIT(OEND, 0);
274	g->laststate = THERE();
275
276	/* tidy up loose ends and fill things in */
277	categorize(p, g);
278	stripsnug(p, g);
279	findmust(p, g);
280	g->nplus = pluscount(p, g);
281	g->magic = MAGIC2;
282	preg->re_nsub = g->nsub;
283	preg->re_g = g;
284	preg->re_magic = MAGIC1;
285#ifndef REDEBUG
286	/* not debugging, so can't rely on the assert() in regexec() */
287	if (g->iflags&BAD)
288		SETERROR(REG_ASSERT);
289#endif
290
291	/* win or lose, we're done */
292	if (p->error != 0)	/* lose */
293		regfree(preg);
294	return(p->error);
295}
296
297/*
298 - p_ere - ERE parser top level, concatenation and alternation
299 == static void p_ere(register struct parse *p, int stop, size_t reclimit);
300 */
301static void
302p_ere(register struct parse *p, int stop, size_t reclimit)
303
304         			/* character this ERE should end at */
305{
306	register char c;
307	register sopno prevback = 0;
308	register sopno prevfwd = 0;
309	register sopno conc;
310	register int first = 1;		/* is this the first alternative? */
311
312	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
313		p->error = REG_ESPACE;
314		return;
315	}
316
317	for (;;) {
318		/* do a bunch of concatenated expressions */
319		conc = HERE();
320		while (MORE() && (c = PEEK()) != '|' && c != stop)
321			p_ere_exp(p, reclimit);
322		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
323
324		if (!EAT('|'))
325			break;		/* NOTE BREAK OUT */
326
327		if (first) {
328			INSERT(OCH_, conc);	/* offset is wrong */
329			prevfwd = conc;
330			prevback = conc;
331			first = 0;
332		}
333		ASTERN(OOR1, prevback);
334		prevback = THERE();
335		AHEAD(prevfwd);			/* fix previous offset */
336		prevfwd = HERE();
337		EMIT(OOR2, 0);			/* offset is very wrong */
338	}
339
340	if (!first) {		/* tail-end fixups */
341		AHEAD(prevfwd);
342		ASTERN(O_CH, prevback);
343	}
344
345	assert(!MORE() || SEE(stop));
346}
347
348/*
349 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
350 == static void p_ere_exp(register struct parse *p);
351 */
352static void
353p_ere_exp(register struct parse *p, size_t reclimit)
354{
355	register char c;
356	register sopno pos;
357	register int count;
358	register int count2;
359	register sopno subno;
360	int wascaret = 0;
361
362	assert(MORE());		/* caller should have ensured this */
363	c = GETNEXT();
364
365	pos = HERE();
366	switch (c) {
367	case '(':
368		(void)REQUIRE(MORE(), REG_EPAREN);
369		p->g->nsub++;
370		subno = p->g->nsub;
371		if (subno < NPAREN)
372			p->pbegin[subno] = HERE();
373		EMIT(OLPAREN, subno);
374		if (!SEE(')'))
375			p_ere(p, ')', reclimit);
376		if (subno < NPAREN) {
377			p->pend[subno] = HERE();
378			assert(p->pend[subno] != 0);
379		}
380		EMIT(ORPAREN, subno);
381		(void)MUSTEAT(')', REG_EPAREN);
382		break;
383#ifndef POSIX_MISTAKE
384	case ')':		/* happens only if no current unmatched ( */
385		/*
386		 * You may ask, why the ifndef?  Because I didn't notice
387		 * this until slightly too late for 1003.2, and none of the
388		 * other 1003.2 regular-expression reviewers noticed it at
389		 * all.  So an unmatched ) is legal POSIX, at least until
390		 * we can get it fixed.
391		 */
392		SETERROR(REG_EPAREN);
393		break;
394#endif
395	case '^':
396		EMIT(OBOL, 0);
397		p->g->iflags |= USEBOL;
398		p->g->nbol++;
399		wascaret = 1;
400		break;
401	case '$':
402		EMIT(OEOL, 0);
403		p->g->iflags |= USEEOL;
404		p->g->neol++;
405		break;
406	case '|':
407		SETERROR(REG_EMPTY);
408		break;
409	case '*':
410	case '+':
411	case '?':
412		SETERROR(REG_BADRPT);
413		break;
414	case '.':
415		if (p->g->cflags&REG_NEWLINE)
416			nonnewline(p);
417		else
418			EMIT(OANY, 0);
419		break;
420	case '[':
421		p_bracket(p);
422		break;
423	case '\\':
424		(void)REQUIRE(MORE(), REG_EESCAPE);
425		c = GETNEXT();
426		ordinary(p, c);
427		break;
428	case '{':		/* okay as ordinary except if digit follows */
429		(void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
430		/* FALLTHROUGH */
431	default:
432		ordinary(p, c);
433		break;
434	}
435
436	if (!MORE())
437		return;
438	c = PEEK();
439	/* we call { a repetition if followed by a digit */
440	if (!( c == '*' || c == '+' || c == '?' ||
441				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
442		return;		/* no repetition, we're done */
443	NEXT();
444
445	(void)REQUIRE(!wascaret, REG_BADRPT);
446	switch (c) {
447	case '*':	/* implemented as +? */
448		/* this case does not require the (y|) trick, noKLUDGE */
449		INSERT(OPLUS_, pos);
450		ASTERN(O_PLUS, pos);
451		INSERT(OQUEST_, pos);
452		ASTERN(O_QUEST, pos);
453		break;
454	case '+':
455		INSERT(OPLUS_, pos);
456		ASTERN(O_PLUS, pos);
457		break;
458	case '?':
459		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460		INSERT(OCH_, pos);		/* offset slightly wrong */
461		ASTERN(OOR1, pos);		/* this one's right */
462		AHEAD(pos);			/* fix the OCH_ */
463		EMIT(OOR2, 0);			/* offset very wrong... */
464		AHEAD(THERE());			/* ...so fix it */
465		ASTERN(O_CH, THERETHERE());
466		break;
467	case '{':
468		count = p_count(p);
469		if (EAT(',')) {
470			if (ISDIGIT((UCHAR_T)PEEK())) {
471				count2 = p_count(p);
472				(void)REQUIRE(count <= count2, REG_BADBR);
473			} else		/* single number with comma */
474				count2 = INFINITY;
475		} else		/* just a single number */
476			count2 = count;
477		repeat(p, pos, count, count2, 0);
478		if (!EAT('}')) {	/* error heuristics */
479			while (MORE() && PEEK() != '}')
480				NEXT();
481			(void)REQUIRE(MORE(), REG_EBRACE);
482			SETERROR(REG_BADBR);
483		}
484		break;
485	}
486
487	if (!MORE())
488		return;
489	c = PEEK();
490	if (!( c == '*' || c == '+' || c == '?' ||
491				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
492		return;
493	SETERROR(REG_BADRPT);
494}
495
496/*
497 - p_str - string (no metacharacters) "parser"
498 == static void p_str(register struct parse *p);
499 */
500static void
501p_str(register struct parse *p)
502{
503	(void)REQUIRE(MORE(), REG_EMPTY);
504	while (MORE())
505		ordinary(p, GETNEXT());
506}
507
508/*
509 - p_bre - BRE parser top level, anchoring and concatenation
510 == static void p_bre(register struct parse *p, register int end1, \
511 ==	register int end2, size_t reclimit);
512 * Giving end1 as OUT essentially eliminates the end1/end2 check.
513 *
514 * This implementation is a bit of a kludge, in that a trailing $ is first
515 * taken as an ordinary character and then revised to be an anchor.  The
516 * only undesirable side effect is that '$' gets included as a character
517 * category in such cases.  This is fairly harmless; not worth fixing.
518 * The amount of lookahead needed to avoid this kludge is excessive.
519 */
520static void
521p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit)
522
523                  		/* first terminating character */
524                  		/* second terminating character */
525{
526	register sopno start;
527	register int first = 1;			/* first subexpression? */
528	register int wasdollar = 0;
529
530	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
531		p->error = REG_ESPACE;
532		return;
533	}
534
535	start = HERE();
536
537	if (EAT('^')) {
538		EMIT(OBOL, 0);
539		p->g->iflags |= USEBOL;
540		p->g->nbol++;
541	}
542	while (MORE() && !SEETWO(end1, end2)) {
543		wasdollar = p_simp_re(p, first, reclimit);
544		first = 0;
545	}
546	if (wasdollar) {	/* oops, that was a trailing anchor */
547		DROP(1);
548		EMIT(OEOL, 0);
549		p->g->iflags |= USEEOL;
550		p->g->neol++;
551	}
552
553	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
554}
555
556/*
557 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
558 == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit);
559 */
560static int			/* was the simple RE an unbackslashed $? */
561p_simp_re(register struct parse *p, int starordinary, size_t reclimit)
562
563                 		/* is a leading * an ordinary character? */
564{
565	register int c;
566	register int count;
567	register int count2;
568	register sopno pos;
569	register int i;
570	register sopno subno;
571	int backsl;
572
573	pos = HERE();		/* repetion op, if any, covers from here */
574
575	assert(MORE());		/* caller should have ensured this */
576	c = GETNEXT();
577	backsl = c == '\\';
578	if (backsl) {
579		(void)REQUIRE(MORE(), REG_EESCAPE);
580		c = (unsigned char)GETNEXT();
581		switch (c) {
582		case '{':
583			SETERROR(REG_BADRPT);
584			break;
585		case '(':
586			p->g->nsub++;
587			subno = p->g->nsub;
588			if (subno < NPAREN)
589				p->pbegin[subno] = HERE();
590			EMIT(OLPAREN, subno);
591			/* the MORE here is an error heuristic */
592			if (MORE() && !SEETWO('\\', ')'))
593				p_bre(p, '\\', ')', reclimit);
594			if (subno < NPAREN) {
595				p->pend[subno] = HERE();
596				assert(p->pend[subno] != 0);
597			}
598			EMIT(ORPAREN, subno);
599			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
600			break;
601		case ')':	/* should not get here -- must be user */
602		case '}':
603			SETERROR(REG_EPAREN);
604			break;
605		case '1':
606		case '2':
607		case '3':
608		case '4':
609		case '5':
610		case '6':
611		case '7':
612		case '8':
613		case '9':
614			i = c - '0';
615			assert(i < NPAREN);
616			if (p->pend[i] != 0) {
617				assert(i <= p->g->nsub);
618				EMIT(OBACK_, i);
619				assert(p->pbegin[i] != 0);
620				assert(p->strip[p->pbegin[i]] == OLPAREN);
621				assert(p->strip[p->pend[i]] == ORPAREN);
622				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
623				EMIT(O_BACK, i);
624			} else
625				SETERROR(REG_ESUBREG);
626			p->g->backrefs = 1;
627			break;
628		default:
629			ordinary(p, c);
630			break;
631		}
632	} else {
633		switch (c) {
634		case '.':
635			if (p->g->cflags&REG_NEWLINE)
636				nonnewline(p);
637			else
638				EMIT(OANY, 0);
639			break;
640		case '[':
641			p_bracket(p);
642			break;
643		case '*':
644			(void)REQUIRE(starordinary, REG_BADRPT);
645			/* FALLTHROUGH */
646		default:
647			ordinary(p, c);
648			break;
649		}
650	}
651
652	if (EAT('*')) {		/* implemented as +? */
653		/* this case does not require the (y|) trick, noKLUDGE */
654		INSERT(OPLUS_, pos);
655		ASTERN(O_PLUS, pos);
656		INSERT(OQUEST_, pos);
657		ASTERN(O_QUEST, pos);
658	} else if (EATTWO('\\', '{')) {
659		count = p_count(p);
660		if (EAT(',')) {
661			if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
662				count2 = p_count(p);
663				(void)REQUIRE(count <= count2, REG_BADBR);
664			} else		/* single number with comma */
665				count2 = INFINITY;
666		} else		/* just a single number */
667			count2 = count;
668		repeat(p, pos, count, count2, reclimit);
669		if (!EATTWO('\\', '}')) {	/* error heuristics */
670			while (MORE() && !SEETWO('\\', '}'))
671				NEXT();
672			(void)REQUIRE(MORE(), REG_EBRACE);
673			SETERROR(REG_BADBR);
674		}
675	} else if (!backsl && c == (unsigned char)'$')	/* $ (but not \$) ends it */
676		return(1);
677
678	return(0);
679}
680
681/*
682 - p_count - parse a repetition count
683 == static int p_count(register struct parse *p);
684 */
685static int			/* the value */
686p_count(register struct parse *p)
687{
688	register int count = 0;
689	register int ndigits = 0;
690
691	while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
692		count = count*10 + (GETNEXT() - '0');
693		ndigits++;
694	}
695
696	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
697	return(count);
698}
699
700/*
701 - p_bracket - parse a bracketed character list
702 == static void p_bracket(register struct parse *p);
703 *
704 * Note a significant property of this code:  if the allocset() did SETERROR,
705 * no set operations are done.
706 */
707static void
708p_bracket(register struct parse *p)
709{
710	register cset *cs;
711	register int invert = 0;
712	static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
713	static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
714
715	cs = allocset(p);
716	if (cs == NULL)
717		return;
718
719	/* Dept of Truly Sickening Special-Case Kludges */
720	if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
721		EMIT(OBOW, 0);
722		NEXTn(6);
723		return;
724	}
725	if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
726		EMIT(OEOW, 0);
727		NEXTn(6);
728		return;
729	}
730
731	if (EAT('^'))
732		invert++;	/* make note to invert set at end */
733	if (EAT(']'))
734		CHadd(cs, ']');
735	else if (EAT('-'))
736		CHadd(cs, '-');
737	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
738		p_b_term(p, cs);
739	if (EAT('-'))
740		CHadd(cs, '-');
741	(void)MUSTEAT(']', REG_EBRACK);
742
743	if (p->error != 0)	/* don't mess things up further */
744		return;
745
746	if (p->g->cflags&REG_ICASE) {
747		register int i;
748		register int ci;
749
750		for (i = p->g->csetsize - 1; i >= 0; i--)
751			if (CHIN(cs, i) && isalpha(i)) {
752				ci = othercase(i);
753				if (ci != i)
754					CHadd(cs, ci);
755			}
756		if (cs->multis != NULL)
757			mccase(p, cs);
758	}
759	if (invert) {
760		register int i;
761
762		for (i = p->g->csetsize - 1; i >= 0; i--)
763			if (CHIN(cs, i))
764				CHsub(cs, i);
765			else
766				CHadd(cs, i);
767		if (p->g->cflags&REG_NEWLINE)
768			CHsub(cs, '\n');
769		if (cs->multis != NULL)
770			mcinvert(p, cs);
771	}
772
773	assert(cs->multis == NULL);		/* xxx */
774
775	if (nch(p, cs) == 1) {		/* optimize singleton sets */
776		ordinary(p, firstch(p, cs));
777		freeset(p, cs);
778	} else
779		EMIT(OANYOF, freezeset(p, cs));
780}
781
782/*
783 - p_b_term - parse one term of a bracketed character list
784 == static void p_b_term(register struct parse *p, register cset *cs);
785 */
786static void
787p_b_term(register struct parse *p, register cset *cs)
788{
789	register char c;
790	register char start, finish;
791	register int i;
792
793	/* classify what we've got */
794	switch ((MORE()) ? PEEK() : '\0') {
795	case '[':
796		c = (MORE2()) ? PEEK2() : '\0';
797		break;
798	case '-':
799		SETERROR(REG_ERANGE);
800		return;			/* NOTE RETURN */
801		break;
802	default:
803		c = '\0';
804		break;
805	}
806
807	switch (c) {
808	case ':':		/* character class */
809		NEXT2();
810		(void)REQUIRE(MORE(), REG_EBRACK);
811		c = PEEK();
812		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
813		p_b_cclass(p, cs);
814		(void)REQUIRE(MORE(), REG_EBRACK);
815		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
816		break;
817	case '=':		/* equivalence class */
818		NEXT2();
819		(void)REQUIRE(MORE(), REG_EBRACK);
820		c = PEEK();
821		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
822		p_b_eclass(p, cs);
823		(void)REQUIRE(MORE(), REG_EBRACK);
824		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
825		break;
826	default:		/* symbol, ordinary character, or range */
827/* xxx revision needed for multichar stuff */
828		start = p_b_symbol(p);
829		if (SEE('-') && MORE2() && PEEK2() != ']') {
830			/* range */
831			NEXT();
832			if (EAT('-'))
833				finish = '-';
834			else
835				finish = p_b_symbol(p);
836		} else
837			finish = start;
838/* xxx what about signed chars here... */
839		(void)REQUIRE(start <= finish, REG_ERANGE);
840		for (i = start; i <= finish; i++)
841			CHadd(cs, i);
842		break;
843	}
844}
845
846/*
847 - p_b_cclass - parse a character-class name and deal with it
848 == static void p_b_cclass(register struct parse *p, register cset *cs);
849 */
850static void
851p_b_cclass(register struct parse *p, register cset *cs)
852{
853	register RCHAR_T *sp = p->next;
854	register struct cclass *cp;
855	register size_t len;
856	register const char *u;
857	register char c;
858
859	while (MORE() && isalpha(PEEK()))
860		NEXT();
861	len = p->next - sp;
862	for (cp = cclasses; cp->name != NULL; cp++)
863		if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
864			break;
865	if (cp->name == NULL) {
866		/* oops, didn't find it */
867		SETERROR(REG_ECTYPE);
868		return;
869	}
870
871	u = cp->chars;
872	while ((c = *u++) != '\0')
873		CHadd(cs, c);
874	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
875		MCadd(p, cs, u);
876}
877
878/*
879 - p_b_eclass - parse an equivalence-class name and deal with it
880 == static void p_b_eclass(register struct parse *p, register cset *cs);
881 *
882 * This implementation is incomplete. xxx
883 */
884static void
885p_b_eclass(register struct parse *p, register cset *cs)
886{
887	register char c;
888
889	c = p_b_coll_elem(p, '=');
890	CHadd(cs, c);
891}
892
893/*
894 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
895 == static char p_b_symbol(register struct parse *p);
896 */
897static char			/* value of symbol */
898p_b_symbol(register struct parse *p)
899{
900	register char value;
901
902	(void)REQUIRE(MORE(), REG_EBRACK);
903	if (!EATTWO('[', '.'))
904		return(GETNEXT());
905
906	/* collating symbol */
907	value = p_b_coll_elem(p, '.');
908	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
909	return(value);
910}
911
912/*
913 - p_b_coll_elem - parse a collating-element name and look it up
914 == static char p_b_coll_elem(register struct parse *p, int endc);
915 */
916static char			/* value of collating element */
917p_b_coll_elem(register struct parse *p, int endc)
918
919         			/* name ended by endc,']' */
920{
921	register RCHAR_T *sp = p->next;
922	register struct cname *cp;
923	register size_t len;
924
925	while (MORE() && !SEETWO(endc, ']'))
926		NEXT();
927	if (!MORE()) {
928		SETERROR(REG_EBRACK);
929		return(0);
930	}
931	len = p->next - sp;
932	for (cp = cnames; cp->name != NULL; cp++)
933		if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len))
934			return(cp->code);	/* known name */
935	if (len == 1)
936		return(*sp);	/* single character */
937	SETERROR(REG_ECOLLATE);			/* neither */
938	return(0);
939}
940
941/*
942 - othercase - return the case counterpart of an alphabetic
943 == static char othercase(int ch);
944 */
945static char			/* if no counterpart, return ch */
946othercase(int ch)
947{
948	assert(isalpha(ch));
949	if (isupper(ch))
950		return(tolower(ch));
951	else if (islower(ch))
952		return(toupper(ch));
953	else			/* peculiar, but could happen */
954		return(ch);
955}
956
957/*
958 - bothcases - emit a dualcase version of a two-case character
959 == static void bothcases(register struct parse *p, int ch);
960 *
961 * Boy, is this implementation ever a kludge...
962 */
963static void
964bothcases(register struct parse *p, int ch)
965{
966	register RCHAR_T *oldnext = p->next;
967	register RCHAR_T *oldend = p->end;
968	RCHAR_T bracket[3];
969
970	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
971	p->next = bracket;
972	p->end = bracket+2;
973	bracket[0] = ch;
974	bracket[1] = ']';
975	bracket[2] = '\0';
976	p_bracket(p);
977	assert(p->next == bracket+2);
978	p->next = oldnext;
979	p->end = oldend;
980}
981
982/*
983 - ordinary - emit an ordinary character
984 == static void ordinary(register struct parse *p, register int ch);
985 */
986static void
987ordinary(register struct parse *p, register int ch)
988{
989/*
990	register cat_t *cap = p->g->categories;
991*/
992
993	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
994		bothcases(p, ch);
995	else {
996		EMIT(OCHAR, (UCHAR_T)ch);
997/*
998		if (cap[ch] == 0)
999			cap[ch] = p->g->ncategories++;
1000*/
1001	}
1002}
1003
1004/*
1005 - nonnewline - emit REG_NEWLINE version of OANY
1006 == static void nonnewline(register struct parse *p);
1007 *
1008 * Boy, is this implementation ever a kludge...
1009 */
1010static void
1011nonnewline(register struct parse *p)
1012{
1013	register RCHAR_T *oldnext = p->next;
1014	register RCHAR_T *oldend = p->end;
1015	RCHAR_T bracket[4];
1016
1017	p->next = bracket;
1018	p->end = bracket+3;
1019	bracket[0] = '^';
1020	bracket[1] = '\n';
1021	bracket[2] = ']';
1022	bracket[3] = '\0';
1023	p_bracket(p);
1024	assert(p->next == bracket+3);
1025	p->next = oldnext;
1026	p->end = oldend;
1027}
1028
1029/*
1030 - repeat - generate code for a bounded repetition, recursively if needed
1031 == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit);
1032 */
1033static void
1034repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit)
1035
1036            			/* operand from here to end of strip */
1037         			/* repeated from this number */
1038       				/* to this number of times (maybe INFINITY) */
1039{
1040	register sopno finish;
1041#	define	N	2
1042#	define	INF	3
1043#	define	REP(f, t)	((f)*8 + (t))
1044#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1045	register sopno copy;
1046
1047	if (reclimit++ > RECLIMIT)
1048		p->error = REG_ESPACE;
1049	if (p->error)
1050		return;
1051
1052	finish = HERE();
1053
1054	assert(from <= to);
1055
1056	switch (REP(MAP(from), MAP(to))) {
1057	case REP(0, 0):			/* must be user doing this */
1058		DROP(finish-start);	/* drop the operand */
1059		break;
1060	case REP(0, 1):			/* as x{1,1}? */
1061	case REP(0, N):			/* as x{1,n}? */
1062	case REP(0, INF):		/* as x{1,}? */
1063		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1064		INSERT(OCH_, start);		/* offset is wrong... */
1065		repeat(p, start+1, 1, to, reclimit);
1066		ASTERN(OOR1, start);
1067		AHEAD(start);			/* ... fix it */
1068		EMIT(OOR2, 0);
1069		AHEAD(THERE());
1070		ASTERN(O_CH, THERETHERE());
1071		break;
1072	case REP(1, 1):			/* trivial case */
1073		/* done */
1074		break;
1075	case REP(1, N):			/* as x?x{1,n-1} */
1076		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1077		INSERT(OCH_, start);
1078		ASTERN(OOR1, start);
1079		AHEAD(start);
1080		EMIT(OOR2, 0);			/* offset very wrong... */
1081		AHEAD(THERE());			/* ...so fix it */
1082		ASTERN(O_CH, THERETHERE());
1083		copy = dupl(p, start+1, finish+1);
1084		assert(copy == finish+4);
1085		repeat(p, copy, 1, to-1, reclimit);
1086		break;
1087	case REP(1, INF):		/* as x+ */
1088		INSERT(OPLUS_, start);
1089		ASTERN(O_PLUS, start);
1090		break;
1091	case REP(N, N):			/* as xx{m-1,n-1} */
1092		copy = dupl(p, start, finish);
1093		repeat(p, copy, from-1, to-1, reclimit);
1094		break;
1095	case REP(N, INF):		/* as xx{n-1,INF} */
1096		copy = dupl(p, start, finish);
1097		repeat(p, copy, from-1, to, reclimit);
1098		break;
1099	default:			/* "can't happen" */
1100		SETERROR(REG_ASSERT);	/* just in case */
1101		break;
1102	}
1103}
1104
1105/*
1106 - seterr - set an error condition
1107 == static int seterr(register struct parse *p, int e);
1108 */
1109static int			/* useless but makes type checking happy */
1110seterr(register struct parse *p, int e)
1111{
1112	if (p->error == 0)	/* keep earliest error condition */
1113		p->error = e;
1114	p->next = nuls;		/* try to bring things to a halt */
1115	p->end = nuls;
1116	return(0);		/* make the return value well-defined */
1117}
1118
1119/*
1120 - allocset - allocate a set of characters for []
1121 == static cset *allocset(register struct parse *p);
1122 */
1123static cset *
1124allocset(register struct parse *p)
1125{
1126	register int no = p->g->ncsets++;
1127	register size_t nc;
1128	register size_t nbytes;
1129	register cset *cs;
1130	register size_t css = (size_t)p->g->csetsize;
1131	register int i;
1132
1133	if (no >= p->ncsalloc) {	/* need another column of space */
1134		p->ncsalloc += CHAR_BIT;
1135		nc = p->ncsalloc;
1136		assert(nc % CHAR_BIT == 0);
1137		nbytes = nc / CHAR_BIT * css;
1138		if (MEMSIZE(p) > MEMLIMIT)
1139			goto oomem;
1140		if (p->g->sets == NULL)
1141			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1142		else
1143			p->g->sets = (cset *)realloc((char *)p->g->sets,
1144							nc * sizeof(cset));
1145		if (p->g->setbits == NULL)
1146			p->g->setbits = (uch *)malloc(nbytes);
1147		else {
1148			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1149								nbytes);
1150			/* xxx this isn't right if setbits is now NULL */
1151			for (i = 0; i < no; i++)
1152				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1153		}
1154		if (p->g->sets != NULL && p->g->setbits != NULL)
1155			(void) memset((char *)p->g->setbits + (nbytes - css),
1156								0, css);
1157		else {
1158oomem:
1159			no = 0;
1160			SETERROR(REG_ESPACE);
1161			/* caller's responsibility not to do set ops */
1162			return NULL;
1163		}
1164	}
1165
1166	cs = &p->g->sets[no];
1167	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1168	cs->mask = 1 << ((no) % CHAR_BIT);
1169	cs->hash = 0;
1170	cs->smultis = 0;
1171	cs->multis = NULL;
1172
1173	return(cs);
1174}
1175
1176/*
1177 - freeset - free a now-unused set
1178 == static void freeset(register struct parse *p, register cset *cs);
1179 */
1180static void
1181freeset(register struct parse *p, register cset *cs)
1182{
1183	register size_t i;
1184	register cset *top = &p->g->sets[p->g->ncsets];
1185	register size_t css = (size_t)p->g->csetsize;
1186
1187	for (i = 0; i < css; i++)
1188		CHsub(cs, i);
1189	if (cs == top-1)	/* recover only the easy case */
1190		p->g->ncsets--;
1191}
1192
1193/*
1194 - freezeset - final processing on a set of characters
1195 == static int freezeset(register struct parse *p, register cset *cs);
1196 *
1197 * The main task here is merging identical sets.  This is usually a waste
1198 * of time (although the hash code minimizes the overhead), but can win
1199 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1200 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1201 * the same value!
1202 */
1203static int			/* set number */
1204freezeset(register struct parse *p, register cset *cs)
1205{
1206	register uch h = cs->hash;
1207	register size_t i;
1208	register cset *top = &p->g->sets[p->g->ncsets];
1209	register cset *cs2;
1210	register size_t css = (size_t)p->g->csetsize;
1211
1212	/* look for an earlier one which is the same */
1213	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1214		if (cs2->hash == h && cs2 != cs) {
1215			/* maybe */
1216			for (i = 0; i < css; i++)
1217				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1218					break;		/* no */
1219			if (i == css)
1220				break;			/* yes */
1221		}
1222
1223	if (cs2 < top) {	/* found one */
1224		freeset(p, cs);
1225		cs = cs2;
1226	}
1227
1228	return((int)(cs - p->g->sets));
1229}
1230
1231/*
1232 - firstch - return first character in a set (which must have at least one)
1233 == static int firstch(register struct parse *p, register cset *cs);
1234 */
1235static int			/* character; there is no "none" value */
1236firstch(register struct parse *p, register cset *cs)
1237{
1238	register size_t i;
1239	register size_t css = (size_t)p->g->csetsize;
1240
1241	for (i = 0; i < css; i++)
1242		if (CHIN(cs, i))
1243			return((char)i);
1244	assert(never);
1245	return(0);		/* arbitrary */
1246}
1247
1248/*
1249 - nch - number of characters in a set
1250 == static int nch(register struct parse *p, register cset *cs);
1251 */
1252static int
1253nch(register struct parse *p, register cset *cs)
1254{
1255	register size_t i;
1256	register size_t css = (size_t)p->g->csetsize;
1257	register int n = 0;
1258
1259	for (i = 0; i < css; i++)
1260		if (CHIN(cs, i))
1261			n++;
1262	return(n);
1263}
1264
1265/*
1266 - mcadd - add a collating element to a cset
1267 == static void mcadd(register struct parse *p, register cset *cs, \
1268 ==	register char *cp);
1269 */
1270static void
1271mcadd(register struct parse *p, register cset *cs, register const char *cp)
1272{
1273	register size_t oldend = cs->smultis;
1274	void *np;
1275
1276	cs->smultis += strlen(cp) + 1;
1277	np = realloc(cs->multis, cs->smultis);
1278	if (np == NULL) {
1279		if (cs->multis)
1280			free(cs->multis);
1281		cs->multis = NULL;
1282		SETERROR(REG_ESPACE);
1283		return;
1284	}
1285	cs->multis = np;
1286
1287	(void) strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1288}
1289
1290#ifdef notdef
1291/*
1292 - mcsub - subtract a collating element from a cset
1293 == static void mcsub(register cset *cs, register char *cp);
1294 */
1295static void
1296mcsub(register cset *cs, register char *cp)
1297{
1298	register char *fp = mcfind(cs, cp);
1299	register size_t len = strlen(fp);
1300
1301	assert(fp != NULL);
1302	(void) memmove(fp, fp + len + 1,
1303				cs->smultis - (fp + len + 1 - cs->multis));
1304	cs->smultis -= len;
1305
1306	if (cs->smultis == 0) {
1307		free(cs->multis);
1308		cs->multis = NULL;
1309		return;
1310	}
1311
1312	cs->multis = realloc(cs->multis, cs->smultis);
1313	assert(cs->multis != NULL);
1314}
1315
1316/*
1317 - mcin - is a collating element in a cset?
1318 == static int mcin(register cset *cs, register char *cp);
1319 */
1320static int
1321mcin(register cset *cs, register char *cp)
1322{
1323	return(mcfind(cs, cp) != NULL);
1324}
1325
1326/*
1327 - mcfind - find a collating element in a cset
1328 == static char *mcfind(register cset *cs, register char *cp);
1329 */
1330static char *
1331mcfind(register cset *cs, register char *cp)
1332{
1333	register char *p;
1334
1335	if (cs->multis == NULL)
1336		return(NULL);
1337	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1338		if (strcmp(cp, p) == 0)
1339			return(p);
1340	return(NULL);
1341}
1342#endif
1343
1344/*
1345 - mcinvert - invert the list of collating elements in a cset
1346 == static void mcinvert(register struct parse *p, register cset *cs);
1347 *
1348 * This would have to know the set of possibilities.  Implementation
1349 * is deferred.
1350 */
1351static void
1352mcinvert(register struct parse *p, register cset *cs)
1353{
1354	assert(cs->multis == NULL);	/* xxx */
1355}
1356
1357/*
1358 - mccase - add case counterparts of the list of collating elements in a cset
1359 == static void mccase(register struct parse *p, register cset *cs);
1360 *
1361 * This would have to know the set of possibilities.  Implementation
1362 * is deferred.
1363 */
1364static void
1365mccase(register struct parse *p, register cset *cs)
1366{
1367	assert(cs->multis == NULL);	/* xxx */
1368}
1369
1370#ifdef notdef
1371/*
1372 - isinsets - is this character in any sets?
1373 == static int isinsets(register struct re_guts *g, int c);
1374 */
1375static int			/* predicate */
1376isinsets(register struct re_guts *g, int c)
1377{
1378	register uch *col;
1379	register int i;
1380	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1381	register unsigned uc = (unsigned char)c;
1382
1383	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1384		if (col[uc] != 0)
1385			return(1);
1386	return(0);
1387}
1388
1389/*
1390 - samesets - are these two characters in exactly the same sets?
1391 == static int samesets(register struct re_guts *g, int c1, int c2);
1392 */
1393static int			/* predicate */
1394samesets(register struct re_guts *g, int c1, int c2)
1395{
1396	register uch *col;
1397	register int i;
1398	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1399	register unsigned uc1 = (unsigned char)c1;
1400	register unsigned uc2 = (unsigned char)c2;
1401
1402	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1403		if (col[uc1] != col[uc2])
1404			return(0);
1405	return(1);
1406}
1407#endif
1408
1409/*
1410 - categorize - sort out character categories
1411 == static void categorize(struct parse *p, register struct re_guts *g);
1412 */
1413static void
1414categorize(struct parse *p, register struct re_guts *g)
1415{
1416#ifdef notdef
1417	register cat_t *cats = g->categories;
1418	register int c;
1419	register int c2;
1420	register cat_t cat;
1421
1422	/* avoid making error situations worse */
1423	if (p->error != 0)
1424		return;
1425
1426	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1427		if (cats[c] == 0 && isinsets(g, c)) {
1428			cat = g->ncategories++;
1429			cats[c] = cat;
1430			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1431				if (cats[c2] == 0 && samesets(g, c, c2))
1432					cats[c2] = cat;
1433		}
1434#endif
1435}
1436
1437/*
1438 - dupl - emit a duplicate of a bunch of sops
1439 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1440 */
1441static sopno			/* start of duplicate */
1442dupl(register struct parse *p, sopno start, sopno finish)
1443
1444            			/* from here */
1445             			/* to this less one */
1446{
1447	register sopno ret = HERE();
1448	register sopno len = finish - start;
1449
1450	assert(finish >= start);
1451	if (len == 0)
1452		return(ret);
1453	if (!enlarge(p, p->ssize + len))	/* this many unexpected additions */
1454		return ret;
1455	assert(p->ssize >= p->slen + len);
1456	(void) memcpy((char *)(p->strip + p->slen),
1457		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1458	(void) memcpy((char *)(p->stripdata + p->slen),
1459		(char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1460	p->slen += len;
1461	return(ret);
1462}
1463
1464/*
1465 - doemit - emit a strip operator
1466 == static void doemit(register struct parse *p, sop op, size_t opnd);
1467 *
1468 * It might seem better to implement this as a macro with a function as
1469 * hard-case backup, but it's just too big and messy unless there are
1470 * some changes to the data structures.  Maybe later.
1471 */
1472static void
1473doemit(register struct parse *p, sop op, size_t opnd)
1474{
1475	/* avoid making error situations worse */
1476	if (p->error != 0)
1477		return;
1478
1479	/* deal with oversize operands ("can't happen", more or less) */
1480	assert(opnd < 1);
1481
1482	/* deal with undersized strip */
1483	if (p->slen >= p->ssize)
1484		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1485			return;
1486
1487	/* finally, it's all reduced to the easy case */
1488	p->strip[p->slen] = op;
1489	p->stripdata[p->slen] = opnd;
1490	p->slen++;
1491}
1492
1493/*
1494 - doinsert - insert a sop into the strip
1495 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1496 */
1497static void
1498doinsert(register struct parse *p, sop op, size_t opnd, sopno pos)
1499{
1500	register sopno sn;
1501	register sop s;
1502	register RCHAR_T d;
1503	register int i;
1504
1505	/* avoid making error situations worse */
1506	if (p->error != 0)
1507		return;
1508
1509	sn = HERE();
1510	EMIT(op, opnd);		/* do checks, ensure space */
1511	assert(HERE() == sn+1);
1512	s = p->strip[sn];
1513	d = p->stripdata[sn];
1514
1515	/* adjust paren pointers */
1516	assert(pos > 0);
1517	for (i = 1; i < NPAREN; i++) {
1518		if (p->pbegin[i] >= pos) {
1519			p->pbegin[i]++;
1520		}
1521		if (p->pend[i] >= pos) {
1522			p->pend[i]++;
1523		}
1524	}
1525
1526	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1527						(HERE()-pos-1)*sizeof(sop));
1528	memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1529						(HERE()-pos-1)*sizeof(RCHAR_T));
1530	p->strip[pos] = s;
1531	p->stripdata[pos] = d;
1532}
1533
1534/*
1535 - dofwd - complete a forward reference
1536 == static void dofwd(register struct parse *p, sopno pos, sop value);
1537 */
1538static void
1539dofwd(register struct parse *p, register sopno pos, sop value)
1540{
1541	/* avoid making error situations worse */
1542	if (p->error != 0)
1543		return;
1544
1545	assert(value < 1);
1546	p->stripdata[pos] = value;
1547}
1548
1549/*
1550 - enlarge - enlarge the strip
1551 == static int enlarge(register struct parse *p, sopno size);
1552 */
1553static int
1554enlarge(register struct parse *p, register sopno size)
1555{
1556	register sop *sp;
1557	register RCHAR_T *dp;
1558	sopno osize;
1559
1560	if (p->ssize >= size)
1561		return 1;
1562
1563	osize = p->ssize;
1564	p->ssize = size;
1565	if (MEMSIZE(p) > MEMLIMIT)
1566		goto oomem;
1567	sp = realloc(p->strip, p->ssize * sizeof(sop));
1568	if (sp == NULL)
1569		goto oomem;
1570	p->strip = sp;
1571	dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1572	if (dp == NULL) {
1573oomem:
1574		p->ssize = osize;
1575		SETERROR(REG_ESPACE);
1576		return 0;
1577	}
1578	p->stripdata = dp;
1579	return 1;
1580}
1581
1582/*
1583 - stripsnug - compact the strip
1584 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1585 */
1586static void
1587stripsnug(register struct parse *p, register struct re_guts *g)
1588{
1589	g->nstates = p->slen;
1590	g->strip = (sop *)realloc((char *)p->strip,
1591	    p->slen * sizeof(sop));
1592	if (g->strip == NULL) {
1593		SETERROR(REG_ESPACE);
1594		g->strip = p->strip;
1595	}
1596	g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1597	    p->slen * sizeof(RCHAR_T));
1598	if (g->stripdata == NULL) {
1599		SETERROR(REG_ESPACE);
1600		g->stripdata = p->stripdata;
1601	}
1602}
1603
1604/*
1605 - findmust - fill in must and mlen with longest mandatory literal string
1606 == static void findmust(register struct parse *p, register struct re_guts *g);
1607 *
1608 * This algorithm could do fancy things like analyzing the operands of |
1609 * for common subsequences.  Someday.  This code is simple and finds most
1610 * of the interesting cases.
1611 *
1612 * Note that must and mlen got initialized during setup.
1613 */
1614static void
1615findmust(struct parse *p, register struct re_guts *g)
1616{
1617	register sop *scans;
1618	register RCHAR_T *scand;
1619	sop *starts = 0;
1620	RCHAR_T *startd = NULL;
1621	register sop *newstarts = 0;
1622	register RCHAR_T *newstartd = NULL;
1623	register sopno newlen;
1624	register sop s;
1625	register RCHAR_T d;
1626	register RCHAR_T *cp;
1627	register sopno i;
1628
1629	/* avoid making error situations worse */
1630	if (p->error != 0)
1631		return;
1632
1633	/* find the longest OCHAR sequence in strip */
1634	newlen = 0;
1635	scans = g->strip + 1;
1636	scand = g->stripdata + 1;
1637	do {
1638		s = *scans++;
1639		d = *scand++;
1640		switch (s) {
1641		case OCHAR:		/* sequence member */
1642			if (newlen == 0) {		/* new sequence */
1643				newstarts = scans - 1;
1644				newstartd = scand - 1;
1645			}
1646			newlen++;
1647			break;
1648		case OPLUS_:		/* things that don't break one */
1649		case OLPAREN:
1650		case ORPAREN:
1651			break;
1652		case OQUEST_:		/* things that must be skipped */
1653		case OCH_:
1654			scans--;
1655			scand--;
1656			do {
1657				scans += d;
1658				scand += d;
1659				s = *scans;
1660				d = *scand;
1661				/* assert() interferes w debug printouts */
1662				if (s != O_QUEST && s != O_CH && s != OOR2) {
1663					g->iflags |= BAD;
1664					return;
1665				}
1666			} while (s != O_QUEST && s != O_CH);
1667			/* fallthrough */
1668		default:		/* things that break a sequence */
1669			if (newlen > g->mlen) {		/* ends one */
1670				starts = newstarts;
1671				startd = newstartd;
1672				g->mlen = newlen;
1673			}
1674			newlen = 0;
1675			break;
1676		}
1677	} while (s != OEND);
1678
1679	if (g->mlen == 0)		/* there isn't one */
1680		return;
1681
1682	/* turn it into a character string */
1683	g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1684	if (g->must == NULL) {		/* argh; just forget it */
1685		g->mlen = 0;
1686		return;
1687	}
1688	cp = g->must;
1689	scans = starts;
1690	scand = startd;
1691	for (i = g->mlen; i > 0; i--) {
1692		for (;;) {
1693			s = *scans++;
1694			d = *scand++;
1695			if (s == OCHAR)
1696				break;
1697		}
1698		assert(cp < g->must + g->mlen);
1699		*cp++ = d;
1700	}
1701	assert(cp == g->must + g->mlen);
1702	*cp++ = '\0';		/* just on general principles */
1703}
1704
1705/*
1706 - pluscount - count + nesting
1707 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1708 */
1709static sopno			/* nesting depth */
1710pluscount(struct parse *p, register struct re_guts *g)
1711{
1712	register sop *scan;
1713	register sop s;
1714	register sopno plusnest = 0;
1715	register sopno maxnest = 0;
1716
1717	if (p->error != 0)
1718		return(0);	/* there may not be an OEND */
1719
1720	scan = g->strip + 1;
1721	do {
1722		s = *scan++;
1723		switch (s) {
1724		case OPLUS_:
1725			plusnest++;
1726			break;
1727		case O_PLUS:
1728			if (plusnest > maxnest)
1729				maxnest = plusnest;
1730			plusnest--;
1731			break;
1732		}
1733	} while (s != OEND);
1734	if (plusnest != 0)
1735		g->iflags |= BAD;
1736	return(maxnest);
1737}
1738