regcomp.c revision 49094
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
38 */
39
40#if defined(LIBC_SCCS) && !defined(lint)
41static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
42#endif /* LIBC_SCCS and not lint */
43
44#include <sys/types.h>
45#include <stdio.h>
46#include <string.h>
47#include <ctype.h>
48#include <limits.h>
49#include <stdlib.h>
50#include <regex.h>
51
52#include "collate.h"
53
54#include "utils.h"
55#include "regex2.h"
56
57#include "cclass.h"
58#include "cname.h"
59
60/*
61 * parse structure, passed up and down to avoid global variables and
62 * other clumsinesses
63 */
64struct parse {
65	char *next;		/* next character in RE */
66	char *end;		/* end of string (-> NUL normally) */
67	int error;		/* has an error been seen? */
68	sop *strip;		/* malloced strip */
69	sopno ssize;		/* malloced strip size (allocated) */
70	sopno slen;		/* malloced strip length (used) */
71	int ncsalloc;		/* number of csets allocated */
72	struct re_guts *g;
73#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
74	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
75	sopno pend[NPAREN];	/* -> ) ([0] unused) */
76};
77
78/* ========= begin header generated by ./mkh ========= */
79#ifdef __cplusplus
80extern "C" {
81#endif
82
83/* === regcomp.c === */
84static void p_ere __P((struct parse *p, int stop));
85static void p_ere_exp __P((struct parse *p));
86static void p_str __P((struct parse *p));
87static void p_bre __P((struct parse *p, int end1, int end2));
88static int p_simp_re __P((struct parse *p, int starordinary));
89static int p_count __P((struct parse *p));
90static void p_bracket __P((struct parse *p));
91static void p_b_term __P((struct parse *p, cset *cs));
92static void p_b_cclass __P((struct parse *p, cset *cs));
93static void p_b_eclass __P((struct parse *p, cset *cs));
94static char p_b_symbol __P((struct parse *p));
95static char p_b_coll_elem __P((struct parse *p, int endc));
96static char othercase __P((int ch));
97static void bothcases __P((struct parse *p, int ch));
98static void ordinary __P((struct parse *p, int ch));
99static void nonnewline __P((struct parse *p));
100static void repeat __P((struct parse *p, sopno start, int from, int to));
101static int seterr __P((struct parse *p, int e));
102static cset *allocset __P((struct parse *p));
103static void freeset __P((struct parse *p, cset *cs));
104static int freezeset __P((struct parse *p, cset *cs));
105static int firstch __P((struct parse *p, cset *cs));
106static int nch __P((struct parse *p, cset *cs));
107static void mcadd __P((struct parse *p, cset *cs, char *cp));
108#if used
109static void mcsub __P((cset *cs, char *cp));
110static int mcin __P((cset *cs, char *cp));
111static char *mcfind __P((cset *cs, char *cp));
112#endif
113static void mcinvert __P((struct parse *p, cset *cs));
114static void mccase __P((struct parse *p, cset *cs));
115static int isinsets __P((struct re_guts *g, int c));
116static int samesets __P((struct re_guts *g, int c1, int c2));
117static void categorize __P((struct parse *p, struct re_guts *g));
118static sopno dupl __P((struct parse *p, sopno start, sopno finish));
119static void doemit __P((struct parse *p, sop op, size_t opnd));
120static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
121static void dofwd __P((struct parse *p, sopno pos, sop value));
122static void enlarge __P((struct parse *p, sopno size));
123static void stripsnug __P((struct parse *p, struct re_guts *g));
124static void findmust __P((struct parse *p, struct re_guts *g));
125static sopno pluscount __P((struct parse *p, struct re_guts *g));
126
127#ifdef __cplusplus
128}
129#endif
130/* ========= end header generated by ./mkh ========= */
131
132static char nuls[10];		/* place to point scanner in event of error */
133
134/*
135 * macros for use with parse structure
136 * BEWARE:  these know that the parse structure is named `p' !!!
137 */
138#define	PEEK()	(*p->next)
139#define	PEEK2()	(*(p->next+1))
140#define	MORE()	(p->next < p->end)
141#define	MORE2()	(p->next+1 < p->end)
142#define	SEE(c)	(MORE() && PEEK() == (c))
143#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
144#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
145#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
146#define	NEXT()	(p->next++)
147#define	NEXT2()	(p->next += 2)
148#define	NEXTn(n)	(p->next += (n))
149#define	GETNEXT()	(*p->next++)
150#define	SETERROR(e)	seterr(p, (e))
151#define	REQUIRE(co, e)	((co) || SETERROR(e))
152#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
153#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
154#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
155#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
156#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
157#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
158#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
159#define	HERE()		(p->slen)
160#define	THERE()		(p->slen - 1)
161#define	THERETHERE()	(p->slen - 2)
162#define	DROP(n)	(p->slen -= (n))
163
164#ifndef NDEBUG
165static int never = 0;		/* for use in asserts; shuts lint up */
166#else
167#define	never	0		/* some <assert.h>s have bugs too */
168#endif
169
170/*
171 - regcomp - interface for parser and compilation
172 = extern int regcomp(regex_t *, const char *, int);
173 = #define	REG_BASIC	0000
174 = #define	REG_EXTENDED	0001
175 = #define	REG_ICASE	0002
176 = #define	REG_NOSUB	0004
177 = #define	REG_NEWLINE	0010
178 = #define	REG_NOSPEC	0020
179 = #define	REG_PEND	0040
180 = #define	REG_DUMP	0200
181 */
182int				/* 0 success, otherwise REG_something */
183regcomp(preg, pattern, cflags)
184regex_t *preg;
185const char *pattern;
186int cflags;
187{
188	struct parse pa;
189	register struct re_guts *g;
190	register struct parse *p = &pa;
191	register int i;
192	register size_t len;
193#ifdef REDEBUG
194#	define	GOODFLAGS(f)	(f)
195#else
196#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
197#endif
198
199	cflags = GOODFLAGS(cflags);
200	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
201		return(REG_INVARG);
202
203	if (cflags&REG_PEND) {
204		if (preg->re_endp < pattern)
205			return(REG_INVARG);
206		len = preg->re_endp - pattern;
207	} else
208		len = strlen((char *)pattern);
209
210	/* do the mallocs early so failure handling is easy */
211	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
212							(NC-1)*sizeof(cat_t));
213	if (g == NULL)
214		return(REG_ESPACE);
215	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
216	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
217	p->slen = 0;
218	if (p->strip == NULL) {
219		free((char *)g);
220		return(REG_ESPACE);
221	}
222
223	/* set things up */
224	p->g = g;
225	p->next = (char *)pattern;	/* convenience; we do not modify it */
226	p->end = p->next + len;
227	p->error = 0;
228	p->ncsalloc = 0;
229	for (i = 0; i < NPAREN; i++) {
230		p->pbegin[i] = 0;
231		p->pend[i] = 0;
232	}
233	g->csetsize = NC;
234	g->sets = NULL;
235	g->setbits = NULL;
236	g->ncsets = 0;
237	g->cflags = cflags;
238	g->iflags = 0;
239	g->nbol = 0;
240	g->neol = 0;
241	g->must = NULL;
242	g->mlen = 0;
243	g->nsub = 0;
244	g->ncategories = 1;	/* category 0 is "everything else" */
245	g->categories = &g->catspace[-(CHAR_MIN)];
246	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
247	g->backrefs = 0;
248
249	/* do it */
250	EMIT(OEND, 0);
251	g->firststate = THERE();
252	if (cflags&REG_EXTENDED)
253		p_ere(p, OUT);
254	else if (cflags&REG_NOSPEC)
255		p_str(p);
256	else
257		p_bre(p, OUT, OUT);
258	EMIT(OEND, 0);
259	g->laststate = THERE();
260
261	/* tidy up loose ends and fill things in */
262	categorize(p, g);
263	stripsnug(p, g);
264	findmust(p, g);
265	g->nplus = pluscount(p, g);
266	g->magic = MAGIC2;
267	preg->re_nsub = g->nsub;
268	preg->re_g = g;
269	preg->re_magic = MAGIC1;
270#ifndef REDEBUG
271	/* not debugging, so can't rely on the assert() in regexec() */
272	if (g->iflags&BAD)
273		SETERROR(REG_ASSERT);
274#endif
275
276	/* win or lose, we're done */
277	if (p->error != 0)	/* lose */
278		regfree(preg);
279	return(p->error);
280}
281
282/*
283 - p_ere - ERE parser top level, concatenation and alternation
284 == static void p_ere(register struct parse *p, int stop);
285 */
286static void
287p_ere(p, stop)
288register struct parse *p;
289int stop;			/* character this ERE should end at */
290{
291	register char c;
292	register sopno prevback;
293	register sopno prevfwd;
294	register sopno conc;
295	register int first = 1;		/* is this the first alternative? */
296
297	for (;;) {
298		/* do a bunch of concatenated expressions */
299		conc = HERE();
300		while (MORE() && (c = PEEK()) != '|' && c != stop)
301			p_ere_exp(p);
302		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
303
304		if (!EAT('|'))
305			break;		/* NOTE BREAK OUT */
306
307		if (first) {
308			INSERT(OCH_, conc);	/* offset is wrong */
309			prevfwd = conc;
310			prevback = conc;
311			first = 0;
312		}
313		ASTERN(OOR1, prevback);
314		prevback = THERE();
315		AHEAD(prevfwd);			/* fix previous offset */
316		prevfwd = HERE();
317		EMIT(OOR2, 0);			/* offset is very wrong */
318	}
319
320	if (!first) {		/* tail-end fixups */
321		AHEAD(prevfwd);
322		ASTERN(O_CH, prevback);
323	}
324
325	assert(!MORE() || SEE(stop));
326}
327
328/*
329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
330 == static void p_ere_exp(register struct parse *p);
331 */
332static void
333p_ere_exp(p)
334register struct parse *p;
335{
336	register char c;
337	register sopno pos;
338	register int count;
339	register int count2;
340	register sopno subno;
341	int wascaret = 0;
342
343	assert(MORE());		/* caller should have ensured this */
344	c = GETNEXT();
345
346	pos = HERE();
347	switch (c) {
348	case '(':
349		(void)REQUIRE(MORE(), REG_EPAREN);
350		p->g->nsub++;
351		subno = p->g->nsub;
352		if (subno < NPAREN)
353			p->pbegin[subno] = HERE();
354		EMIT(OLPAREN, subno);
355		if (!SEE(')'))
356			p_ere(p, ')');
357		if (subno < NPAREN) {
358			p->pend[subno] = HERE();
359			assert(p->pend[subno] != 0);
360		}
361		EMIT(ORPAREN, subno);
362		(void)MUSTEAT(')', REG_EPAREN);
363		break;
364#ifndef POSIX_MISTAKE
365	case ')':		/* happens only if no current unmatched ( */
366		/*
367		 * You may ask, why the ifndef?  Because I didn't notice
368		 * this until slightly too late for 1003.2, and none of the
369		 * other 1003.2 regular-expression reviewers noticed it at
370		 * all.  So an unmatched ) is legal POSIX, at least until
371		 * we can get it fixed.
372		 */
373		SETERROR(REG_EPAREN);
374		break;
375#endif
376	case '^':
377		EMIT(OBOL, 0);
378		p->g->iflags |= USEBOL;
379		p->g->nbol++;
380		wascaret = 1;
381		break;
382	case '$':
383		EMIT(OEOL, 0);
384		p->g->iflags |= USEEOL;
385		p->g->neol++;
386		break;
387	case '|':
388		SETERROR(REG_EMPTY);
389		break;
390	case '*':
391	case '+':
392	case '?':
393		SETERROR(REG_BADRPT);
394		break;
395	case '.':
396		if (p->g->cflags&REG_NEWLINE)
397			nonnewline(p);
398		else
399			EMIT(OANY, 0);
400		break;
401	case '[':
402		p_bracket(p);
403		break;
404	case '\\':
405		(void)REQUIRE(MORE(), REG_EESCAPE);
406		c = GETNEXT();
407		ordinary(p, c);
408		break;
409	case '{':		/* okay as ordinary except if digit follows */
410		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
411		/* FALLTHROUGH */
412	default:
413		ordinary(p, c);
414		break;
415	}
416
417	if (!MORE())
418		return;
419	c = PEEK();
420	/* we call { a repetition if followed by a digit */
421	if (!( c == '*' || c == '+' || c == '?' ||
422				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
423		return;		/* no repetition, we're done */
424	NEXT();
425
426	(void)REQUIRE(!wascaret, REG_BADRPT);
427	switch (c) {
428	case '*':	/* implemented as +? */
429		/* this case does not require the (y|) trick, noKLUDGE */
430		INSERT(OPLUS_, pos);
431		ASTERN(O_PLUS, pos);
432		INSERT(OQUEST_, pos);
433		ASTERN(O_QUEST, pos);
434		break;
435	case '+':
436		INSERT(OPLUS_, pos);
437		ASTERN(O_PLUS, pos);
438		break;
439	case '?':
440		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
441		INSERT(OCH_, pos);		/* offset slightly wrong */
442		ASTERN(OOR1, pos);		/* this one's right */
443		AHEAD(pos);			/* fix the OCH_ */
444		EMIT(OOR2, 0);			/* offset very wrong... */
445		AHEAD(THERE());			/* ...so fix it */
446		ASTERN(O_CH, THERETHERE());
447		break;
448	case '{':
449		count = p_count(p);
450		if (EAT(',')) {
451			if (isdigit((uch)PEEK())) {
452				count2 = p_count(p);
453				(void)REQUIRE(count <= count2, REG_BADBR);
454			} else		/* single number with comma */
455				count2 = INFINITY;
456		} else		/* just a single number */
457			count2 = count;
458		repeat(p, pos, count, count2);
459		if (!EAT('}')) {	/* error heuristics */
460			while (MORE() && PEEK() != '}')
461				NEXT();
462			(void)REQUIRE(MORE(), REG_EBRACE);
463			SETERROR(REG_BADBR);
464		}
465		break;
466	}
467
468	if (!MORE())
469		return;
470	c = PEEK();
471	if (!( c == '*' || c == '+' || c == '?' ||
472				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
473		return;
474	SETERROR(REG_BADRPT);
475}
476
477/*
478 - p_str - string (no metacharacters) "parser"
479 == static void p_str(register struct parse *p);
480 */
481static void
482p_str(p)
483register struct parse *p;
484{
485	(void)REQUIRE(MORE(), REG_EMPTY);
486	while (MORE())
487		ordinary(p, GETNEXT());
488}
489
490/*
491 - p_bre - BRE parser top level, anchoring and concatenation
492 == static void p_bre(register struct parse *p, register int end1, \
493 ==	register int end2);
494 * Giving end1 as OUT essentially eliminates the end1/end2 check.
495 *
496 * This implementation is a bit of a kludge, in that a trailing $ is first
497 * taken as an ordinary character and then revised to be an anchor.  The
498 * only undesirable side effect is that '$' gets included as a character
499 * category in such cases.  This is fairly harmless; not worth fixing.
500 * The amount of lookahead needed to avoid this kludge is excessive.
501 */
502static void
503p_bre(p, end1, end2)
504register struct parse *p;
505register int end1;		/* first terminating character */
506register int end2;		/* second terminating character */
507{
508	register sopno start = HERE();
509	register int first = 1;			/* first subexpression? */
510	register int wasdollar = 0;
511
512	if (EAT('^')) {
513		EMIT(OBOL, 0);
514		p->g->iflags |= USEBOL;
515		p->g->nbol++;
516	}
517	while (MORE() && !SEETWO(end1, end2)) {
518		wasdollar = p_simp_re(p, first);
519		first = 0;
520	}
521	if (wasdollar) {	/* oops, that was a trailing anchor */
522		DROP(1);
523		EMIT(OEOL, 0);
524		p->g->iflags |= USEEOL;
525		p->g->neol++;
526	}
527
528	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
529}
530
531/*
532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
533 == static int p_simp_re(register struct parse *p, int starordinary);
534 */
535static int			/* was the simple RE an unbackslashed $? */
536p_simp_re(p, starordinary)
537register struct parse *p;
538int starordinary;		/* is a leading * an ordinary character? */
539{
540	register int c;
541	register int count;
542	register int count2;
543	register sopno pos;
544	register int i;
545	register sopno subno;
546#	define	BACKSL	(1<<CHAR_BIT)
547
548	pos = HERE();		/* repetion op, if any, covers from here */
549
550	assert(MORE());		/* caller should have ensured this */
551	c = GETNEXT();
552	if (c == '\\') {
553		(void)REQUIRE(MORE(), REG_EESCAPE);
554		c = BACKSL | GETNEXT();
555	}
556	switch (c) {
557	case '.':
558		if (p->g->cflags&REG_NEWLINE)
559			nonnewline(p);
560		else
561			EMIT(OANY, 0);
562		break;
563	case '[':
564		p_bracket(p);
565		break;
566	case BACKSL|'{':
567		SETERROR(REG_BADRPT);
568		break;
569	case BACKSL|'(':
570		p->g->nsub++;
571		subno = p->g->nsub;
572		if (subno < NPAREN)
573			p->pbegin[subno] = HERE();
574		EMIT(OLPAREN, subno);
575		/* the MORE here is an error heuristic */
576		if (MORE() && !SEETWO('\\', ')'))
577			p_bre(p, '\\', ')');
578		if (subno < NPAREN) {
579			p->pend[subno] = HERE();
580			assert(p->pend[subno] != 0);
581		}
582		EMIT(ORPAREN, subno);
583		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
584		break;
585	case BACKSL|')':	/* should not get here -- must be user */
586	case BACKSL|'}':
587		SETERROR(REG_EPAREN);
588		break;
589	case BACKSL|'1':
590	case BACKSL|'2':
591	case BACKSL|'3':
592	case BACKSL|'4':
593	case BACKSL|'5':
594	case BACKSL|'6':
595	case BACKSL|'7':
596	case BACKSL|'8':
597	case BACKSL|'9':
598		i = (c&~BACKSL) - '0';
599		assert(i < NPAREN);
600		if (p->pend[i] != 0) {
601			assert(i <= p->g->nsub);
602			EMIT(OBACK_, i);
603			assert(p->pbegin[i] != 0);
604			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
605			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
606			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
607			EMIT(O_BACK, i);
608		} else
609			SETERROR(REG_ESUBREG);
610		p->g->backrefs = 1;
611		break;
612	case '*':
613		(void)REQUIRE(starordinary, REG_BADRPT);
614		/* FALLTHROUGH */
615	default:
616		ordinary(p, (char)c);
617		break;
618	}
619
620	if (EAT('*')) {		/* implemented as +? */
621		/* this case does not require the (y|) trick, noKLUDGE */
622		INSERT(OPLUS_, pos);
623		ASTERN(O_PLUS, pos);
624		INSERT(OQUEST_, pos);
625		ASTERN(O_QUEST, pos);
626	} else if (EATTWO('\\', '{')) {
627		count = p_count(p);
628		if (EAT(',')) {
629			if (MORE() && isdigit((uch)PEEK())) {
630				count2 = p_count(p);
631				(void)REQUIRE(count <= count2, REG_BADBR);
632			} else		/* single number with comma */
633				count2 = INFINITY;
634		} else		/* just a single number */
635			count2 = count;
636		repeat(p, pos, count, count2);
637		if (!EATTWO('\\', '}')) {	/* error heuristics */
638			while (MORE() && !SEETWO('\\', '}'))
639				NEXT();
640			(void)REQUIRE(MORE(), REG_EBRACE);
641			SETERROR(REG_BADBR);
642		}
643	} else if (c == '$')     /* $ (but not \$) ends it */
644		return(1);
645
646	return(0);
647}
648
649/*
650 - p_count - parse a repetition count
651 == static int p_count(register struct parse *p);
652 */
653static int			/* the value */
654p_count(p)
655register struct parse *p;
656{
657	register int count = 0;
658	register int ndigits = 0;
659
660	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
661		count = count*10 + (GETNEXT() - '0');
662		ndigits++;
663	}
664
665	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
666	return(count);
667}
668
669/*
670 - p_bracket - parse a bracketed character list
671 == static void p_bracket(register struct parse *p);
672 *
673 * Note a significant property of this code:  if the allocset() did SETERROR,
674 * no set operations are done.
675 */
676static void
677p_bracket(p)
678register struct parse *p;
679{
680	register cset *cs = allocset(p);
681	register int invert = 0;
682
683	/* Dept of Truly Sickening Special-Case Kludges */
684	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
685		EMIT(OBOW, 0);
686		NEXTn(6);
687		return;
688	}
689	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
690		EMIT(OEOW, 0);
691		NEXTn(6);
692		return;
693	}
694
695	if (EAT('^'))
696		invert++;	/* make note to invert set at end */
697	if (EAT(']'))
698		CHadd(cs, ']');
699	else if (EAT('-'))
700		CHadd(cs, '-');
701	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
702		p_b_term(p, cs);
703	if (EAT('-'))
704		CHadd(cs, '-');
705	(void)MUSTEAT(']', REG_EBRACK);
706
707	if (p->error != 0)	/* don't mess things up further */
708		return;
709
710	if (p->g->cflags&REG_ICASE) {
711		register int i;
712		register int ci;
713
714		for (i = p->g->csetsize - 1; i >= 0; i--)
715			if (CHIN(cs, i) && isalpha(i)) {
716				ci = othercase(i);
717				if (ci != i)
718					CHadd(cs, ci);
719			}
720		if (cs->multis != NULL)
721			mccase(p, cs);
722	}
723	if (invert) {
724		register int i;
725
726		for (i = p->g->csetsize - 1; i >= 0; i--)
727			if (CHIN(cs, i))
728				CHsub(cs, i);
729			else
730				CHadd(cs, i);
731		if (p->g->cflags&REG_NEWLINE)
732			CHsub(cs, '\n');
733		if (cs->multis != NULL)
734			mcinvert(p, cs);
735	}
736
737	assert(cs->multis == NULL);		/* xxx */
738
739	if (nch(p, cs) == 1) {		/* optimize singleton sets */
740		ordinary(p, firstch(p, cs));
741		freeset(p, cs);
742	} else
743		EMIT(OANYOF, freezeset(p, cs));
744}
745
746/*
747 - p_b_term - parse one term of a bracketed character list
748 == static void p_b_term(register struct parse *p, register cset *cs);
749 */
750static void
751p_b_term(p, cs)
752register struct parse *p;
753register cset *cs;
754{
755	register char c;
756	register char start, finish;
757	register int i;
758
759	/* classify what we've got */
760	switch ((MORE()) ? PEEK() : '\0') {
761	case '[':
762		c = (MORE2()) ? PEEK2() : '\0';
763		break;
764	case '-':
765		SETERROR(REG_ERANGE);
766		return;			/* NOTE RETURN */
767		break;
768	default:
769		c = '\0';
770		break;
771	}
772
773	switch (c) {
774	case ':':		/* character class */
775		NEXT2();
776		(void)REQUIRE(MORE(), REG_EBRACK);
777		c = PEEK();
778		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
779		p_b_cclass(p, cs);
780		(void)REQUIRE(MORE(), REG_EBRACK);
781		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
782		break;
783	case '=':		/* equivalence class */
784		NEXT2();
785		(void)REQUIRE(MORE(), REG_EBRACK);
786		c = PEEK();
787		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
788		p_b_eclass(p, cs);
789		(void)REQUIRE(MORE(), REG_EBRACK);
790		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
791		break;
792	default:		/* symbol, ordinary character, or range */
793/* xxx revision needed for multichar stuff */
794		start = p_b_symbol(p);
795		if (SEE('-') && MORE2() && PEEK2() != ']') {
796			/* range */
797			NEXT();
798			if (EAT('-'))
799				finish = '-';
800			else
801				finish = p_b_symbol(p);
802		} else
803			finish = start;
804		if (start == finish)
805			CHadd(cs, start);
806		else {
807			if (__collate_load_error) {
808				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
809				for (i = (uch)start; i <= (uch)finish; i++)
810					CHadd(cs, i);
811			} else {
812				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
813				for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
814					if (   __collate_range_cmp(start, i) <= 0
815					    && __collate_range_cmp(i, finish) <= 0
816					   )
817						CHadd(cs, i);
818				}
819			}
820		}
821		break;
822	}
823}
824
825/*
826 - p_b_cclass - parse a character-class name and deal with it
827 == static void p_b_cclass(register struct parse *p, register cset *cs);
828 */
829static void
830p_b_cclass(p, cs)
831register struct parse *p;
832register cset *cs;
833{
834	register int c;
835	register char *sp = p->next;
836	register struct cclass *cp;
837	register size_t len;
838
839	while (MORE() && isalpha((uch)PEEK()))
840		NEXT();
841	len = p->next - sp;
842	for (cp = cclasses; cp->name != NULL; cp++)
843		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
844			break;
845	if (cp->name == NULL) {
846		/* oops, didn't find it */
847		SETERROR(REG_ECTYPE);
848		return;
849	}
850
851	switch (cp->fidx) {
852	case CALNUM:
853		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
854			if (isalnum((uch)c))
855				CHadd(cs, c);
856		break;
857	case CALPHA:
858		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
859			if (isalpha((uch)c))
860				CHadd(cs, c);
861		break;
862	case CBLANK:
863		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
864			if (isblank((uch)c))
865				CHadd(cs, c);
866		break;
867	case CCNTRL:
868		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
869			if (iscntrl((uch)c))
870				CHadd(cs, c);
871		break;
872	case CDIGIT:
873		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
874			if (isdigit((uch)c))
875				CHadd(cs, c);
876		break;
877	case CGRAPH:
878		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
879			if (isgraph((uch)c))
880				CHadd(cs, c);
881		break;
882	case CLOWER:
883		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
884			if (islower((uch)c))
885				CHadd(cs, c);
886		break;
887	case CPRINT:
888		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
889			if (isprint((uch)c))
890				CHadd(cs, c);
891		break;
892	case CPUNCT:
893		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
894			if (ispunct((uch)c))
895				CHadd(cs, c);
896		break;
897	case CSPACE:
898		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
899			if (isspace((uch)c))
900				CHadd(cs, c);
901		break;
902	case CUPPER:
903		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
904			if (isupper((uch)c))
905				CHadd(cs, c);
906		break;
907	case CXDIGIT:
908		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
909			if (isxdigit((uch)c))
910				CHadd(cs, c);
911		break;
912	}
913#if 0
914	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
915		MCadd(p, cs, u);
916#endif
917}
918
919/*
920 - p_b_eclass - parse an equivalence-class name and deal with it
921 == static void p_b_eclass(register struct parse *p, register cset *cs);
922 *
923 * This implementation is incomplete. xxx
924 */
925static void
926p_b_eclass(p, cs)
927register struct parse *p;
928register cset *cs;
929{
930	register char c;
931
932	c = p_b_coll_elem(p, '=');
933	CHadd(cs, c);
934}
935
936/*
937 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
938 == static char p_b_symbol(register struct parse *p);
939 */
940static char			/* value of symbol */
941p_b_symbol(p)
942register struct parse *p;
943{
944	register char value;
945
946	(void)REQUIRE(MORE(), REG_EBRACK);
947	if (!EATTWO('[', '.'))
948		return(GETNEXT());
949
950	/* collating symbol */
951	value = p_b_coll_elem(p, '.');
952	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
953	return(value);
954}
955
956/*
957 - p_b_coll_elem - parse a collating-element name and look it up
958 == static char p_b_coll_elem(register struct parse *p, int endc);
959 */
960static char			/* value of collating element */
961p_b_coll_elem(p, endc)
962register struct parse *p;
963int endc;			/* name ended by endc,']' */
964{
965	register char *sp = p->next;
966	register struct cname *cp;
967	register int len;
968
969	while (MORE() && !SEETWO(endc, ']'))
970		NEXT();
971	if (!MORE()) {
972		SETERROR(REG_EBRACK);
973		return(0);
974	}
975	len = p->next - sp;
976	for (cp = cnames; cp->name != NULL; cp++)
977		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
978			return(cp->code);	/* known name */
979	if (len == 1)
980		return(*sp);	/* single character */
981	SETERROR(REG_ECOLLATE);			/* neither */
982	return(0);
983}
984
985/*
986 - othercase - return the case counterpart of an alphabetic
987 == static char othercase(int ch);
988 */
989static char			/* if no counterpart, return ch */
990othercase(ch)
991int ch;
992{
993	ch = (uch)ch;
994	assert(isalpha(ch));
995	if (isupper(ch))
996		return(tolower(ch));
997	else if (islower(ch))
998		return(toupper(ch));
999	else			/* peculiar, but could happen */
1000		return(ch);
1001}
1002
1003/*
1004 - bothcases - emit a dualcase version of a two-case character
1005 == static void bothcases(register struct parse *p, int ch);
1006 *
1007 * Boy, is this implementation ever a kludge...
1008 */
1009static void
1010bothcases(p, ch)
1011register struct parse *p;
1012int ch;
1013{
1014	register char *oldnext = p->next;
1015	register char *oldend = p->end;
1016	char bracket[3];
1017
1018	ch = (uch)ch;
1019	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1020	p->next = bracket;
1021	p->end = bracket+2;
1022	bracket[0] = ch;
1023	bracket[1] = ']';
1024	bracket[2] = '\0';
1025	p_bracket(p);
1026	assert(p->next == bracket+2);
1027	p->next = oldnext;
1028	p->end = oldend;
1029}
1030
1031/*
1032 - ordinary - emit an ordinary character
1033 == static void ordinary(register struct parse *p, register int ch);
1034 */
1035static void
1036ordinary(p, ch)
1037register struct parse *p;
1038register int ch;
1039{
1040	register cat_t *cap = p->g->categories;
1041
1042	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1043		bothcases(p, ch);
1044	else {
1045		EMIT(OCHAR, (uch)ch);
1046		if (cap[ch] == 0)
1047			cap[ch] = p->g->ncategories++;
1048	}
1049}
1050
1051/*
1052 - nonnewline - emit REG_NEWLINE version of OANY
1053 == static void nonnewline(register struct parse *p);
1054 *
1055 * Boy, is this implementation ever a kludge...
1056 */
1057static void
1058nonnewline(p)
1059register struct parse *p;
1060{
1061	register char *oldnext = p->next;
1062	register char *oldend = p->end;
1063	char bracket[4];
1064
1065	p->next = bracket;
1066	p->end = bracket+3;
1067	bracket[0] = '^';
1068	bracket[1] = '\n';
1069	bracket[2] = ']';
1070	bracket[3] = '\0';
1071	p_bracket(p);
1072	assert(p->next == bracket+3);
1073	p->next = oldnext;
1074	p->end = oldend;
1075}
1076
1077/*
1078 - repeat - generate code for a bounded repetition, recursively if needed
1079 == static void repeat(register struct parse *p, sopno start, int from, int to);
1080 */
1081static void
1082repeat(p, start, from, to)
1083register struct parse *p;
1084sopno start;			/* operand from here to end of strip */
1085int from;			/* repeated from this number */
1086int to;				/* to this number of times (maybe INFINITY) */
1087{
1088	register sopno finish = HERE();
1089#	define	N	2
1090#	define	INF	3
1091#	define	REP(f, t)	((f)*8 + (t))
1092#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1093	register sopno copy;
1094
1095	if (p->error != 0)	/* head off possible runaway recursion */
1096		return;
1097
1098	assert(from <= to);
1099
1100	switch (REP(MAP(from), MAP(to))) {
1101	case REP(0, 0):			/* must be user doing this */
1102		DROP(finish-start);	/* drop the operand */
1103		break;
1104	case REP(0, 1):			/* as x{1,1}? */
1105	case REP(0, N):			/* as x{1,n}? */
1106	case REP(0, INF):		/* as x{1,}? */
1107		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1108		INSERT(OCH_, start);		/* offset is wrong... */
1109		repeat(p, start+1, 1, to);
1110		ASTERN(OOR1, start);
1111		AHEAD(start);			/* ... fix it */
1112		EMIT(OOR2, 0);
1113		AHEAD(THERE());
1114		ASTERN(O_CH, THERETHERE());
1115		break;
1116	case REP(1, 1):			/* trivial case */
1117		/* done */
1118		break;
1119	case REP(1, N):			/* as x?x{1,n-1} */
1120		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1121		INSERT(OCH_, start);
1122		ASTERN(OOR1, start);
1123		AHEAD(start);
1124		EMIT(OOR2, 0);			/* offset very wrong... */
1125		AHEAD(THERE());			/* ...so fix it */
1126		ASTERN(O_CH, THERETHERE());
1127		copy = dupl(p, start+1, finish+1);
1128		assert(copy == finish+4);
1129		repeat(p, copy, 1, to-1);
1130		break;
1131	case REP(1, INF):		/* as x+ */
1132		INSERT(OPLUS_, start);
1133		ASTERN(O_PLUS, start);
1134		break;
1135	case REP(N, N):			/* as xx{m-1,n-1} */
1136		copy = dupl(p, start, finish);
1137		repeat(p, copy, from-1, to-1);
1138		break;
1139	case REP(N, INF):		/* as xx{n-1,INF} */
1140		copy = dupl(p, start, finish);
1141		repeat(p, copy, from-1, to);
1142		break;
1143	default:			/* "can't happen" */
1144		SETERROR(REG_ASSERT);	/* just in case */
1145		break;
1146	}
1147}
1148
1149/*
1150 - seterr - set an error condition
1151 == static int seterr(register struct parse *p, int e);
1152 */
1153static int			/* useless but makes type checking happy */
1154seterr(p, e)
1155register struct parse *p;
1156int e;
1157{
1158	if (p->error == 0)	/* keep earliest error condition */
1159		p->error = e;
1160	p->next = nuls;		/* try to bring things to a halt */
1161	p->end = nuls;
1162	return(0);		/* make the return value well-defined */
1163}
1164
1165/*
1166 - allocset - allocate a set of characters for []
1167 == static cset *allocset(register struct parse *p);
1168 */
1169static cset *
1170allocset(p)
1171register struct parse *p;
1172{
1173	register int no = p->g->ncsets++;
1174	register size_t nc;
1175	register size_t nbytes;
1176	register cset *cs;
1177	register size_t css = (size_t)p->g->csetsize;
1178	register int i;
1179
1180	if (no >= p->ncsalloc) {	/* need another column of space */
1181		p->ncsalloc += CHAR_BIT;
1182		nc = p->ncsalloc;
1183		assert(nc % CHAR_BIT == 0);
1184		nbytes = nc / CHAR_BIT * css;
1185		if (p->g->sets == NULL)
1186			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1187		else
1188			p->g->sets = (cset *)reallocf((char *)p->g->sets,
1189							nc * sizeof(cset));
1190		if (p->g->setbits == NULL)
1191			p->g->setbits = (uch *)malloc(nbytes);
1192		else {
1193			p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
1194								nbytes);
1195			/* xxx this isn't right if setbits is now NULL */
1196			for (i = 0; i < no; i++)
1197				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1198		}
1199		if (p->g->sets != NULL && p->g->setbits != NULL)
1200			(void) memset((char *)p->g->setbits + (nbytes - css),
1201								0, css);
1202		else {
1203			no = 0;
1204			SETERROR(REG_ESPACE);
1205			/* caller's responsibility not to do set ops */
1206		}
1207	}
1208
1209	assert(p->g->sets != NULL);	/* xxx */
1210	cs = &p->g->sets[no];
1211	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1212	cs->mask = 1 << ((no) % CHAR_BIT);
1213	cs->hash = 0;
1214	cs->smultis = 0;
1215	cs->multis = NULL;
1216
1217	return(cs);
1218}
1219
1220/*
1221 - freeset - free a now-unused set
1222 == static void freeset(register struct parse *p, register cset *cs);
1223 */
1224static void
1225freeset(p, cs)
1226register struct parse *p;
1227register cset *cs;
1228{
1229	register int i;
1230	register cset *top = &p->g->sets[p->g->ncsets];
1231	register size_t css = (size_t)p->g->csetsize;
1232
1233	for (i = 0; i < css; i++)
1234		CHsub(cs, i);
1235	if (cs == top-1)	/* recover only the easy case */
1236		p->g->ncsets--;
1237}
1238
1239/*
1240 - freezeset - final processing on a set of characters
1241 == static int freezeset(register struct parse *p, register cset *cs);
1242 *
1243 * The main task here is merging identical sets.  This is usually a waste
1244 * of time (although the hash code minimizes the overhead), but can win
1245 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1246 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1247 * the same value!
1248 */
1249static int			/* set number */
1250freezeset(p, cs)
1251register struct parse *p;
1252register cset *cs;
1253{
1254	register short h = cs->hash;
1255	register int i;
1256	register cset *top = &p->g->sets[p->g->ncsets];
1257	register cset *cs2;
1258	register size_t css = (size_t)p->g->csetsize;
1259
1260	/* look for an earlier one which is the same */
1261	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1262		if (cs2->hash == h && cs2 != cs) {
1263			/* maybe */
1264			for (i = 0; i < css; i++)
1265				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1266					break;		/* no */
1267			if (i == css)
1268				break;			/* yes */
1269		}
1270
1271	if (cs2 < top) {	/* found one */
1272		freeset(p, cs);
1273		cs = cs2;
1274	}
1275
1276	return((int)(cs - p->g->sets));
1277}
1278
1279/*
1280 - firstch - return first character in a set (which must have at least one)
1281 == static int firstch(register struct parse *p, register cset *cs);
1282 */
1283static int			/* character; there is no "none" value */
1284firstch(p, cs)
1285register struct parse *p;
1286register cset *cs;
1287{
1288	register int i;
1289	register size_t css = (size_t)p->g->csetsize;
1290
1291	for (i = 0; i < css; i++)
1292		if (CHIN(cs, i))
1293			return((char)i);
1294	assert(never);
1295	return(0);		/* arbitrary */
1296}
1297
1298/*
1299 - nch - number of characters in a set
1300 == static int nch(register struct parse *p, register cset *cs);
1301 */
1302static int
1303nch(p, cs)
1304register struct parse *p;
1305register cset *cs;
1306{
1307	register int i;
1308	register size_t css = (size_t)p->g->csetsize;
1309	register int n = 0;
1310
1311	for (i = 0; i < css; i++)
1312		if (CHIN(cs, i))
1313			n++;
1314	return(n);
1315}
1316
1317/*
1318 - mcadd - add a collating element to a cset
1319 == static void mcadd(register struct parse *p, register cset *cs, \
1320 ==	register char *cp);
1321 */
1322static void
1323mcadd(p, cs, cp)
1324register struct parse *p;
1325register cset *cs;
1326register char *cp;
1327{
1328	register size_t oldend = cs->smultis;
1329
1330	cs->smultis += strlen(cp) + 1;
1331	if (cs->multis == NULL)
1332		cs->multis = malloc(cs->smultis);
1333	else
1334		cs->multis = reallocf(cs->multis, cs->smultis);
1335	if (cs->multis == NULL) {
1336		SETERROR(REG_ESPACE);
1337		return;
1338	}
1339
1340	(void) strcpy(cs->multis + oldend - 1, cp);
1341	cs->multis[cs->smultis - 1] = '\0';
1342}
1343
1344#if used
1345/*
1346 - mcsub - subtract a collating element from a cset
1347 == static void mcsub(register cset *cs, register char *cp);
1348 */
1349static void
1350mcsub(cs, cp)
1351register cset *cs;
1352register char *cp;
1353{
1354	register char *fp = mcfind(cs, cp);
1355	register size_t len = strlen(fp);
1356
1357	assert(fp != NULL);
1358	(void) memmove(fp, fp + len + 1,
1359				cs->smultis - (fp + len + 1 - cs->multis));
1360	cs->smultis -= len;
1361
1362	if (cs->smultis == 0) {
1363		free(cs->multis);
1364		cs->multis = NULL;
1365		return;
1366	}
1367
1368	cs->multis = reallocf(cs->multis, cs->smultis);
1369	assert(cs->multis != NULL);
1370}
1371
1372/*
1373 - mcin - is a collating element in a cset?
1374 == static int mcin(register cset *cs, register char *cp);
1375 */
1376static int
1377mcin(cs, cp)
1378register cset *cs;
1379register char *cp;
1380{
1381	return(mcfind(cs, cp) != NULL);
1382}
1383
1384/*
1385 - mcfind - find a collating element in a cset
1386 == static char *mcfind(register cset *cs, register char *cp);
1387 */
1388static char *
1389mcfind(cs, cp)
1390register cset *cs;
1391register char *cp;
1392{
1393	register char *p;
1394
1395	if (cs->multis == NULL)
1396		return(NULL);
1397	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1398		if (strcmp(cp, p) == 0)
1399			return(p);
1400	return(NULL);
1401}
1402#endif
1403
1404/*
1405 - mcinvert - invert the list of collating elements in a cset
1406 == static void mcinvert(register struct parse *p, register cset *cs);
1407 *
1408 * This would have to know the set of possibilities.  Implementation
1409 * is deferred.
1410 */
1411static void
1412mcinvert(p, cs)
1413register struct parse *p;
1414register cset *cs;
1415{
1416	assert(cs->multis == NULL);	/* xxx */
1417}
1418
1419/*
1420 - mccase - add case counterparts of the list of collating elements in a cset
1421 == static void mccase(register struct parse *p, register cset *cs);
1422 *
1423 * This would have to know the set of possibilities.  Implementation
1424 * is deferred.
1425 */
1426static void
1427mccase(p, cs)
1428register struct parse *p;
1429register cset *cs;
1430{
1431	assert(cs->multis == NULL);	/* xxx */
1432}
1433
1434/*
1435 - isinsets - is this character in any sets?
1436 == static int isinsets(register struct re_guts *g, int c);
1437 */
1438static int			/* predicate */
1439isinsets(g, c)
1440register struct re_guts *g;
1441int c;
1442{
1443	register uch *col;
1444	register int i;
1445	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1446	register unsigned uc = (uch)c;
1447
1448	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1449		if (col[uc] != 0)
1450			return(1);
1451	return(0);
1452}
1453
1454/*
1455 - samesets - are these two characters in exactly the same sets?
1456 == static int samesets(register struct re_guts *g, int c1, int c2);
1457 */
1458static int			/* predicate */
1459samesets(g, c1, c2)
1460register struct re_guts *g;
1461int c1;
1462int c2;
1463{
1464	register uch *col;
1465	register int i;
1466	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1467	register unsigned uc1 = (uch)c1;
1468	register unsigned uc2 = (uch)c2;
1469
1470	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1471		if (col[uc1] != col[uc2])
1472			return(0);
1473	return(1);
1474}
1475
1476/*
1477 - categorize - sort out character categories
1478 == static void categorize(struct parse *p, register struct re_guts *g);
1479 */
1480static void
1481categorize(p, g)
1482struct parse *p;
1483register struct re_guts *g;
1484{
1485	register cat_t *cats = g->categories;
1486	register int c;
1487	register int c2;
1488	register cat_t cat;
1489
1490	/* avoid making error situations worse */
1491	if (p->error != 0)
1492		return;
1493
1494	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1495		if (cats[c] == 0 && isinsets(g, c)) {
1496			cat = g->ncategories++;
1497			cats[c] = cat;
1498			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1499				if (cats[c2] == 0 && samesets(g, c, c2))
1500					cats[c2] = cat;
1501		}
1502}
1503
1504/*
1505 - dupl - emit a duplicate of a bunch of sops
1506 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1507 */
1508static sopno			/* start of duplicate */
1509dupl(p, start, finish)
1510register struct parse *p;
1511sopno start;			/* from here */
1512sopno finish;			/* to this less one */
1513{
1514	register sopno ret = HERE();
1515	register sopno len = finish - start;
1516
1517	assert(finish >= start);
1518	if (len == 0)
1519		return(ret);
1520	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1521	assert(p->ssize >= p->slen + len);
1522	(void) memcpy((char *)(p->strip + p->slen),
1523		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1524	p->slen += len;
1525	return(ret);
1526}
1527
1528/*
1529 - doemit - emit a strip operator
1530 == static void doemit(register struct parse *p, sop op, size_t opnd);
1531 *
1532 * It might seem better to implement this as a macro with a function as
1533 * hard-case backup, but it's just too big and messy unless there are
1534 * some changes to the data structures.  Maybe later.
1535 */
1536static void
1537doemit(p, op, opnd)
1538register struct parse *p;
1539sop op;
1540size_t opnd;
1541{
1542	/* avoid making error situations worse */
1543	if (p->error != 0)
1544		return;
1545
1546	/* deal with oversize operands ("can't happen", more or less) */
1547	assert(opnd < 1<<OPSHIFT);
1548
1549	/* deal with undersized strip */
1550	if (p->slen >= p->ssize)
1551		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1552	assert(p->slen < p->ssize);
1553
1554	/* finally, it's all reduced to the easy case */
1555	p->strip[p->slen++] = SOP(op, opnd);
1556}
1557
1558/*
1559 - doinsert - insert a sop into the strip
1560 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1561 */
1562static void
1563doinsert(p, op, opnd, pos)
1564register struct parse *p;
1565sop op;
1566size_t opnd;
1567sopno pos;
1568{
1569	register sopno sn;
1570	register sop s;
1571	register int i;
1572
1573	/* avoid making error situations worse */
1574	if (p->error != 0)
1575		return;
1576
1577	sn = HERE();
1578	EMIT(op, opnd);		/* do checks, ensure space */
1579	assert(HERE() == sn+1);
1580	s = p->strip[sn];
1581
1582	/* adjust paren pointers */
1583	assert(pos > 0);
1584	for (i = 1; i < NPAREN; i++) {
1585		if (p->pbegin[i] >= pos) {
1586			p->pbegin[i]++;
1587		}
1588		if (p->pend[i] >= pos) {
1589			p->pend[i]++;
1590		}
1591	}
1592
1593	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1594						(HERE()-pos-1)*sizeof(sop));
1595	p->strip[pos] = s;
1596}
1597
1598/*
1599 - dofwd - complete a forward reference
1600 == static void dofwd(register struct parse *p, sopno pos, sop value);
1601 */
1602static void
1603dofwd(p, pos, value)
1604register struct parse *p;
1605register sopno pos;
1606sop value;
1607{
1608	/* avoid making error situations worse */
1609	if (p->error != 0)
1610		return;
1611
1612	assert(value < 1<<OPSHIFT);
1613	p->strip[pos] = OP(p->strip[pos]) | value;
1614}
1615
1616/*
1617 - enlarge - enlarge the strip
1618 == static void enlarge(register struct parse *p, sopno size);
1619 */
1620static void
1621enlarge(p, size)
1622register struct parse *p;
1623register sopno size;
1624{
1625	register sop *sp;
1626
1627	if (p->ssize >= size)
1628		return;
1629
1630	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1631	if (sp == NULL) {
1632		SETERROR(REG_ESPACE);
1633		return;
1634	}
1635	p->strip = sp;
1636	p->ssize = size;
1637}
1638
1639/*
1640 - stripsnug - compact the strip
1641 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1642 */
1643static void
1644stripsnug(p, g)
1645register struct parse *p;
1646register struct re_guts *g;
1647{
1648	g->nstates = p->slen;
1649	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1650	if (g->strip == NULL) {
1651		SETERROR(REG_ESPACE);
1652		g->strip = p->strip;
1653	}
1654}
1655
1656/*
1657 - findmust - fill in must and mlen with longest mandatory literal string
1658 == static void findmust(register struct parse *p, register struct re_guts *g);
1659 *
1660 * This algorithm could do fancy things like analyzing the operands of |
1661 * for common subsequences.  Someday.  This code is simple and finds most
1662 * of the interesting cases.
1663 *
1664 * Note that must and mlen got initialized during setup.
1665 */
1666static void
1667findmust(p, g)
1668struct parse *p;
1669register struct re_guts *g;
1670{
1671	register sop *scan;
1672	sop *start;
1673	register sop *newstart;
1674	register sopno newlen;
1675	register sop s;
1676	register char *cp;
1677	register sopno i;
1678
1679	/* avoid making error situations worse */
1680	if (p->error != 0)
1681		return;
1682
1683	/* find the longest OCHAR sequence in strip */
1684	newlen = 0;
1685	scan = g->strip + 1;
1686	do {
1687		s = *scan++;
1688		switch (OP(s)) {
1689		case OCHAR:		/* sequence member */
1690			if (newlen == 0)		/* new sequence */
1691				newstart = scan - 1;
1692			newlen++;
1693			break;
1694		case OPLUS_:		/* things that don't break one */
1695		case OLPAREN:
1696		case ORPAREN:
1697			break;
1698		case OQUEST_:		/* things that must be skipped */
1699		case OCH_:
1700			scan--;
1701			do {
1702				scan += OPND(s);
1703				s = *scan;
1704				/* assert() interferes w debug printouts */
1705				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1706							OP(s) != OOR2) {
1707					g->iflags |= BAD;
1708					return;
1709				}
1710			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1711			/* fallthrough */
1712		default:		/* things that break a sequence */
1713			if (newlen > g->mlen) {		/* ends one */
1714				start = newstart;
1715				g->mlen = newlen;
1716			}
1717			newlen = 0;
1718			break;
1719		}
1720	} while (OP(s) != OEND);
1721
1722	if (g->mlen == 0)		/* there isn't one */
1723		return;
1724
1725	/* turn it into a character string */
1726	g->must = malloc((size_t)g->mlen + 1);
1727	if (g->must == NULL) {		/* argh; just forget it */
1728		g->mlen = 0;
1729		return;
1730	}
1731	cp = g->must;
1732	scan = start;
1733	for (i = g->mlen; i > 0; i--) {
1734		while (OP(s = *scan++) != OCHAR)
1735			continue;
1736		assert(cp < g->must + g->mlen);
1737		*cp++ = (char)OPND(s);
1738	}
1739	assert(cp == g->must + g->mlen);
1740	*cp++ = '\0';		/* just on general principles */
1741}
1742
1743/*
1744 - pluscount - count + nesting
1745 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1746 */
1747static sopno			/* nesting depth */
1748pluscount(p, g)
1749struct parse *p;
1750register struct re_guts *g;
1751{
1752	register sop *scan;
1753	register sop s;
1754	register sopno plusnest = 0;
1755	register sopno maxnest = 0;
1756
1757	if (p->error != 0)
1758		return(0);	/* there may not be an OEND */
1759
1760	scan = g->strip + 1;
1761	do {
1762		s = *scan++;
1763		switch (OP(s)) {
1764		case OPLUS_:
1765			plusnest++;
1766			break;
1767		case O_PLUS:
1768			if (plusnest > maxnest)
1769				maxnest = plusnest;
1770			plusnest--;
1771			break;
1772		}
1773	} while (OP(s) != OEND);
1774	if (plusnest != 0)
1775		g->iflags |= BAD;
1776	return(maxnest);
1777}
1778