regcomp.c revision 86208
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
38 *
39 * $FreeBSD: head/lib/libc/regex/regcomp.c 86208 2001-11-09 10:17:44Z dcs $
40 */
41
42#if defined(LIBC_SCCS) && !defined(lint)
43static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
44#endif /* LIBC_SCCS and not lint */
45
46#include <sys/types.h>
47#include <stdio.h>
48#include <string.h>
49#include <ctype.h>
50#include <limits.h>
51#include <stdlib.h>
52#include <regex.h>
53
54#include "collate.h"
55
56#include "utils.h"
57#include "regex2.h"
58
59#include "cclass.h"
60#include "cname.h"
61
62/*
63 * parse structure, passed up and down to avoid global variables and
64 * other clumsinesses
65 */
66struct parse {
67	char *next;		/* next character in RE */
68	char *end;		/* end of string (-> NUL normally) */
69	int error;		/* has an error been seen? */
70	sop *strip;		/* malloced strip */
71	sopno ssize;		/* malloced strip size (allocated) */
72	sopno slen;		/* malloced strip length (used) */
73	int ncsalloc;		/* number of csets allocated */
74	struct re_guts *g;
75#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
76	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
77	sopno pend[NPAREN];	/* -> ) ([0] unused) */
78};
79
80/* ========= begin header generated by ./mkh ========= */
81#ifdef __cplusplus
82extern "C" {
83#endif
84
85/* === regcomp.c === */
86static void p_ere __P((struct parse *p, int stop));
87static void p_ere_exp __P((struct parse *p));
88static void p_str __P((struct parse *p));
89static void p_bre __P((struct parse *p, int end1, int end2));
90static int p_simp_re __P((struct parse *p, int starordinary));
91static int p_count __P((struct parse *p));
92static void p_bracket __P((struct parse *p));
93static void p_b_term __P((struct parse *p, cset *cs));
94static void p_b_cclass __P((struct parse *p, cset *cs));
95static void p_b_eclass __P((struct parse *p, cset *cs));
96static char p_b_symbol __P((struct parse *p));
97static char p_b_coll_elem __P((struct parse *p, int endc));
98static char othercase __P((int ch));
99static void bothcases __P((struct parse *p, int ch));
100static void ordinary __P((struct parse *p, int ch));
101static void nonnewline __P((struct parse *p));
102static void repeat __P((struct parse *p, sopno start, int from, int to));
103static int seterr __P((struct parse *p, int e));
104static cset *allocset __P((struct parse *p));
105static void freeset __P((struct parse *p, cset *cs));
106static int freezeset __P((struct parse *p, cset *cs));
107static int firstch __P((struct parse *p, cset *cs));
108static int nch __P((struct parse *p, cset *cs));
109static void mcadd __P((struct parse *p, cset *cs, char *cp));
110#if used
111static void mcsub __P((cset *cs, char *cp));
112static int mcin __P((cset *cs, char *cp));
113static char *mcfind __P((cset *cs, char *cp));
114#endif
115static void mcinvert __P((struct parse *p, cset *cs));
116static void mccase __P((struct parse *p, cset *cs));
117static int isinsets __P((struct re_guts *g, int c));
118static int samesets __P((struct re_guts *g, int c1, int c2));
119static void categorize __P((struct parse *p, struct re_guts *g));
120static sopno dupl __P((struct parse *p, sopno start, sopno finish));
121static void doemit __P((struct parse *p, sop op, size_t opnd));
122static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
123static void dofwd __P((struct parse *p, sopno pos, sop value));
124static void enlarge __P((struct parse *p, sopno size));
125static void stripsnug __P((struct parse *p, struct re_guts *g));
126static void findmust __P((struct parse *p, struct re_guts *g));
127static int altoffset __P((sop *scan, int offset, int mccs));
128static void computejumps __P((struct parse *p, struct re_guts *g));
129static void computematchjumps __P((struct parse *p, struct re_guts *g));
130static sopno pluscount __P((struct parse *p, struct re_guts *g));
131
132#ifdef __cplusplus
133}
134#endif
135/* ========= end header generated by ./mkh ========= */
136
137static char nuls[10];		/* place to point scanner in event of error */
138
139/*
140 * macros for use with parse structure
141 * BEWARE:  these know that the parse structure is named `p' !!!
142 */
143#define	PEEK()	(*p->next)
144#define	PEEK2()	(*(p->next+1))
145#define	MORE()	(p->next < p->end)
146#define	MORE2()	(p->next+1 < p->end)
147#define	SEE(c)	(MORE() && PEEK() == (c))
148#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
149#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
150#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
151#define	NEXT()	(p->next++)
152#define	NEXT2()	(p->next += 2)
153#define	NEXTn(n)	(p->next += (n))
154#define	GETNEXT()	(*p->next++)
155#define	SETERROR(e)	seterr(p, (e))
156#define	REQUIRE(co, e)	((co) || SETERROR(e))
157#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
158#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
159#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
160#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
161#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
162#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
163#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
164#define	HERE()		(p->slen)
165#define	THERE()		(p->slen - 1)
166#define	THERETHERE()	(p->slen - 2)
167#define	DROP(n)	(p->slen -= (n))
168
169#ifndef NDEBUG
170static int never = 0;		/* for use in asserts; shuts lint up */
171#else
172#define	never	0		/* some <assert.h>s have bugs too */
173#endif
174
175/* Macro used by computejump()/computematchjump() */
176#define MIN(a,b)	((a)<(b)?(a):(b))
177
178/*
179 - regcomp - interface for parser and compilation
180 = extern int regcomp(regex_t *, const char *, int);
181 = #define	REG_BASIC	0000
182 = #define	REG_EXTENDED	0001
183 = #define	REG_ICASE	0002
184 = #define	REG_NOSUB	0004
185 = #define	REG_NEWLINE	0010
186 = #define	REG_NOSPEC	0020
187 = #define	REG_PEND	0040
188 = #define	REG_DUMP	0200
189 */
190int				/* 0 success, otherwise REG_something */
191regcomp(preg, pattern, cflags)
192regex_t *preg;
193const char *pattern;
194int cflags;
195{
196	struct parse pa;
197	register struct re_guts *g;
198	register struct parse *p = &pa;
199	register int i;
200	register size_t len;
201#ifdef REDEBUG
202#	define	GOODFLAGS(f)	(f)
203#else
204#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
205#endif
206
207	cflags = GOODFLAGS(cflags);
208	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
209		return(REG_INVARG);
210
211	if (cflags&REG_PEND) {
212		if (preg->re_endp < pattern)
213			return(REG_INVARG);
214		len = preg->re_endp - pattern;
215	} else
216		len = strlen((char *)pattern);
217
218	/* do the mallocs early so failure handling is easy */
219	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
220							(NC-1)*sizeof(cat_t));
221	if (g == NULL)
222		return(REG_ESPACE);
223	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
224	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
225	p->slen = 0;
226	if (p->strip == NULL) {
227		free((char *)g);
228		return(REG_ESPACE);
229	}
230
231	/* set things up */
232	p->g = g;
233	p->next = (char *)pattern;	/* convenience; we do not modify it */
234	p->end = p->next + len;
235	p->error = 0;
236	p->ncsalloc = 0;
237	for (i = 0; i < NPAREN; i++) {
238		p->pbegin[i] = 0;
239		p->pend[i] = 0;
240	}
241	g->csetsize = NC;
242	g->sets = NULL;
243	g->setbits = NULL;
244	g->ncsets = 0;
245	g->cflags = cflags;
246	g->iflags = 0;
247	g->nbol = 0;
248	g->neol = 0;
249	g->must = NULL;
250	g->moffset = -1;
251	g->charjump = NULL;
252	g->matchjump = NULL;
253	g->mlen = 0;
254	g->nsub = 0;
255	g->ncategories = 1;	/* category 0 is "everything else" */
256	g->categories = &g->catspace[-(CHAR_MIN)];
257	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
258	g->backrefs = 0;
259
260	/* do it */
261	EMIT(OEND, 0);
262	g->firststate = THERE();
263	if (cflags&REG_EXTENDED)
264		p_ere(p, OUT);
265	else if (cflags&REG_NOSPEC)
266		p_str(p);
267	else
268		p_bre(p, OUT, OUT);
269	EMIT(OEND, 0);
270	g->laststate = THERE();
271
272	/* tidy up loose ends and fill things in */
273	categorize(p, g);
274	stripsnug(p, g);
275	findmust(p, g);
276	/* only use Boyer-Moore algorithm if the pattern is bigger
277	 * than three characters
278	 */
279	if(g->mlen > 3) {
280		computejumps(p, g);
281		computematchjumps(p, g);
282		if(g->matchjump == NULL && g->charjump != NULL) {
283			free(g->charjump);
284			g->charjump = NULL;
285		}
286	}
287	g->nplus = pluscount(p, g);
288	g->magic = MAGIC2;
289	preg->re_nsub = g->nsub;
290	preg->re_g = g;
291	preg->re_magic = MAGIC1;
292#ifndef REDEBUG
293	/* not debugging, so can't rely on the assert() in regexec() */
294	if (g->iflags&BAD)
295		SETERROR(REG_ASSERT);
296#endif
297
298	/* win or lose, we're done */
299	if (p->error != 0)	/* lose */
300		regfree(preg);
301	return(p->error);
302}
303
304/*
305 - p_ere - ERE parser top level, concatenation and alternation
306 == static void p_ere(register struct parse *p, int stop);
307 */
308static void
309p_ere(p, stop)
310register struct parse *p;
311int stop;			/* character this ERE should end at */
312{
313	register char c;
314	register sopno prevback;
315	register sopno prevfwd;
316	register sopno conc;
317	register int first = 1;		/* is this the first alternative? */
318
319	for (;;) {
320		/* do a bunch of concatenated expressions */
321		conc = HERE();
322		while (MORE() && (c = PEEK()) != '|' && c != stop)
323			p_ere_exp(p);
324		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
325
326		if (!EAT('|'))
327			break;		/* NOTE BREAK OUT */
328
329		if (first) {
330			INSERT(OCH_, conc);	/* offset is wrong */
331			prevfwd = conc;
332			prevback = conc;
333			first = 0;
334		}
335		ASTERN(OOR1, prevback);
336		prevback = THERE();
337		AHEAD(prevfwd);			/* fix previous offset */
338		prevfwd = HERE();
339		EMIT(OOR2, 0);			/* offset is very wrong */
340	}
341
342	if (!first) {		/* tail-end fixups */
343		AHEAD(prevfwd);
344		ASTERN(O_CH, prevback);
345	}
346
347	assert(!MORE() || SEE(stop));
348}
349
350/*
351 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
352 == static void p_ere_exp(register struct parse *p);
353 */
354static void
355p_ere_exp(p)
356register struct parse *p;
357{
358	register char c;
359	register sopno pos;
360	register int count;
361	register int count2;
362	register sopno subno;
363	int wascaret = 0;
364
365	assert(MORE());		/* caller should have ensured this */
366	c = GETNEXT();
367
368	pos = HERE();
369	switch (c) {
370	case '(':
371		(void)REQUIRE(MORE(), REG_EPAREN);
372		p->g->nsub++;
373		subno = p->g->nsub;
374		if (subno < NPAREN)
375			p->pbegin[subno] = HERE();
376		EMIT(OLPAREN, subno);
377		if (!SEE(')'))
378			p_ere(p, ')');
379		if (subno < NPAREN) {
380			p->pend[subno] = HERE();
381			assert(p->pend[subno] != 0);
382		}
383		EMIT(ORPAREN, subno);
384		(void)MUSTEAT(')', REG_EPAREN);
385		break;
386#ifndef POSIX_MISTAKE
387	case ')':		/* happens only if no current unmatched ( */
388		/*
389		 * You may ask, why the ifndef?  Because I didn't notice
390		 * this until slightly too late for 1003.2, and none of the
391		 * other 1003.2 regular-expression reviewers noticed it at
392		 * all.  So an unmatched ) is legal POSIX, at least until
393		 * we can get it fixed.
394		 */
395		SETERROR(REG_EPAREN);
396		break;
397#endif
398	case '^':
399		EMIT(OBOL, 0);
400		p->g->iflags |= USEBOL;
401		p->g->nbol++;
402		wascaret = 1;
403		break;
404	case '$':
405		EMIT(OEOL, 0);
406		p->g->iflags |= USEEOL;
407		p->g->neol++;
408		break;
409	case '|':
410		SETERROR(REG_EMPTY);
411		break;
412	case '*':
413	case '+':
414	case '?':
415		SETERROR(REG_BADRPT);
416		break;
417	case '.':
418		if (p->g->cflags&REG_NEWLINE)
419			nonnewline(p);
420		else
421			EMIT(OANY, 0);
422		break;
423	case '[':
424		p_bracket(p);
425		break;
426	case '\\':
427		(void)REQUIRE(MORE(), REG_EESCAPE);
428		c = GETNEXT();
429		ordinary(p, c);
430		break;
431	case '{':		/* okay as ordinary except if digit follows */
432		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
433		/* FALLTHROUGH */
434	default:
435		ordinary(p, c);
436		break;
437	}
438
439	if (!MORE())
440		return;
441	c = PEEK();
442	/* we call { a repetition if followed by a digit */
443	if (!( c == '*' || c == '+' || c == '?' ||
444				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
445		return;		/* no repetition, we're done */
446	NEXT();
447
448	(void)REQUIRE(!wascaret, REG_BADRPT);
449	switch (c) {
450	case '*':	/* implemented as +? */
451		/* this case does not require the (y|) trick, noKLUDGE */
452		INSERT(OPLUS_, pos);
453		ASTERN(O_PLUS, pos);
454		INSERT(OQUEST_, pos);
455		ASTERN(O_QUEST, pos);
456		break;
457	case '+':
458		INSERT(OPLUS_, pos);
459		ASTERN(O_PLUS, pos);
460		break;
461	case '?':
462		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
463		INSERT(OCH_, pos);		/* offset slightly wrong */
464		ASTERN(OOR1, pos);		/* this one's right */
465		AHEAD(pos);			/* fix the OCH_ */
466		EMIT(OOR2, 0);			/* offset very wrong... */
467		AHEAD(THERE());			/* ...so fix it */
468		ASTERN(O_CH, THERETHERE());
469		break;
470	case '{':
471		count = p_count(p);
472		if (EAT(',')) {
473			if (isdigit((uch)PEEK())) {
474				count2 = p_count(p);
475				(void)REQUIRE(count <= count2, REG_BADBR);
476			} else		/* single number with comma */
477				count2 = INFINITY;
478		} else		/* just a single number */
479			count2 = count;
480		repeat(p, pos, count, count2);
481		if (!EAT('}')) {	/* error heuristics */
482			while (MORE() && PEEK() != '}')
483				NEXT();
484			(void)REQUIRE(MORE(), REG_EBRACE);
485			SETERROR(REG_BADBR);
486		}
487		break;
488	}
489
490	if (!MORE())
491		return;
492	c = PEEK();
493	if (!( c == '*' || c == '+' || c == '?' ||
494				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
495		return;
496	SETERROR(REG_BADRPT);
497}
498
499/*
500 - p_str - string (no metacharacters) "parser"
501 == static void p_str(register struct parse *p);
502 */
503static void
504p_str(p)
505register struct parse *p;
506{
507	(void)REQUIRE(MORE(), REG_EMPTY);
508	while (MORE())
509		ordinary(p, GETNEXT());
510}
511
512/*
513 - p_bre - BRE parser top level, anchoring and concatenation
514 == static void p_bre(register struct parse *p, register int end1, \
515 ==	register int end2);
516 * Giving end1 as OUT essentially eliminates the end1/end2 check.
517 *
518 * This implementation is a bit of a kludge, in that a trailing $ is first
519 * taken as an ordinary character and then revised to be an anchor.  The
520 * only undesirable side effect is that '$' gets included as a character
521 * category in such cases.  This is fairly harmless; not worth fixing.
522 * The amount of lookahead needed to avoid this kludge is excessive.
523 */
524static void
525p_bre(p, end1, end2)
526register struct parse *p;
527register int end1;		/* first terminating character */
528register int end2;		/* second terminating character */
529{
530	register sopno start = HERE();
531	register int first = 1;			/* first subexpression? */
532	register int wasdollar = 0;
533
534	if (EAT('^')) {
535		EMIT(OBOL, 0);
536		p->g->iflags |= USEBOL;
537		p->g->nbol++;
538	}
539	while (MORE() && !SEETWO(end1, end2)) {
540		wasdollar = p_simp_re(p, first);
541		first = 0;
542	}
543	if (wasdollar) {	/* oops, that was a trailing anchor */
544		DROP(1);
545		EMIT(OEOL, 0);
546		p->g->iflags |= USEEOL;
547		p->g->neol++;
548	}
549
550	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
551}
552
553/*
554 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
555 == static int p_simp_re(register struct parse *p, int starordinary);
556 */
557static int			/* was the simple RE an unbackslashed $? */
558p_simp_re(p, starordinary)
559register struct parse *p;
560int starordinary;		/* is a leading * an ordinary character? */
561{
562	register int c;
563	register int count;
564	register int count2;
565	register sopno pos;
566	register int i;
567	register sopno subno;
568#	define	BACKSL	(1<<CHAR_BIT)
569
570	pos = HERE();		/* repetion op, if any, covers from here */
571
572	assert(MORE());		/* caller should have ensured this */
573	c = GETNEXT();
574	if (c == '\\') {
575		(void)REQUIRE(MORE(), REG_EESCAPE);
576		c = BACKSL | GETNEXT();
577	}
578	switch (c) {
579	case '.':
580		if (p->g->cflags&REG_NEWLINE)
581			nonnewline(p);
582		else
583			EMIT(OANY, 0);
584		break;
585	case '[':
586		p_bracket(p);
587		break;
588	case BACKSL|'{':
589		SETERROR(REG_BADRPT);
590		break;
591	case BACKSL|'(':
592		p->g->nsub++;
593		subno = p->g->nsub;
594		if (subno < NPAREN)
595			p->pbegin[subno] = HERE();
596		EMIT(OLPAREN, subno);
597		/* the MORE here is an error heuristic */
598		if (MORE() && !SEETWO('\\', ')'))
599			p_bre(p, '\\', ')');
600		if (subno < NPAREN) {
601			p->pend[subno] = HERE();
602			assert(p->pend[subno] != 0);
603		}
604		EMIT(ORPAREN, subno);
605		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
606		break;
607	case BACKSL|')':	/* should not get here -- must be user */
608	case BACKSL|'}':
609		SETERROR(REG_EPAREN);
610		break;
611	case BACKSL|'1':
612	case BACKSL|'2':
613	case BACKSL|'3':
614	case BACKSL|'4':
615	case BACKSL|'5':
616	case BACKSL|'6':
617	case BACKSL|'7':
618	case BACKSL|'8':
619	case BACKSL|'9':
620		i = (c&~BACKSL) - '0';
621		assert(i < NPAREN);
622		if (p->pend[i] != 0) {
623			assert(i <= p->g->nsub);
624			EMIT(OBACK_, i);
625			assert(p->pbegin[i] != 0);
626			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
627			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
628			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
629			EMIT(O_BACK, i);
630		} else
631			SETERROR(REG_ESUBREG);
632		p->g->backrefs = 1;
633		break;
634	case '*':
635		(void)REQUIRE(starordinary, REG_BADRPT);
636		/* FALLTHROUGH */
637	default:
638		ordinary(p, (char)c);
639		break;
640	}
641
642	if (EAT('*')) {		/* implemented as +? */
643		/* this case does not require the (y|) trick, noKLUDGE */
644		INSERT(OPLUS_, pos);
645		ASTERN(O_PLUS, pos);
646		INSERT(OQUEST_, pos);
647		ASTERN(O_QUEST, pos);
648	} else if (EATTWO('\\', '{')) {
649		count = p_count(p);
650		if (EAT(',')) {
651			if (MORE() && isdigit((uch)PEEK())) {
652				count2 = p_count(p);
653				(void)REQUIRE(count <= count2, REG_BADBR);
654			} else		/* single number with comma */
655				count2 = INFINITY;
656		} else		/* just a single number */
657			count2 = count;
658		repeat(p, pos, count, count2);
659		if (!EATTWO('\\', '}')) {	/* error heuristics */
660			while (MORE() && !SEETWO('\\', '}'))
661				NEXT();
662			(void)REQUIRE(MORE(), REG_EBRACE);
663			SETERROR(REG_BADBR);
664		}
665	} else if (c == '$')     /* $ (but not \$) ends it */
666		return(1);
667
668	return(0);
669}
670
671/*
672 - p_count - parse a repetition count
673 == static int p_count(register struct parse *p);
674 */
675static int			/* the value */
676p_count(p)
677register struct parse *p;
678{
679	register int count = 0;
680	register int ndigits = 0;
681
682	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
683		count = count*10 + (GETNEXT() - '0');
684		ndigits++;
685	}
686
687	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
688	return(count);
689}
690
691/*
692 - p_bracket - parse a bracketed character list
693 == static void p_bracket(register struct parse *p);
694 *
695 * Note a significant property of this code:  if the allocset() did SETERROR,
696 * no set operations are done.
697 */
698static void
699p_bracket(p)
700register struct parse *p;
701{
702	register cset *cs = allocset(p);
703	register int invert = 0;
704
705	/* Dept of Truly Sickening Special-Case Kludges */
706	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
707		EMIT(OBOW, 0);
708		NEXTn(6);
709		return;
710	}
711	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
712		EMIT(OEOW, 0);
713		NEXTn(6);
714		return;
715	}
716
717	if (EAT('^'))
718		invert++;	/* make note to invert set at end */
719	if (EAT(']'))
720		CHadd(cs, ']');
721	else if (EAT('-'))
722		CHadd(cs, '-');
723	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
724		p_b_term(p, cs);
725	if (EAT('-'))
726		CHadd(cs, '-');
727	(void)MUSTEAT(']', REG_EBRACK);
728
729	if (p->error != 0)	/* don't mess things up further */
730		return;
731
732	if (p->g->cflags&REG_ICASE) {
733		register int i;
734		register int ci;
735
736		for (i = p->g->csetsize - 1; i >= 0; i--)
737			if (CHIN(cs, i) && isalpha(i)) {
738				ci = othercase(i);
739				if (ci != i)
740					CHadd(cs, ci);
741			}
742		if (cs->multis != NULL)
743			mccase(p, cs);
744	}
745	if (invert) {
746		register int i;
747
748		for (i = p->g->csetsize - 1; i >= 0; i--)
749			if (CHIN(cs, i))
750				CHsub(cs, i);
751			else
752				CHadd(cs, i);
753		if (p->g->cflags&REG_NEWLINE)
754			CHsub(cs, '\n');
755		if (cs->multis != NULL)
756			mcinvert(p, cs);
757	}
758
759	assert(cs->multis == NULL);		/* xxx */
760
761	if (nch(p, cs) == 1) {		/* optimize singleton sets */
762		ordinary(p, firstch(p, cs));
763		freeset(p, cs);
764	} else
765		EMIT(OANYOF, freezeset(p, cs));
766}
767
768/*
769 - p_b_term - parse one term of a bracketed character list
770 == static void p_b_term(register struct parse *p, register cset *cs);
771 */
772static void
773p_b_term(p, cs)
774register struct parse *p;
775register cset *cs;
776{
777	register char c;
778	register char start, finish;
779	register int i;
780
781	/* classify what we've got */
782	switch ((MORE()) ? PEEK() : '\0') {
783	case '[':
784		c = (MORE2()) ? PEEK2() : '\0';
785		break;
786	case '-':
787		SETERROR(REG_ERANGE);
788		return;			/* NOTE RETURN */
789		break;
790	default:
791		c = '\0';
792		break;
793	}
794
795	switch (c) {
796	case ':':		/* character class */
797		NEXT2();
798		(void)REQUIRE(MORE(), REG_EBRACK);
799		c = PEEK();
800		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
801		p_b_cclass(p, cs);
802		(void)REQUIRE(MORE(), REG_EBRACK);
803		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
804		break;
805	case '=':		/* equivalence class */
806		NEXT2();
807		(void)REQUIRE(MORE(), REG_EBRACK);
808		c = PEEK();
809		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
810		p_b_eclass(p, cs);
811		(void)REQUIRE(MORE(), REG_EBRACK);
812		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
813		break;
814	default:		/* symbol, ordinary character, or range */
815/* xxx revision needed for multichar stuff */
816		start = p_b_symbol(p);
817		if (SEE('-') && MORE2() && PEEK2() != ']') {
818			/* range */
819			NEXT();
820			if (EAT('-'))
821				finish = '-';
822			else
823				finish = p_b_symbol(p);
824		} else
825			finish = start;
826		if (start == finish)
827			CHadd(cs, start);
828		else {
829			if (__collate_load_error) {
830				(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
831				for (i = (uch)start; i <= (uch)finish; i++)
832					CHadd(cs, i);
833			} else {
834				(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
835				for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
836					if (   __collate_range_cmp(start, i) <= 0
837					    && __collate_range_cmp(i, finish) <= 0
838					   )
839						CHadd(cs, i);
840				}
841			}
842		}
843		break;
844	}
845}
846
847/*
848 - p_b_cclass - parse a character-class name and deal with it
849 == static void p_b_cclass(register struct parse *p, register cset *cs);
850 */
851static void
852p_b_cclass(p, cs)
853register struct parse *p;
854register cset *cs;
855{
856	register int c;
857	register char *sp = p->next;
858	register struct cclass *cp;
859	register size_t len;
860
861	while (MORE() && isalpha((uch)PEEK()))
862		NEXT();
863	len = p->next - sp;
864	for (cp = cclasses; cp->name != NULL; cp++)
865		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
866			break;
867	if (cp->name == NULL) {
868		/* oops, didn't find it */
869		SETERROR(REG_ECTYPE);
870		return;
871	}
872
873	switch (cp->fidx) {
874	case CALNUM:
875		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
876			if (isalnum((uch)c))
877				CHadd(cs, c);
878		break;
879	case CALPHA:
880		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
881			if (isalpha((uch)c))
882				CHadd(cs, c);
883		break;
884	case CBLANK:
885		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
886			if (isblank((uch)c))
887				CHadd(cs, c);
888		break;
889	case CCNTRL:
890		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
891			if (iscntrl((uch)c))
892				CHadd(cs, c);
893		break;
894	case CDIGIT:
895		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
896			if (isdigit((uch)c))
897				CHadd(cs, c);
898		break;
899	case CGRAPH:
900		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
901			if (isgraph((uch)c))
902				CHadd(cs, c);
903		break;
904	case CLOWER:
905		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
906			if (islower((uch)c))
907				CHadd(cs, c);
908		break;
909	case CPRINT:
910		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
911			if (isprint((uch)c))
912				CHadd(cs, c);
913		break;
914	case CPUNCT:
915		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
916			if (ispunct((uch)c))
917				CHadd(cs, c);
918		break;
919	case CSPACE:
920		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
921			if (isspace((uch)c))
922				CHadd(cs, c);
923		break;
924	case CUPPER:
925		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
926			if (isupper((uch)c))
927				CHadd(cs, c);
928		break;
929	case CXDIGIT:
930		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
931			if (isxdigit((uch)c))
932				CHadd(cs, c);
933		break;
934	}
935#if 0
936	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
937		MCadd(p, cs, u);
938#endif
939}
940
941/*
942 - p_b_eclass - parse an equivalence-class name and deal with it
943 == static void p_b_eclass(register struct parse *p, register cset *cs);
944 *
945 * This implementation is incomplete. xxx
946 */
947static void
948p_b_eclass(p, cs)
949register struct parse *p;
950register cset *cs;
951{
952	register char c;
953
954	c = p_b_coll_elem(p, '=');
955	CHadd(cs, c);
956}
957
958/*
959 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
960 == static char p_b_symbol(register struct parse *p);
961 */
962static char			/* value of symbol */
963p_b_symbol(p)
964register struct parse *p;
965{
966	register char value;
967
968	(void)REQUIRE(MORE(), REG_EBRACK);
969	if (!EATTWO('[', '.'))
970		return(GETNEXT());
971
972	/* collating symbol */
973	value = p_b_coll_elem(p, '.');
974	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
975	return(value);
976}
977
978/*
979 - p_b_coll_elem - parse a collating-element name and look it up
980 == static char p_b_coll_elem(register struct parse *p, int endc);
981 */
982static char			/* value of collating element */
983p_b_coll_elem(p, endc)
984register struct parse *p;
985int endc;			/* name ended by endc,']' */
986{
987	register char *sp = p->next;
988	register struct cname *cp;
989	register int len;
990
991	while (MORE() && !SEETWO(endc, ']'))
992		NEXT();
993	if (!MORE()) {
994		SETERROR(REG_EBRACK);
995		return(0);
996	}
997	len = p->next - sp;
998	for (cp = cnames; cp->name != NULL; cp++)
999		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1000			return(cp->code);	/* known name */
1001	if (len == 1)
1002		return(*sp);	/* single character */
1003	SETERROR(REG_ECOLLATE);			/* neither */
1004	return(0);
1005}
1006
1007/*
1008 - othercase - return the case counterpart of an alphabetic
1009 == static char othercase(int ch);
1010 */
1011static char			/* if no counterpart, return ch */
1012othercase(ch)
1013int ch;
1014{
1015	ch = (uch)ch;
1016	assert(isalpha(ch));
1017	if (isupper(ch))
1018		return(tolower(ch));
1019	else if (islower(ch))
1020		return(toupper(ch));
1021	else			/* peculiar, but could happen */
1022		return(ch);
1023}
1024
1025/*
1026 - bothcases - emit a dualcase version of a two-case character
1027 == static void bothcases(register struct parse *p, int ch);
1028 *
1029 * Boy, is this implementation ever a kludge...
1030 */
1031static void
1032bothcases(p, ch)
1033register struct parse *p;
1034int ch;
1035{
1036	register char *oldnext = p->next;
1037	register char *oldend = p->end;
1038	char bracket[3];
1039
1040	ch = (uch)ch;
1041	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1042	p->next = bracket;
1043	p->end = bracket+2;
1044	bracket[0] = ch;
1045	bracket[1] = ']';
1046	bracket[2] = '\0';
1047	p_bracket(p);
1048	assert(p->next == bracket+2);
1049	p->next = oldnext;
1050	p->end = oldend;
1051}
1052
1053/*
1054 - ordinary - emit an ordinary character
1055 == static void ordinary(register struct parse *p, register int ch);
1056 */
1057static void
1058ordinary(p, ch)
1059register struct parse *p;
1060register int ch;
1061{
1062	register cat_t *cap = p->g->categories;
1063
1064	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1065		bothcases(p, ch);
1066	else {
1067		EMIT(OCHAR, (uch)ch);
1068		if (cap[ch] == 0)
1069			cap[ch] = p->g->ncategories++;
1070	}
1071}
1072
1073/*
1074 - nonnewline - emit REG_NEWLINE version of OANY
1075 == static void nonnewline(register struct parse *p);
1076 *
1077 * Boy, is this implementation ever a kludge...
1078 */
1079static void
1080nonnewline(p)
1081register struct parse *p;
1082{
1083	register char *oldnext = p->next;
1084	register char *oldend = p->end;
1085	char bracket[4];
1086
1087	p->next = bracket;
1088	p->end = bracket+3;
1089	bracket[0] = '^';
1090	bracket[1] = '\n';
1091	bracket[2] = ']';
1092	bracket[3] = '\0';
1093	p_bracket(p);
1094	assert(p->next == bracket+3);
1095	p->next = oldnext;
1096	p->end = oldend;
1097}
1098
1099/*
1100 - repeat - generate code for a bounded repetition, recursively if needed
1101 == static void repeat(register struct parse *p, sopno start, int from, int to);
1102 */
1103static void
1104repeat(p, start, from, to)
1105register struct parse *p;
1106sopno start;			/* operand from here to end of strip */
1107int from;			/* repeated from this number */
1108int to;				/* to this number of times (maybe INFINITY) */
1109{
1110	register sopno finish = HERE();
1111#	define	N	2
1112#	define	INF	3
1113#	define	REP(f, t)	((f)*8 + (t))
1114#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1115	register sopno copy;
1116
1117	if (p->error != 0)	/* head off possible runaway recursion */
1118		return;
1119
1120	assert(from <= to);
1121
1122	switch (REP(MAP(from), MAP(to))) {
1123	case REP(0, 0):			/* must be user doing this */
1124		DROP(finish-start);	/* drop the operand */
1125		break;
1126	case REP(0, 1):			/* as x{1,1}? */
1127	case REP(0, N):			/* as x{1,n}? */
1128	case REP(0, INF):		/* as x{1,}? */
1129		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1130		INSERT(OCH_, start);		/* offset is wrong... */
1131		repeat(p, start+1, 1, to);
1132		ASTERN(OOR1, start);
1133		AHEAD(start);			/* ... fix it */
1134		EMIT(OOR2, 0);
1135		AHEAD(THERE());
1136		ASTERN(O_CH, THERETHERE());
1137		break;
1138	case REP(1, 1):			/* trivial case */
1139		/* done */
1140		break;
1141	case REP(1, N):			/* as x?x{1,n-1} */
1142		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1143		INSERT(OCH_, start);
1144		ASTERN(OOR1, start);
1145		AHEAD(start);
1146		EMIT(OOR2, 0);			/* offset very wrong... */
1147		AHEAD(THERE());			/* ...so fix it */
1148		ASTERN(O_CH, THERETHERE());
1149		copy = dupl(p, start+1, finish+1);
1150		assert(copy == finish+4);
1151		repeat(p, copy, 1, to-1);
1152		break;
1153	case REP(1, INF):		/* as x+ */
1154		INSERT(OPLUS_, start);
1155		ASTERN(O_PLUS, start);
1156		break;
1157	case REP(N, N):			/* as xx{m-1,n-1} */
1158		copy = dupl(p, start, finish);
1159		repeat(p, copy, from-1, to-1);
1160		break;
1161	case REP(N, INF):		/* as xx{n-1,INF} */
1162		copy = dupl(p, start, finish);
1163		repeat(p, copy, from-1, to);
1164		break;
1165	default:			/* "can't happen" */
1166		SETERROR(REG_ASSERT);	/* just in case */
1167		break;
1168	}
1169}
1170
1171/*
1172 - seterr - set an error condition
1173 == static int seterr(register struct parse *p, int e);
1174 */
1175static int			/* useless but makes type checking happy */
1176seterr(p, e)
1177register struct parse *p;
1178int e;
1179{
1180	if (p->error == 0)	/* keep earliest error condition */
1181		p->error = e;
1182	p->next = nuls;		/* try to bring things to a halt */
1183	p->end = nuls;
1184	return(0);		/* make the return value well-defined */
1185}
1186
1187/*
1188 - allocset - allocate a set of characters for []
1189 == static cset *allocset(register struct parse *p);
1190 */
1191static cset *
1192allocset(p)
1193register struct parse *p;
1194{
1195	register int no = p->g->ncsets++;
1196	register size_t nc;
1197	register size_t nbytes;
1198	register cset *cs;
1199	register size_t css = (size_t)p->g->csetsize;
1200	register int i;
1201
1202	if (no >= p->ncsalloc) {	/* need another column of space */
1203		p->ncsalloc += CHAR_BIT;
1204		nc = p->ncsalloc;
1205		assert(nc % CHAR_BIT == 0);
1206		nbytes = nc / CHAR_BIT * css;
1207		if (p->g->sets == NULL)
1208			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1209		else
1210			p->g->sets = (cset *)reallocf((char *)p->g->sets,
1211							nc * sizeof(cset));
1212		if (p->g->setbits == NULL)
1213			p->g->setbits = (uch *)malloc(nbytes);
1214		else {
1215			p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
1216								nbytes);
1217			/* xxx this isn't right if setbits is now NULL */
1218			for (i = 0; i < no; i++)
1219				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1220		}
1221		if (p->g->sets != NULL && p->g->setbits != NULL)
1222			(void) memset((char *)p->g->setbits + (nbytes - css),
1223								0, css);
1224		else {
1225			no = 0;
1226			SETERROR(REG_ESPACE);
1227			/* caller's responsibility not to do set ops */
1228		}
1229	}
1230
1231	assert(p->g->sets != NULL);	/* xxx */
1232	cs = &p->g->sets[no];
1233	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1234	cs->mask = 1 << ((no) % CHAR_BIT);
1235	cs->hash = 0;
1236	cs->smultis = 0;
1237	cs->multis = NULL;
1238
1239	return(cs);
1240}
1241
1242/*
1243 - freeset - free a now-unused set
1244 == static void freeset(register struct parse *p, register cset *cs);
1245 */
1246static void
1247freeset(p, cs)
1248register struct parse *p;
1249register cset *cs;
1250{
1251	register int i;
1252	register cset *top = &p->g->sets[p->g->ncsets];
1253	register size_t css = (size_t)p->g->csetsize;
1254
1255	for (i = 0; i < css; i++)
1256		CHsub(cs, i);
1257	if (cs == top-1)	/* recover only the easy case */
1258		p->g->ncsets--;
1259}
1260
1261/*
1262 - freezeset - final processing on a set of characters
1263 == static int freezeset(register struct parse *p, register cset *cs);
1264 *
1265 * The main task here is merging identical sets.  This is usually a waste
1266 * of time (although the hash code minimizes the overhead), but can win
1267 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1268 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1269 * the same value!
1270 */
1271static int			/* set number */
1272freezeset(p, cs)
1273register struct parse *p;
1274register cset *cs;
1275{
1276	register short h = cs->hash;
1277	register int i;
1278	register cset *top = &p->g->sets[p->g->ncsets];
1279	register cset *cs2;
1280	register size_t css = (size_t)p->g->csetsize;
1281
1282	/* look for an earlier one which is the same */
1283	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1284		if (cs2->hash == h && cs2 != cs) {
1285			/* maybe */
1286			for (i = 0; i < css; i++)
1287				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1288					break;		/* no */
1289			if (i == css)
1290				break;			/* yes */
1291		}
1292
1293	if (cs2 < top) {	/* found one */
1294		freeset(p, cs);
1295		cs = cs2;
1296	}
1297
1298	return((int)(cs - p->g->sets));
1299}
1300
1301/*
1302 - firstch - return first character in a set (which must have at least one)
1303 == static int firstch(register struct parse *p, register cset *cs);
1304 */
1305static int			/* character; there is no "none" value */
1306firstch(p, cs)
1307register struct parse *p;
1308register cset *cs;
1309{
1310	register int i;
1311	register size_t css = (size_t)p->g->csetsize;
1312
1313	for (i = 0; i < css; i++)
1314		if (CHIN(cs, i))
1315			return((char)i);
1316	assert(never);
1317	return(0);		/* arbitrary */
1318}
1319
1320/*
1321 - nch - number of characters in a set
1322 == static int nch(register struct parse *p, register cset *cs);
1323 */
1324static int
1325nch(p, cs)
1326register struct parse *p;
1327register cset *cs;
1328{
1329	register int i;
1330	register size_t css = (size_t)p->g->csetsize;
1331	register int n = 0;
1332
1333	for (i = 0; i < css; i++)
1334		if (CHIN(cs, i))
1335			n++;
1336	return(n);
1337}
1338
1339/*
1340 - mcadd - add a collating element to a cset
1341 == static void mcadd(register struct parse *p, register cset *cs, \
1342 ==	register char *cp);
1343 */
1344static void
1345mcadd(p, cs, cp)
1346register struct parse *p;
1347register cset *cs;
1348register char *cp;
1349{
1350	register size_t oldend = cs->smultis;
1351
1352	cs->smultis += strlen(cp) + 1;
1353	if (cs->multis == NULL)
1354		cs->multis = malloc(cs->smultis);
1355	else
1356		cs->multis = reallocf(cs->multis, cs->smultis);
1357	if (cs->multis == NULL) {
1358		SETERROR(REG_ESPACE);
1359		return;
1360	}
1361
1362	(void) strcpy(cs->multis + oldend - 1, cp);
1363	cs->multis[cs->smultis - 1] = '\0';
1364}
1365
1366#if used
1367/*
1368 - mcsub - subtract a collating element from a cset
1369 == static void mcsub(register cset *cs, register char *cp);
1370 */
1371static void
1372mcsub(cs, cp)
1373register cset *cs;
1374register char *cp;
1375{
1376	register char *fp = mcfind(cs, cp);
1377	register size_t len = strlen(fp);
1378
1379	assert(fp != NULL);
1380	(void) memmove(fp, fp + len + 1,
1381				cs->smultis - (fp + len + 1 - cs->multis));
1382	cs->smultis -= len;
1383
1384	if (cs->smultis == 0) {
1385		free(cs->multis);
1386		cs->multis = NULL;
1387		return;
1388	}
1389
1390	cs->multis = reallocf(cs->multis, cs->smultis);
1391	assert(cs->multis != NULL);
1392}
1393
1394/*
1395 - mcin - is a collating element in a cset?
1396 == static int mcin(register cset *cs, register char *cp);
1397 */
1398static int
1399mcin(cs, cp)
1400register cset *cs;
1401register char *cp;
1402{
1403	return(mcfind(cs, cp) != NULL);
1404}
1405
1406/*
1407 - mcfind - find a collating element in a cset
1408 == static char *mcfind(register cset *cs, register char *cp);
1409 */
1410static char *
1411mcfind(cs, cp)
1412register cset *cs;
1413register char *cp;
1414{
1415	register char *p;
1416
1417	if (cs->multis == NULL)
1418		return(NULL);
1419	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1420		if (strcmp(cp, p) == 0)
1421			return(p);
1422	return(NULL);
1423}
1424#endif
1425
1426/*
1427 - mcinvert - invert the list of collating elements in a cset
1428 == static void mcinvert(register struct parse *p, register cset *cs);
1429 *
1430 * This would have to know the set of possibilities.  Implementation
1431 * is deferred.
1432 */
1433static void
1434mcinvert(p, cs)
1435register struct parse *p;
1436register cset *cs;
1437{
1438	assert(cs->multis == NULL);	/* xxx */
1439}
1440
1441/*
1442 - mccase - add case counterparts of the list of collating elements in a cset
1443 == static void mccase(register struct parse *p, register cset *cs);
1444 *
1445 * This would have to know the set of possibilities.  Implementation
1446 * is deferred.
1447 */
1448static void
1449mccase(p, cs)
1450register struct parse *p;
1451register cset *cs;
1452{
1453	assert(cs->multis == NULL);	/* xxx */
1454}
1455
1456/*
1457 - isinsets - is this character in any sets?
1458 == static int isinsets(register struct re_guts *g, int c);
1459 */
1460static int			/* predicate */
1461isinsets(g, c)
1462register struct re_guts *g;
1463int c;
1464{
1465	register uch *col;
1466	register int i;
1467	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1468	register unsigned uc = (uch)c;
1469
1470	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1471		if (col[uc] != 0)
1472			return(1);
1473	return(0);
1474}
1475
1476/*
1477 - samesets - are these two characters in exactly the same sets?
1478 == static int samesets(register struct re_guts *g, int c1, int c2);
1479 */
1480static int			/* predicate */
1481samesets(g, c1, c2)
1482register struct re_guts *g;
1483int c1;
1484int c2;
1485{
1486	register uch *col;
1487	register int i;
1488	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1489	register unsigned uc1 = (uch)c1;
1490	register unsigned uc2 = (uch)c2;
1491
1492	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1493		if (col[uc1] != col[uc2])
1494			return(0);
1495	return(1);
1496}
1497
1498/*
1499 - categorize - sort out character categories
1500 == static void categorize(struct parse *p, register struct re_guts *g);
1501 */
1502static void
1503categorize(p, g)
1504struct parse *p;
1505register struct re_guts *g;
1506{
1507	register cat_t *cats = g->categories;
1508	register int c;
1509	register int c2;
1510	register cat_t cat;
1511
1512	/* avoid making error situations worse */
1513	if (p->error != 0)
1514		return;
1515
1516	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1517		if (cats[c] == 0 && isinsets(g, c)) {
1518			cat = g->ncategories++;
1519			cats[c] = cat;
1520			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1521				if (cats[c2] == 0 && samesets(g, c, c2))
1522					cats[c2] = cat;
1523		}
1524}
1525
1526/*
1527 - dupl - emit a duplicate of a bunch of sops
1528 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1529 */
1530static sopno			/* start of duplicate */
1531dupl(p, start, finish)
1532register struct parse *p;
1533sopno start;			/* from here */
1534sopno finish;			/* to this less one */
1535{
1536	register sopno ret = HERE();
1537	register sopno len = finish - start;
1538
1539	assert(finish >= start);
1540	if (len == 0)
1541		return(ret);
1542	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1543	assert(p->ssize >= p->slen + len);
1544	(void) memcpy((char *)(p->strip + p->slen),
1545		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1546	p->slen += len;
1547	return(ret);
1548}
1549
1550/*
1551 - doemit - emit a strip operator
1552 == static void doemit(register struct parse *p, sop op, size_t opnd);
1553 *
1554 * It might seem better to implement this as a macro with a function as
1555 * hard-case backup, but it's just too big and messy unless there are
1556 * some changes to the data structures.  Maybe later.
1557 */
1558static void
1559doemit(p, op, opnd)
1560register struct parse *p;
1561sop op;
1562size_t opnd;
1563{
1564	/* avoid making error situations worse */
1565	if (p->error != 0)
1566		return;
1567
1568	/* deal with oversize operands ("can't happen", more or less) */
1569	assert(opnd < 1<<OPSHIFT);
1570
1571	/* deal with undersized strip */
1572	if (p->slen >= p->ssize)
1573		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1574	assert(p->slen < p->ssize);
1575
1576	/* finally, it's all reduced to the easy case */
1577	p->strip[p->slen++] = SOP(op, opnd);
1578}
1579
1580/*
1581 - doinsert - insert a sop into the strip
1582 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1583 */
1584static void
1585doinsert(p, op, opnd, pos)
1586register struct parse *p;
1587sop op;
1588size_t opnd;
1589sopno pos;
1590{
1591	register sopno sn;
1592	register sop s;
1593	register int i;
1594
1595	/* avoid making error situations worse */
1596	if (p->error != 0)
1597		return;
1598
1599	sn = HERE();
1600	EMIT(op, opnd);		/* do checks, ensure space */
1601	assert(HERE() == sn+1);
1602	s = p->strip[sn];
1603
1604	/* adjust paren pointers */
1605	assert(pos > 0);
1606	for (i = 1; i < NPAREN; i++) {
1607		if (p->pbegin[i] >= pos) {
1608			p->pbegin[i]++;
1609		}
1610		if (p->pend[i] >= pos) {
1611			p->pend[i]++;
1612		}
1613	}
1614
1615	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1616						(HERE()-pos-1)*sizeof(sop));
1617	p->strip[pos] = s;
1618}
1619
1620/*
1621 - dofwd - complete a forward reference
1622 == static void dofwd(register struct parse *p, sopno pos, sop value);
1623 */
1624static void
1625dofwd(p, pos, value)
1626register struct parse *p;
1627register sopno pos;
1628sop value;
1629{
1630	/* avoid making error situations worse */
1631	if (p->error != 0)
1632		return;
1633
1634	assert(value < 1<<OPSHIFT);
1635	p->strip[pos] = OP(p->strip[pos]) | value;
1636}
1637
1638/*
1639 - enlarge - enlarge the strip
1640 == static void enlarge(register struct parse *p, sopno size);
1641 */
1642static void
1643enlarge(p, size)
1644register struct parse *p;
1645register sopno size;
1646{
1647	register sop *sp;
1648
1649	if (p->ssize >= size)
1650		return;
1651
1652	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1653	if (sp == NULL) {
1654		SETERROR(REG_ESPACE);
1655		return;
1656	}
1657	p->strip = sp;
1658	p->ssize = size;
1659}
1660
1661/*
1662 - stripsnug - compact the strip
1663 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1664 */
1665static void
1666stripsnug(p, g)
1667register struct parse *p;
1668register struct re_guts *g;
1669{
1670	g->nstates = p->slen;
1671	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1672	if (g->strip == NULL) {
1673		SETERROR(REG_ESPACE);
1674		g->strip = p->strip;
1675	}
1676}
1677
1678/*
1679 - findmust - fill in must and mlen with longest mandatory literal string
1680 == static void findmust(register struct parse *p, register struct re_guts *g);
1681 *
1682 * This algorithm could do fancy things like analyzing the operands of |
1683 * for common subsequences.  Someday.  This code is simple and finds most
1684 * of the interesting cases.
1685 *
1686 * Note that must and mlen got initialized during setup.
1687 */
1688static void
1689findmust(p, g)
1690struct parse *p;
1691register struct re_guts *g;
1692{
1693	register sop *scan;
1694	sop *start;
1695	register sop *newstart;
1696	register sopno newlen;
1697	register sop s;
1698	register char *cp;
1699	register sopno i;
1700	int offset;
1701	int cs, mccs;
1702
1703	/* avoid making error situations worse */
1704	if (p->error != 0)
1705		return;
1706
1707	/* Find out if we can handle OANYOF or not */
1708	mccs = 0;
1709	for (cs = 0; cs < g->ncsets; cs++)
1710		if (g->sets[cs].multis != NULL)
1711			mccs = 1;
1712
1713	/* find the longest OCHAR sequence in strip */
1714	newlen = 0;
1715	offset = 0;
1716	g->moffset = 0;
1717	scan = g->strip + 1;
1718	do {
1719		s = *scan++;
1720		switch (OP(s)) {
1721		case OCHAR:		/* sequence member */
1722			if (newlen == 0)		/* new sequence */
1723				newstart = scan - 1;
1724			newlen++;
1725			break;
1726		case OPLUS_:		/* things that don't break one */
1727		case OLPAREN:
1728		case ORPAREN:
1729			break;
1730		case OQUEST_:		/* things that must be skipped */
1731		case OCH_:
1732			offset = altoffset(scan, offset, mccs);
1733			scan--;
1734			do {
1735				scan += OPND(s);
1736				s = *scan;
1737				/* assert() interferes w debug printouts */
1738				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1739							OP(s) != OOR2) {
1740					g->iflags |= BAD;
1741					return;
1742				}
1743			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1744			/* fallthrough */
1745		case OBOW:		/* things that break a sequence */
1746		case OEOW:
1747		case OBOL:
1748		case OEOL:
1749		case O_QUEST:
1750		case O_CH:
1751		case OEND:
1752			if (newlen > g->mlen) {		/* ends one */
1753				start = newstart;
1754				g->mlen = newlen;
1755				if (offset > -1) {
1756					g->moffset += offset;
1757					offset = newlen;
1758				} else
1759					g->moffset = offset;
1760			} else {
1761				if (offset > -1)
1762					offset += newlen;
1763			}
1764			newlen = 0;
1765			break;
1766		case OANY:
1767			if (newlen > g->mlen) {		/* ends one */
1768				start = newstart;
1769				g->mlen = newlen;
1770				if (offset > -1) {
1771					g->moffset += offset;
1772					offset = newlen;
1773				} else
1774					g->moffset = offset;
1775			} else {
1776				if (offset > -1)
1777					offset += newlen;
1778			}
1779			if (offset > -1)
1780				offset++;
1781			newlen = 0;
1782			break;
1783		case OANYOF:		/* may or may not invalidate offset */
1784			/* First, everything as OANY */
1785			if (newlen > g->mlen) {		/* ends one */
1786				start = newstart;
1787				g->mlen = newlen;
1788				if (offset > -1) {
1789					g->moffset += offset;
1790					offset = newlen;
1791				} else
1792					g->moffset = offset;
1793			} else {
1794				if (offset > -1)
1795					offset += newlen;
1796			}
1797			if (offset > -1)
1798				offset++;
1799			newlen = 0;
1800			/* And, now, if we found out we can't deal with
1801			 * it, make offset = -1.
1802			 */
1803			if (mccs)
1804				offset = -1;
1805			break;
1806		default:
1807			/* Anything here makes it impossible or too hard
1808			 * to calculate the offset -- so we give up;
1809			 * save the last known good offset, in case the
1810			 * must sequence doesn't occur later.
1811			 */
1812			if (newlen > g->mlen) {		/* ends one */
1813				start = newstart;
1814				g->mlen = newlen;
1815				if (offset > -1)
1816					g->moffset += offset;
1817				else
1818					g->moffset = offset;
1819			}
1820			offset = -1;
1821			newlen = 0;
1822			break;
1823		}
1824	} while (OP(s) != OEND);
1825
1826	if (g->mlen == 0) {		/* there isn't one */
1827		g->moffset = -1;
1828		return;
1829	}
1830
1831	/* turn it into a character string */
1832	g->must = malloc((size_t)g->mlen + 1);
1833	if (g->must == NULL) {		/* argh; just forget it */
1834		g->mlen = 0;
1835		g->moffset = -1;
1836		return;
1837	}
1838	cp = g->must;
1839	scan = start;
1840	for (i = g->mlen; i > 0; i--) {
1841		while (OP(s = *scan++) != OCHAR)
1842			continue;
1843		assert(cp < g->must + g->mlen);
1844		*cp++ = (char)OPND(s);
1845	}
1846	assert(cp == g->must + g->mlen);
1847	*cp++ = '\0';		/* just on general principles */
1848}
1849
1850/*
1851 - altoffset - choose biggest offset among multiple choices
1852 == static int altoffset(sop *scan, int offset, int mccs);
1853 *
1854 * Compute, recursively if necessary, the largest offset among multiple
1855 * re paths.
1856 */
1857static int
1858altoffset(scan, offset, mccs)
1859sop *scan;
1860int offset;
1861int mccs;
1862{
1863	int largest;
1864	int try;
1865	sop s;
1866
1867	/* If we gave up already on offsets, return */
1868	if (offset == -1)
1869		return -1;
1870
1871	largest = 0;
1872	try = 0;
1873	s = *scan++;
1874	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1875		switch (OP(s)) {
1876		case OOR1:
1877			if (try > largest)
1878				largest = try;
1879			try = 0;
1880			break;
1881		case OQUEST_:
1882		case OCH_:
1883			try = altoffset(scan, try, mccs);
1884			if (try == -1)
1885				return -1;
1886			scan--;
1887			do {
1888				scan += OPND(s);
1889				s = *scan;
1890				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1891							OP(s) != OOR2)
1892					return -1;
1893			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1894			/* We must skip to the next position, or we'll
1895			 * leave altoffset() too early.
1896			 */
1897			scan++;
1898			break;
1899		case OANYOF:
1900			if (mccs)
1901				return -1;
1902		case OCHAR:
1903		case OANY:
1904			try++;
1905		case OBOW:
1906		case OEOW:
1907		case OLPAREN:
1908		case ORPAREN:
1909		case OOR2:
1910			break;
1911		default:
1912			try = -1;
1913			break;
1914		}
1915		if (try == -1)
1916			return -1;
1917		s = *scan++;
1918	}
1919
1920	if (try > largest)
1921		largest = try;
1922
1923	return largest+offset;
1924}
1925
1926/*
1927 - computejumps - compute char jumps for BM scan
1928 == static void computejumps(register struct parse *p, register struct re_guts *g);
1929 *
1930 * This algorithm assumes g->must exists and is has size greater than
1931 * zero. It's based on the algorithm found on Computer Algorithms by
1932 * Sara Baase.
1933 *
1934 * A char jump is the number of characters one needs to jump based on
1935 * the value of the character from the text that was mismatched.
1936 */
1937static void
1938computejumps(p, g)
1939struct parse *p;
1940struct re_guts *g;
1941{
1942	int ch;
1943	int mindex;
1944
1945	/* Avoid making errors worse */
1946	if (p->error != 0)
1947		return;
1948
1949	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1950	if (g->charjump == NULL)	/* Not a fatal error */
1951		return;
1952	/* Adjust for signed chars, if necessary */
1953	g->charjump = &g->charjump[-(CHAR_MIN)];
1954
1955	/* If the character does not exist in the pattern, the jump
1956	 * is equal to the number of characters in the pattern.
1957	 */
1958	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1959		g->charjump[ch] = g->mlen;
1960
1961	/* If the character does exist, compute the jump that would
1962	 * take us to the last character in the pattern equal to it
1963	 * (notice that we match right to left, so that last character
1964	 * is the first one that would be matched).
1965	 */
1966	for (mindex = 0; mindex < g->mlen; mindex++)
1967		g->charjump[g->must[mindex]] = g->mlen - mindex - 1;
1968}
1969
1970/*
1971 - computematchjumps - compute match jumps for BM scan
1972 == static void computematchjumps(register struct parse *p, register struct re_guts *g);
1973 *
1974 * This algorithm assumes g->must exists and is has size greater than
1975 * zero. It's based on the algorithm found on Computer Algorithms by
1976 * Sara Baase.
1977 *
1978 * A match jump is the number of characters one needs to advance based
1979 * on the already-matched suffix.
1980 * Notice that all values here are minus (g->mlen-1), because of the way
1981 * the search algorithm works.
1982 */
1983static void
1984computematchjumps(p, g)
1985struct parse *p;
1986struct re_guts *g;
1987{
1988	int mindex;		/* General "must" iterator */
1989	int suffix;		/* Keeps track of matching suffix */
1990	int ssuffix;		/* Keeps track of suffixes' suffix */
1991	int* pmatches;		/* pmatches[k] points to the next i
1992				 * such that i+1...mlen is a substring
1993				 * of k+1...k+mlen-i-1
1994				 */
1995
1996	/* Avoid making errors worse */
1997	if (p->error != 0)
1998		return;
1999
2000	pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
2001	if (pmatches == NULL) {
2002		g->matchjump = NULL;
2003		return;
2004	}
2005
2006	g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
2007	if (g->matchjump == NULL)	/* Not a fatal error */
2008		return;
2009
2010	/* Set maximum possible jump for each character in the pattern */
2011	for (mindex = 0; mindex < g->mlen; mindex++)
2012		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2013
2014	/* Compute pmatches[] */
2015	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2016	    mindex--, suffix--) {
2017		pmatches[mindex] = suffix;
2018
2019		/* If a mismatch is found, interrupting the substring,
2020		 * compute the matchjump for that position. If no
2021		 * mismatch is found, then a text substring mismatched
2022		 * against the suffix will also mismatch against the
2023		 * substring.
2024		 */
2025		while (suffix < g->mlen
2026		    && g->must[mindex] != g->must[suffix]) {
2027			g->matchjump[suffix] = MIN(g->matchjump[suffix],
2028			    g->mlen - mindex - 1);
2029			suffix = pmatches[suffix];
2030		}
2031	}
2032
2033	/* Compute the matchjump up to the last substring found to jump
2034	 * to the beginning of the largest must pattern prefix matching
2035	 * it's own suffix.
2036	 */
2037	for (mindex = 0; mindex <= suffix; mindex++)
2038		g->matchjump[mindex] = MIN(g->matchjump[mindex],
2039		    g->mlen + suffix - mindex);
2040
2041        ssuffix = pmatches[suffix];
2042        while (suffix < g->mlen) {
2043                while (suffix <= ssuffix && suffix < g->mlen) {
2044                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
2045			    g->mlen + ssuffix - suffix);
2046                        suffix++;
2047                }
2048		if (suffix < g->mlen)
2049                	ssuffix = pmatches[ssuffix];
2050        }
2051
2052	free(pmatches);
2053}
2054
2055/*
2056 - pluscount - count + nesting
2057 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
2058 */
2059static sopno			/* nesting depth */
2060pluscount(p, g)
2061struct parse *p;
2062register struct re_guts *g;
2063{
2064	register sop *scan;
2065	register sop s;
2066	register sopno plusnest = 0;
2067	register sopno maxnest = 0;
2068
2069	if (p->error != 0)
2070		return(0);	/* there may not be an OEND */
2071
2072	scan = g->strip + 1;
2073	do {
2074		s = *scan++;
2075		switch (OP(s)) {
2076		case OPLUS_:
2077			plusnest++;
2078			break;
2079		case O_PLUS:
2080			if (plusnest > maxnest)
2081				maxnest = plusnest;
2082			plusnest--;
2083			break;
2084		}
2085	} while (OP(s) != OEND);
2086	if (plusnest != 0)
2087		g->iflags |= BAD;
2088	return(maxnest);
2089}
2090