1/*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results.  The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33/* scanning macros (know about v) */
34#define	ATEOS()		(v->now >= v->stop)
35#define	HAVE(n)		(v->stop - v->now >= (n))
36#define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
37#define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
38#define	NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
39						*(v->now+1) == CHR(b) && \
40						*(v->now+2) == CHR(c))
41#define	SET(c)		(v->nexttype = (c))
42#define	SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
43#define	RET(c)		return (SET(c), 1)
44#define	RETV(c, n)	return (SETV(c, n), 1)
45#define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
46#define	LASTTYPE(t)	(v->lasttype == (t))
47
48/* lexical contexts */
49#define	L_ERE	1	/* mainline ERE/ARE */
50#define	L_BRE	2	/* mainline BRE */
51#define	L_Q	3	/* REG_QUOTE */
52#define	L_EBND	4	/* ERE/ARE bound */
53#define	L_BBND	5	/* BRE bound */
54#define	L_BRACK	6	/* brackets */
55#define	L_CEL	7	/* collating element */
56#define	L_ECL	8	/* equivalence class */
57#define	L_CCL	9	/* character class */
58#define	INTOCON(c)	(v->lexcon = (c))
59#define	INCON(con)	(v->lexcon == (con))
60
61/* construct pointer past end of chr array */
62#define	ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
63
64/*
65 - lexstart - set up lexical stuff, scan leading options
66 ^ static VOID lexstart(struct vars *);
67 */
68static VOID
69lexstart(v)
70struct vars *v;
71{
72	prefixes(v);			/* may turn on new type bits etc. */
73	NOERR();
74
75	if (v->cflags&REG_QUOTE) {
76		assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
77		INTOCON(L_Q);
78	} else if (v->cflags&REG_EXTENDED) {
79		assert(!(v->cflags&REG_QUOTE));
80		INTOCON(L_ERE);
81	} else {
82		assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
83		INTOCON(L_BRE);
84	}
85
86	v->nexttype = EMPTY;		/* remember we were at the start */
87	next(v);			/* set up the first token */
88}
89
90/*
91 - prefixes - implement various special prefixes
92 ^ static VOID prefixes(struct vars *);
93 */
94static VOID
95prefixes(v)
96struct vars *v;
97{
98	/* literal string doesn't get any of this stuff */
99	if (v->cflags&REG_QUOTE)
100		return;
101
102	/* initial "***" gets special things */
103	if (HAVE(4) && NEXT3('*', '*', '*'))
104		switch (*(v->now + 3)) {
105		case CHR('?'):		/* "***?" error, msg shows version */
106			ERR(REG_BADPAT);
107			return;		/* proceed no further */
108			break;
109		case CHR('='):		/* "***=" shifts to literal string */
110			NOTE(REG_UNONPOSIX);
111			v->cflags |= REG_QUOTE;
112			v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
113			v->now += 4;
114			return;		/* and there can be no more prefixes */
115			break;
116		case CHR(':'):		/* "***:" shifts to AREs */
117			NOTE(REG_UNONPOSIX);
118			v->cflags |= REG_ADVANCED;
119			v->now += 4;
120			break;
121		default:		/* otherwise *** is just an error */
122			ERR(REG_BADRPT);
123			return;
124			break;
125		}
126
127	/* BREs and EREs don't get embedded options */
128	if ((v->cflags&REG_ADVANCED) != REG_ADVANCED)
129		return;
130
131	/* embedded options (AREs only) */
132	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
133		NOTE(REG_UNONPOSIX);
134		v->now += 2;
135		for (; !ATEOS() && iscalpha(*v->now); v->now++)
136			switch (*v->now) {
137			case CHR('b'):		/* BREs (but why???) */
138				v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
139				break;
140			case CHR('c'):		/* case sensitive */
141				v->cflags &= ~REG_ICASE;
142				break;
143			case CHR('e'):		/* plain EREs */
144				v->cflags |= REG_EXTENDED;
145				v->cflags &= ~(REG_ADVF|REG_QUOTE);
146				break;
147			case CHR('i'):		/* case insensitive */
148				v->cflags |= REG_ICASE;
149				break;
150			case CHR('m'):		/* Perloid synonym for n */
151			case CHR('n'):		/* \n affects ^ $ . [^ */
152				v->cflags |= REG_NEWLINE;
153				break;
154			case CHR('p'):		/* ~Perl, \n affects . [^ */
155				v->cflags |= REG_NLSTOP;
156				v->cflags &= ~REG_NLANCH;
157				break;
158			case CHR('q'):		/* literal string */
159				v->cflags |= REG_QUOTE;
160				v->cflags &= ~REG_ADVANCED;
161				break;
162			case CHR('s'):		/* single line, \n ordinary */
163				v->cflags &= ~REG_NEWLINE;
164				break;
165			case CHR('t'):		/* tight syntax */
166				v->cflags &= ~REG_EXPANDED;
167				break;
168			case CHR('w'):		/* weird, \n affects ^ $ only */
169				v->cflags &= ~REG_NLSTOP;
170				v->cflags |= REG_NLANCH;
171				break;
172			case CHR('x'):		/* expanded syntax */
173				v->cflags |= REG_EXPANDED;
174				break;
175			default:
176				ERR(REG_BADOPT);
177				return;
178			}
179		if (!NEXT1(')')) {
180			ERR(REG_BADOPT);
181			return;
182		}
183		v->now++;
184		if (v->cflags&REG_QUOTE)
185			v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
186	}
187}
188
189/*
190 - lexnest - "call a subroutine", interpolating string at the lexical level
191 * Note, this is not a very general facility.  There are a number of
192 * implicit assumptions about what sorts of strings can be subroutines.
193 ^ static VOID lexnest(struct vars *, chr *, chr *);
194 */
195static VOID
196lexnest(v, beginp, endp)
197struct vars *v;
198chr *beginp;				/* start of interpolation */
199chr *endp;				/* one past end of interpolation */
200{
201	assert(v->savenow == NULL);	/* only one level of nesting */
202	v->savenow = v->now;
203	v->savestop = v->stop;
204	v->now = beginp;
205	v->stop = endp;
206}
207
208/*
209 * string constants to interpolate as expansions of things like \d
210 */
211static chr backd[] = {		/* \d */
212	CHR('['), CHR('['), CHR(':'),
213	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
214	CHR(':'), CHR(']'), CHR(']')
215};
216static chr backD[] = {		/* \D */
217	CHR('['), CHR('^'), CHR('['), CHR(':'),
218	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
219	CHR(':'), CHR(']'), CHR(']')
220};
221static chr brbackd[] = {	/* \d within brackets */
222	CHR('['), CHR(':'),
223	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
224	CHR(':'), CHR(']')
225};
226static chr backs[] = {		/* \s */
227	CHR('['), CHR('['), CHR(':'),
228	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
229	CHR(':'), CHR(']'), CHR(']')
230};
231static chr backS[] = {		/* \S */
232	CHR('['), CHR('^'), CHR('['), CHR(':'),
233	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
234	CHR(':'), CHR(']'), CHR(']')
235};
236static chr brbacks[] = {	/* \s within brackets */
237	CHR('['), CHR(':'),
238	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
239	CHR(':'), CHR(']')
240};
241static chr backw[] = {		/* \w */
242	CHR('['), CHR('['), CHR(':'),
243	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
244	CHR(':'), CHR(']'), CHR('_'), CHR(']')
245};
246static chr backW[] = {		/* \W */
247	CHR('['), CHR('^'), CHR('['), CHR(':'),
248	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
249	CHR(':'), CHR(']'), CHR('_'), CHR(']')
250};
251static chr brbackw[] = {	/* \w within brackets */
252	CHR('['), CHR(':'),
253	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
254	CHR(':'), CHR(']'), CHR('_')
255};
256
257/*
258 - lexword - interpolate a bracket expression for word characters
259 * Possibly ought to inquire whether there is a "word" character class.
260 ^ static VOID lexword(struct vars *);
261 */
262static VOID
263lexword(v)
264struct vars *v;
265{
266	lexnest(v, backw, ENDOF(backw));
267}
268
269/*
270 - next - get next token
271 ^ static int next(struct vars *);
272 */
273static int			/* 1 normal, 0 failure */
274next(v)
275struct vars *v;
276{
277	chr c;
278
279	/* errors yield an infinite sequence of failures */
280	if (ISERR())
281		return 0;	/* the error has set nexttype to EOS */
282
283	/* remember flavor of last token */
284	v->lasttype = v->nexttype;
285
286	/* REG_BOSONLY */
287	if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
288		/* at start of a REG_BOSONLY RE */
289		RETV(SBEGIN, 0);		/* same as \A */
290	}
291
292	/* if we're nested and we've hit end, return to outer level */
293	if (v->savenow != NULL && ATEOS()) {
294		v->now = v->savenow;
295		v->stop = v->savestop;
296		v->savenow = v->savestop = NULL;
297	}
298
299	/* skip white space etc. if appropriate (not in literal or []) */
300	if (v->cflags&REG_EXPANDED)
301		switch (v->lexcon) {
302		case L_ERE:
303		case L_BRE:
304		case L_EBND:
305		case L_BBND:
306			skip(v);
307			break;
308		}
309
310	/* handle EOS, depending on context */
311	if (ATEOS()) {
312		switch (v->lexcon) {
313		case L_ERE:
314		case L_BRE:
315		case L_Q:
316			RET(EOS);
317			break;
318		case L_EBND:
319		case L_BBND:
320			FAILW(REG_EBRACE);
321			break;
322		case L_BRACK:
323		case L_CEL:
324		case L_ECL:
325		case L_CCL:
326			FAILW(REG_EBRACK);
327			break;
328		}
329		assert(NOTREACHED);
330	}
331
332	/* okay, time to actually get a character */
333	c = *v->now++;
334
335	/* deal with the easy contexts, punt EREs to code below */
336	switch (v->lexcon) {
337	case L_BRE:			/* punt BREs to separate function */
338		return brenext(v, c);
339		break;
340	case L_ERE:			/* see below */
341		break;
342	case L_Q:			/* literal strings are easy */
343		RETV(PLAIN, c);
344		break;
345	case L_BBND:			/* bounds are fairly simple */
346	case L_EBND:
347		switch (c) {
348		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
349		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
350		case CHR('8'): case CHR('9'):
351			RETV(DIGIT, (chr)DIGITVAL(c));
352			break;
353		case CHR(','):
354			RET(',');
355			break;
356		case CHR('}'):		/* ERE bound ends with } */
357			if (INCON(L_EBND)) {
358				INTOCON(L_ERE);
359				if ((v->cflags&REG_ADVF) && NEXT1('?')) {
360					v->now++;
361					NOTE(REG_UNONPOSIX);
362					RETV('}', 0);
363				}
364				RETV('}', 1);
365			} else
366				FAILW(REG_BADBR);
367			break;
368		case CHR('\\'):		/* BRE bound ends with \} */
369			if (INCON(L_BBND) && NEXT1('}')) {
370				v->now++;
371				INTOCON(L_BRE);
372				RET('}');
373			} else
374				FAILW(REG_BADBR);
375			break;
376		default:
377			FAILW(REG_BADBR);
378			break;
379		}
380		assert(NOTREACHED);
381		break;
382	case L_BRACK:			/* brackets are not too hard */
383		switch (c) {
384		case CHR(']'):
385			if (LASTTYPE('['))
386				RETV(PLAIN, c);
387			else {
388				INTOCON((v->cflags&REG_EXTENDED) ?
389							L_ERE : L_BRE);
390				RET(']');
391			}
392			break;
393		case CHR('\\'):
394			NOTE(REG_UBBS);
395			if (!(v->cflags&REG_ADVF))
396				RETV(PLAIN, c);
397			NOTE(REG_UNONPOSIX);
398			if (ATEOS())
399				FAILW(REG_EESCAPE);
400			(DISCARD)lexescape(v);
401			switch (v->nexttype) {	/* not all escapes okay here */
402			case PLAIN:
403				return 1;
404				break;
405			case CCLASS:
406				switch (v->nextvalue) {
407				case 'd':
408					lexnest(v, brbackd, ENDOF(brbackd));
409					break;
410				case 's':
411					lexnest(v, brbacks, ENDOF(brbacks));
412					break;
413				case 'w':
414					lexnest(v, brbackw, ENDOF(brbackw));
415					break;
416				default:
417					FAILW(REG_EESCAPE);
418					break;
419				}
420				/* lexnest done, back up and try again */
421				v->nexttype = v->lasttype;
422				return next(v);
423				break;
424			}
425			/* not one of the acceptable escapes */
426			FAILW(REG_EESCAPE);
427			break;
428		case CHR('-'):
429			if (LASTTYPE('[') || NEXT1(']'))
430				RETV(PLAIN, c);
431			else
432				RETV(RANGE, c);
433			break;
434		case CHR('['):
435			if (ATEOS())
436				FAILW(REG_EBRACK);
437			switch (*v->now++) {
438			case CHR('.'):
439				INTOCON(L_CEL);
440				/* might or might not be locale-specific */
441				RET(COLLEL);
442				break;
443			case CHR('='):
444				INTOCON(L_ECL);
445				NOTE(REG_ULOCALE);
446				RET(ECLASS);
447				break;
448			case CHR(':'):
449				INTOCON(L_CCL);
450				NOTE(REG_ULOCALE);
451				RET(CCLASS);
452				break;
453			default:			/* oops */
454				v->now--;
455				RETV(PLAIN, c);
456				break;
457			}
458			assert(NOTREACHED);
459			break;
460		default:
461			RETV(PLAIN, c);
462			break;
463		}
464		assert(NOTREACHED);
465		break;
466	case L_CEL:			/* collating elements are easy */
467		if (c == CHR('.') && NEXT1(']')) {
468			v->now++;
469			INTOCON(L_BRACK);
470			RETV(END, '.');
471		} else
472			RETV(PLAIN, c);
473		break;
474	case L_ECL:			/* ditto equivalence classes */
475		if (c == CHR('=') && NEXT1(']')) {
476			v->now++;
477			INTOCON(L_BRACK);
478			RETV(END, '=');
479		} else
480			RETV(PLAIN, c);
481		break;
482	case L_CCL:			/* ditto character classes */
483		if (c == CHR(':') && NEXT1(']')) {
484			v->now++;
485			INTOCON(L_BRACK);
486			RETV(END, ':');
487		} else
488			RETV(PLAIN, c);
489		break;
490	default:
491		assert(NOTREACHED);
492		break;
493	}
494
495	/* that got rid of everything except EREs and AREs */
496	assert(INCON(L_ERE));
497
498	/* deal with EREs and AREs, except for backslashes */
499	switch (c) {
500	case CHR('|'):
501		RET('|');
502		break;
503	case CHR('*'):
504		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
505			v->now++;
506			NOTE(REG_UNONPOSIX);
507			RETV('*', 0);
508		}
509		RETV('*', 1);
510		break;
511	case CHR('+'):
512		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
513			v->now++;
514			NOTE(REG_UNONPOSIX);
515			RETV('+', 0);
516		}
517		RETV('+', 1);
518		break;
519	case CHR('?'):
520		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
521			v->now++;
522			NOTE(REG_UNONPOSIX);
523			RETV('?', 0);
524		}
525		RETV('?', 1);
526		break;
527	case CHR('{'):		/* bounds start or plain character */
528		if (v->cflags&REG_EXPANDED)
529			skip(v);
530		if (ATEOS() || !iscdigit(*v->now)) {
531			NOTE(REG_UBRACES);
532			NOTE(REG_UUNSPEC);
533			RETV(PLAIN, c);
534		} else {
535			NOTE(REG_UBOUNDS);
536			INTOCON(L_EBND);
537			RET('{');
538		}
539		assert(NOTREACHED);
540		break;
541	case CHR('('):		/* parenthesis, or advanced extension */
542		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
543			NOTE(REG_UNONPOSIX);
544			v->now++;
545			switch (*v->now++) {
546			case CHR(':'):		/* non-capturing paren */
547				RETV('(', 0);
548				break;
549			case CHR('#'):		/* comment */
550				while (!ATEOS() && *v->now != CHR(')'))
551					v->now++;
552				if (!ATEOS())
553					v->now++;
554				assert(v->nexttype == v->lasttype);
555				return next(v);
556				break;
557			case CHR('='):		/* positive lookahead */
558				NOTE(REG_ULOOKAHEAD);
559				RETV(LACON, 1);
560				break;
561			case CHR('!'):		/* negative lookahead */
562				NOTE(REG_ULOOKAHEAD);
563				RETV(LACON, 0);
564				break;
565			default:
566				FAILW(REG_BADRPT);
567				break;
568			}
569			assert(NOTREACHED);
570		}
571		if (v->cflags&REG_NOSUB)
572			RETV('(', 0);		/* all parens non-capturing */
573		else
574			RETV('(', 1);
575		break;
576	case CHR(')'):
577		if (LASTTYPE('(')) {
578			NOTE(REG_UUNSPEC);
579		}
580		RETV(')', c);
581		break;
582	case CHR('['):		/* easy except for [[:<:]] and [[:>:]] */
583		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
584				*(v->now+1) == CHR(':') &&
585				(*(v->now+2) == CHR('<') ||
586						*(v->now+2) == CHR('>')) &&
587				*(v->now+3) == CHR(':') &&
588				*(v->now+4) == CHR(']') &&
589				*(v->now+5) == CHR(']')) {
590			c = *(v->now+2);
591			v->now += 6;
592			NOTE(REG_UNONPOSIX);
593			RET((c == CHR('<')) ? '<' : '>');
594		}
595		INTOCON(L_BRACK);
596		if (NEXT1('^')) {
597			v->now++;
598			RETV('[', 0);
599		}
600		RETV('[', 1);
601		break;
602	case CHR('.'):
603		RET('.');
604		break;
605	case CHR('^'):
606		RET('^');
607		break;
608	case CHR('$'):
609		RET('$');
610		break;
611	case CHR('\\'):		/* mostly punt backslashes to code below */
612		if (ATEOS())
613			FAILW(REG_EESCAPE);
614		break;
615	default:		/* ordinary character */
616		RETV(PLAIN, c);
617		break;
618	}
619
620	/* ERE/ARE backslash handling; backslash already eaten */
621	assert(!ATEOS());
622	if (!(v->cflags&REG_ADVF)) {	/* only AREs have non-trivial escapes */
623		if (iscalnum(*v->now)) {
624			NOTE(REG_UBSALNUM);
625			NOTE(REG_UUNSPEC);
626		}
627		RETV(PLAIN, *v->now++);
628	}
629	(DISCARD)lexescape(v);
630	if (ISERR())
631		FAILW(REG_EESCAPE);
632	if (v->nexttype == CCLASS) {	/* fudge at lexical level */
633		switch (v->nextvalue) {
634		case 'd':	lexnest(v, backd, ENDOF(backd)); break;
635		case 'D':	lexnest(v, backD, ENDOF(backD)); break;
636		case 's':	lexnest(v, backs, ENDOF(backs)); break;
637		case 'S':	lexnest(v, backS, ENDOF(backS)); break;
638		case 'w':	lexnest(v, backw, ENDOF(backw)); break;
639		case 'W':	lexnest(v, backW, ENDOF(backW)); break;
640		default:
641			assert(NOTREACHED);
642			FAILW(REG_ASSERT);
643			break;
644		}
645		/* lexnest done, back up and try again */
646		v->nexttype = v->lasttype;
647		return next(v);
648	}
649	/* otherwise, lexescape has already done the work */
650	return !ISERR();
651}
652
653/*
654 - lexescape - parse an ARE backslash escape (backslash already eaten)
655 * Note slightly nonstandard use of the CCLASS type code.
656 ^ static int lexescape(struct vars *);
657 */
658static int			/* not actually used, but convenient for RETV */
659lexescape(v)
660struct vars *v;
661{
662	chr c;
663	static chr alert[] = {
664		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
665	};
666	static chr esc[] = {
667		CHR('E'), CHR('S'), CHR('C')
668	};
669	chr *save;
670
671	assert(v->cflags&REG_ADVF);
672
673	assert(!ATEOS());
674	c = *v->now++;
675	if (!iscalnum(c))
676		RETV(PLAIN, c);
677
678	NOTE(REG_UNONPOSIX);
679	switch (c) {
680	case CHR('a'):
681		RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
682		break;
683	case CHR('A'):
684		RETV(SBEGIN, 0);
685		break;
686	case CHR('b'):
687		RETV(PLAIN, CHR('\b'));
688		break;
689	case CHR('B'):
690		RETV(PLAIN, CHR('\\'));
691		break;
692	case CHR('c'):
693		NOTE(REG_UUNPORT);
694		if (ATEOS())
695			FAILW(REG_EESCAPE);
696		RETV(PLAIN, (chr)(*v->now++ & 037));
697		break;
698	case CHR('d'):
699		NOTE(REG_ULOCALE);
700		RETV(CCLASS, 'd');
701		break;
702	case CHR('D'):
703		NOTE(REG_ULOCALE);
704		RETV(CCLASS, 'D');
705		break;
706	case CHR('e'):
707		NOTE(REG_UUNPORT);
708		RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
709		break;
710	case CHR('f'):
711		RETV(PLAIN, CHR('\f'));
712		break;
713	case CHR('m'):
714		RET('<');
715		break;
716	case CHR('M'):
717		RET('>');
718		break;
719	case CHR('n'):
720		RETV(PLAIN, CHR('\n'));
721		break;
722	case CHR('r'):
723		RETV(PLAIN, CHR('\r'));
724		break;
725	case CHR('s'):
726		NOTE(REG_ULOCALE);
727		RETV(CCLASS, 's');
728		break;
729	case CHR('S'):
730		NOTE(REG_ULOCALE);
731		RETV(CCLASS, 'S');
732		break;
733	case CHR('t'):
734		RETV(PLAIN, CHR('\t'));
735		break;
736	case CHR('u'):
737		c = lexdigits(v, 16, 4, 4);
738		if (ISERR())
739			FAILW(REG_EESCAPE);
740		RETV(PLAIN, c);
741		break;
742	case CHR('U'):
743		c = lexdigits(v, 16, 8, 8);
744		if (ISERR())
745			FAILW(REG_EESCAPE);
746		RETV(PLAIN, c);
747		break;
748	case CHR('v'):
749		RETV(PLAIN, CHR('\v'));
750		break;
751	case CHR('w'):
752		NOTE(REG_ULOCALE);
753		RETV(CCLASS, 'w');
754		break;
755	case CHR('W'):
756		NOTE(REG_ULOCALE);
757		RETV(CCLASS, 'W');
758		break;
759	case CHR('x'):
760		NOTE(REG_UUNPORT);
761		c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
762		if (ISERR())
763			FAILW(REG_EESCAPE);
764		RETV(PLAIN, c);
765		break;
766	case CHR('y'):
767		NOTE(REG_ULOCALE);
768		RETV(WBDRY, 0);
769		break;
770	case CHR('Y'):
771		NOTE(REG_ULOCALE);
772		RETV(NWBDRY, 0);
773		break;
774	case CHR('Z'):
775		RETV(SEND, 0);
776		break;
777	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
778	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
779	case CHR('9'):
780		save = v->now;
781		v->now--;	/* put first digit back */
782		c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
783		if (ISERR())
784			FAILW(REG_EESCAPE);
785		/* ugly heuristic (first test is "exactly 1 digit?") */
786		if (v->now-save == 0 || ((int)c > 0 && (int)c <= v->nsubexp)) {
787			NOTE(REG_UBACKREF);
788			RETV(BACKREF, (chr)c);
789		}
790		/* oops, doesn't look like it's a backref after all... */
791		v->now = save;
792		/* and fall through into octal number */
793	case CHR('0'):
794		NOTE(REG_UUNPORT);
795		v->now--;	/* put first digit back */
796		c = lexdigits(v, 8, 1, 3);
797		if (ISERR())
798			FAILW(REG_EESCAPE);
799		RETV(PLAIN, c);
800		break;
801	default:
802		assert(iscalpha(c));
803		FAILW(REG_EESCAPE);	/* unknown alphabetic escape */
804		break;
805	}
806	assert(NOTREACHED);
807}
808
809/*
810 - lexdigits - slurp up digits and return chr value
811 ^ static chr lexdigits(struct vars *, int, int, int);
812 */
813static chr			/* chr value; errors signalled via ERR */
814lexdigits(v, base, minlen, maxlen)
815struct vars *v;
816int base;
817int minlen;
818int maxlen;
819{
820	uchr n;			/* unsigned to avoid overflow misbehavior */
821	int len;
822	chr c;
823	int d;
824	CONST uchr ub = (uchr) base;
825
826	n = 0;
827	for (len = 0; len < maxlen && !ATEOS(); len++) {
828		c = *v->now++;
829		switch (c) {
830		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
831		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
832		case CHR('8'): case CHR('9'):
833			d = DIGITVAL(c);
834			break;
835		case CHR('a'): case CHR('A'): d = 10; break;
836		case CHR('b'): case CHR('B'): d = 11; break;
837		case CHR('c'): case CHR('C'): d = 12; break;
838		case CHR('d'): case CHR('D'): d = 13; break;
839		case CHR('e'): case CHR('E'): d = 14; break;
840		case CHR('f'): case CHR('F'): d = 15; break;
841		default:
842			v->now--;	/* oops, not a digit at all */
843			d = -1;
844			break;
845		}
846
847		if (d >= base) {	/* not a plausible digit */
848			v->now--;
849			d = -1;
850		}
851		if (d < 0)
852			break;		/* NOTE BREAK OUT */
853		n = n*ub + (uchr)d;
854	}
855	if (len < minlen)
856		ERR(REG_EESCAPE);
857
858	return (chr)n;
859}
860
861/*
862 - brenext - get next BRE token
863 * This is much like EREs except for all the stupid backslashes and the
864 * context-dependency of some things.
865 ^ static int brenext(struct vars *, pchr);
866 */
867static int			/* 1 normal, 0 failure */
868brenext(v, pc)
869struct vars *v;
870pchr pc;
871{
872	chr c = (chr)pc;
873
874	switch (c) {
875	case CHR('*'):
876		if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
877			RETV(PLAIN, c);
878		RET('*');
879		break;
880	case CHR('['):
881		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
882				*(v->now+1) == CHR(':') &&
883				(*(v->now+2) == CHR('<') ||
884						*(v->now+2) == CHR('>')) &&
885				*(v->now+3) == CHR(':') &&
886				*(v->now+4) == CHR(']') &&
887				*(v->now+5) == CHR(']')) {
888			c = *(v->now+2);
889			v->now += 6;
890			NOTE(REG_UNONPOSIX);
891			RET((c == CHR('<')) ? '<' : '>');
892		}
893		INTOCON(L_BRACK);
894		if (NEXT1('^')) {
895			v->now++;
896			RETV('[', 0);
897		}
898		RETV('[', 1);
899		break;
900	case CHR('.'):
901		RET('.');
902		break;
903	case CHR('^'):
904		if (LASTTYPE(EMPTY))
905			RET('^');
906		if (LASTTYPE('(')) {
907			NOTE(REG_UUNSPEC);
908			RET('^');
909		}
910		RETV(PLAIN, c);
911		break;
912	case CHR('$'):
913		if (v->cflags&REG_EXPANDED)
914			skip(v);
915		if (ATEOS())
916			RET('$');
917		if (NEXT2('\\', ')')) {
918			NOTE(REG_UUNSPEC);
919			RET('$');
920		}
921		RETV(PLAIN, c);
922		break;
923	case CHR('\\'):
924		break;		/* see below */
925	default:
926		RETV(PLAIN, c);
927		break;
928	}
929
930	assert(c == CHR('\\'));
931
932	if (ATEOS())
933		FAILW(REG_EESCAPE);
934
935	c = *v->now++;
936	switch (c) {
937	case CHR('{'):
938		INTOCON(L_BBND);
939		NOTE(REG_UBOUNDS);
940		RET('{');
941		break;
942	case CHR('('):
943		RETV('(', 1);
944		break;
945	case CHR(')'):
946		RETV(')', c);
947		break;
948	case CHR('<'):
949		NOTE(REG_UNONPOSIX);
950		RET('<');
951		break;
952	case CHR('>'):
953		NOTE(REG_UNONPOSIX);
954		RET('>');
955		break;
956	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
957	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
958	case CHR('9'):
959		NOTE(REG_UBACKREF);
960		RETV(BACKREF, (chr)DIGITVAL(c));
961		break;
962	default:
963		if (iscalnum(c)) {
964			NOTE(REG_UBSALNUM);
965			NOTE(REG_UUNSPEC);
966		}
967		RETV(PLAIN, c);
968		break;
969	}
970
971	assert(NOTREACHED);
972}
973
974/*
975 - skip - skip white space and comments in expanded form
976 ^ static VOID skip(struct vars *);
977 */
978static VOID
979skip(v)
980struct vars *v;
981{
982	chr *start = v->now;
983
984	assert(v->cflags&REG_EXPANDED);
985
986	for (;;) {
987		while (!ATEOS() && iscspace(*v->now))
988			v->now++;
989		if (ATEOS() || *v->now != CHR('#'))
990			break;				/* NOTE BREAK OUT */
991		assert(NEXT1('#'));
992		while (!ATEOS() && *v->now != CHR('\n'))
993			v->now++;
994		/* leave the newline to be picked up by the iscspace loop */
995	}
996
997	if (v->now != start)
998		NOTE(REG_UNONPOSIX);
999}
1000
1001/*
1002 - newline - return the chr for a newline
1003 * This helps confine use of CHR to this source file.
1004 ^ static chr newline(NOPARMS);
1005 */
1006static chr
1007newline()
1008{
1009	return CHR('\n');
1010}
1011
1012/*
1013 - ch - return the chr sequence for regc_locale.c's fake collating element ch
1014 * This helps confine use of CHR to this source file.  Beware that the caller
1015 * knows how long the sequence is.
1016 ^ #ifdef REG_DEBUG
1017 ^ static chr *ch(NOPARMS);
1018 ^ #endif
1019 */
1020#ifdef REG_DEBUG
1021static chr *
1022ch()
1023{
1024	static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };
1025
1026	return chstr;
1027}
1028#endif
1029
1030/*
1031 - chrnamed - return the chr known by a given (chr string) name
1032 * The code is a bit clumsy, but this routine gets only such specialized
1033 * use that it hardly matters.
1034 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
1035 */
1036static chr
1037chrnamed(v, startp, endp, lastresort)
1038struct vars *v;
1039chr *startp;			/* start of name */
1040chr *endp;			/* just past end of name */
1041pchr lastresort;		/* what to return if name lookup fails */
1042{
1043	celt c;
1044	int errsave;
1045	int e;
1046	struct cvec *cv;
1047
1048	errsave = v->err;
1049	v->err = 0;
1050	c = element(v, startp, endp);
1051	e = v->err;
1052	v->err = errsave;
1053
1054	if (e != 0)
1055		return (chr)lastresort;
1056
1057	cv = range(v, c, c, 0);
1058	if (cv->nchrs == 0)
1059		return (chr)lastresort;
1060	return cv->chrs[0];
1061}
1062