1/*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results.  The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation of
18 * software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/* scanning macros (know about v) */
33#define	ATEOS()		(v->now >= v->stop)
34#define	HAVE(n)		(v->stop - v->now >= (n))
35#define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
36#define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
37#define	NEXT3(a,b,c) \
38	(HAVE(3) && *v->now == CHR(a) && \
39		*(v->now+1) == CHR(b) && \
40		*(v->now+2) == CHR(c))
41#define	SET(c)		(v->nexttype = (c))
42#define	SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
43#define	RET(c)		return (SET(c), 1)
44#define	RETV(c, n)	return (SETV(c, n), 1)
45#define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
46#define	LASTTYPE(t)	(v->lasttype == (t))
47
48/* lexical contexts */
49#define	L_ERE	1	/* mainline ERE/ARE */
50#define	L_BRE	2	/* mainline BRE */
51#define	L_Q	3	/* REG_QUOTE */
52#define	L_EBND	4	/* ERE/ARE bound */
53#define	L_BBND	5	/* BRE bound */
54#define	L_BRACK	6	/* brackets */
55#define	L_CEL	7	/* collating element */
56#define	L_ECL	8	/* equivalence class */
57#define	L_CCL	9	/* character class */
58#define	INTOCON(c)	(v->lexcon = (c))
59#define	INCON(con)	(v->lexcon == (con))
60
61/* construct pointer past end of chr array */
62#define	ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
63
64/*
65 - lexstart - set up lexical stuff, scan leading options
66 ^ static VOID lexstart(struct vars *);
67 */
68static void
69lexstart(
70    struct vars *v)
71{
72    prefixes(v);		/* may turn on new type bits etc. */
73    NOERR();
74
75    if (v->cflags&REG_QUOTE) {
76	assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
77	INTOCON(L_Q);
78    } else if (v->cflags&REG_EXTENDED) {
79	assert(!(v->cflags&REG_QUOTE));
80	INTOCON(L_ERE);
81    } else {
82	assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
83	INTOCON(L_BRE);
84    }
85
86    v->nexttype = EMPTY;	/* remember we were at the start */
87    next(v);			/* set up the first token */
88}
89
90/*
91 - prefixes - implement various special prefixes
92 ^ static VOID prefixes(struct vars *);
93 */
94static void
95prefixes(
96    struct vars *v)
97{
98    /*
99     * Literal string doesn't get any of this stuff.
100     */
101
102    if (v->cflags&REG_QUOTE) {
103	return;
104    }
105
106    /*
107     * Initial "***" gets special things.
108     */
109
110    if (HAVE(4) && NEXT3('*', '*', '*')) {
111	switch (*(v->now + 3)) {
112	case CHR('?'):		/* "***?" error, msg shows version */
113	    ERR(REG_BADPAT);
114	    return;		/* proceed no further */
115	    break;
116	case CHR('='):		/* "***=" shifts to literal string */
117	    NOTE(REG_UNONPOSIX);
118	    v->cflags |= REG_QUOTE;
119	    v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
120	    v->now += 4;
121	    return;		/* and there can be no more prefixes */
122	    break;
123	case CHR(':'):		/* "***:" shifts to AREs */
124	    NOTE(REG_UNONPOSIX);
125	    v->cflags |= REG_ADVANCED;
126	    v->now += 4;
127	    break;
128	default:		/* otherwise *** is just an error */
129	    ERR(REG_BADRPT);
130	    return;
131	    break;
132	}
133    }
134
135    /*
136     * BREs and EREs don't get embedded options.
137     */
138
139    if ((v->cflags&REG_ADVANCED) != REG_ADVANCED) {
140	return;
141    }
142
143    /*
144     * Embedded options (AREs only).
145     */
146
147    if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
148	NOTE(REG_UNONPOSIX);
149	v->now += 2;
150	for (; !ATEOS() && iscalpha(*v->now); v->now++) {
151	    switch (*v->now) {
152	    case CHR('b'):	/* BREs (but why???) */
153		v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
154		break;
155	    case CHR('c'):	/* case sensitive */
156		v->cflags &= ~REG_ICASE;
157		break;
158	    case CHR('e'):	/* plain EREs */
159		v->cflags |= REG_EXTENDED;
160		v->cflags &= ~(REG_ADVF|REG_QUOTE);
161		break;
162	    case CHR('i'):	/* case insensitive */
163		v->cflags |= REG_ICASE;
164		break;
165	    case CHR('m'):	/* Perloid synonym for n */
166	    case CHR('n'):	/* \n affects ^ $ . [^ */
167		v->cflags |= REG_NEWLINE;
168		break;
169	    case CHR('p'):	/* ~Perl, \n affects . [^ */
170		v->cflags |= REG_NLSTOP;
171		v->cflags &= ~REG_NLANCH;
172		break;
173	    case CHR('q'):	/* literal string */
174		v->cflags |= REG_QUOTE;
175		v->cflags &= ~REG_ADVANCED;
176		break;
177	    case CHR('s'):	/* single line, \n ordinary */
178		v->cflags &= ~REG_NEWLINE;
179		break;
180	    case CHR('t'):	/* tight syntax */
181		v->cflags &= ~REG_EXPANDED;
182		break;
183	    case CHR('w'):	/* weird, \n affects ^ $ only */
184		v->cflags &= ~REG_NLSTOP;
185		v->cflags |= REG_NLANCH;
186		break;
187	    case CHR('x'):	/* expanded syntax */
188		v->cflags |= REG_EXPANDED;
189		break;
190	    default:
191		ERR(REG_BADOPT);
192		return;
193	    }
194	}
195	if (!NEXT1(')')) {
196	    ERR(REG_BADOPT);
197	    return;
198	}
199	v->now++;
200	if (v->cflags&REG_QUOTE) {
201	    v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
202	}
203    }
204}
205
206/*
207 - lexnest - "call a subroutine", interpolating string at the lexical level
208 * Note, this is not a very general facility.  There are a number of
209 * implicit assumptions about what sorts of strings can be subroutines.
210 ^ static VOID lexnest(struct vars *, const chr *, const chr *);
211 */
212static void
213lexnest(
214    struct vars *v,
215    const chr *beginp,		/* start of interpolation */
216    const chr *endp)		/* one past end of interpolation */
217{
218    assert(v->savenow == NULL);	/* only one level of nesting */
219    v->savenow = v->now;
220    v->savestop = v->stop;
221    v->now = beginp;
222    v->stop = endp;
223}
224
225/*
226 * string constants to interpolate as expansions of things like \d
227 */
228
229static const chr backd[] = {	/* \d */
230    CHR('['), CHR('['), CHR(':'),
231    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
232    CHR(':'), CHR(']'), CHR(']')
233};
234static const chr backD[] = {	/* \D */
235    CHR('['), CHR('^'), CHR('['), CHR(':'),
236    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
237    CHR(':'), CHR(']'), CHR(']')
238};
239static const chr brbackd[] = {	/* \d within brackets */
240    CHR('['), CHR(':'),
241    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
242    CHR(':'), CHR(']')
243};
244static const chr backs[] = {	/* \s */
245    CHR('['), CHR('['), CHR(':'),
246    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
247    CHR(':'), CHR(']'), CHR(']')
248};
249static const chr backS[] = {	/* \S */
250    CHR('['), CHR('^'), CHR('['), CHR(':'),
251    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
252    CHR(':'), CHR(']'), CHR(']')
253};
254static const chr brbacks[] = {	/* \s within brackets */
255    CHR('['), CHR(':'),
256    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
257    CHR(':'), CHR(']')
258};
259static const chr backw[] = {	/* \w */
260    CHR('['), CHR('['), CHR(':'),
261    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
262    CHR(':'), CHR(']'), CHR('_'), CHR(']')
263};
264static const chr backW[] = {	/* \W */
265    CHR('['), CHR('^'), CHR('['), CHR(':'),
266    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
267    CHR(':'), CHR(']'), CHR('_'), CHR(']')
268};
269static const chr brbackw[] = {	/* \w within brackets */
270    CHR('['), CHR(':'),
271    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
272    CHR(':'), CHR(']'), CHR('_')
273};
274
275/*
276 - lexword - interpolate a bracket expression for word characters
277 * Possibly ought to inquire whether there is a "word" character class.
278 ^ static VOID lexword(struct vars *);
279 */
280static void
281lexword(
282    struct vars *v)
283{
284    lexnest(v, backw, ENDOF(backw));
285}
286
287/*
288 - next - get next token
289 ^ static int next(struct vars *);
290 */
291static int			/* 1 normal, 0 failure */
292next(
293    struct vars *v)
294{
295    chr c;
296
297    /*
298     * Errors yield an infinite sequence of failures.
299     */
300
301    if (ISERR()) {
302	return 0;		/* the error has set nexttype to EOS */
303    }
304
305    /*
306     * Remember flavor of last token.
307     */
308
309    v->lasttype = v->nexttype;
310
311    /*
312     * REG_BOSONLY
313     */
314
315    if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
316	/* at start of a REG_BOSONLY RE */
317	RETV(SBEGIN, 0);	/* same as \A */
318    }
319
320    /*
321     * If we're nested and we've hit end, return to outer level.
322     */
323
324    if (v->savenow != NULL && ATEOS()) {
325	v->now = v->savenow;
326	v->stop = v->savestop;
327	v->savenow = v->savestop = NULL;
328    }
329
330    /*
331     * Skip white space etc. if appropriate (not in literal or [])
332     */
333
334    if (v->cflags&REG_EXPANDED) {
335	switch (v->lexcon) {
336	case L_ERE:
337	case L_BRE:
338	case L_EBND:
339	case L_BBND:
340	    skip(v);
341	    break;
342	}
343    }
344
345    /*
346     * Handle EOS, depending on context.
347     */
348
349    if (ATEOS()) {
350	switch (v->lexcon) {
351	case L_ERE:
352	case L_BRE:
353	case L_Q:
354	    RET(EOS);
355	    break;
356	case L_EBND:
357	case L_BBND:
358	    FAILW(REG_EBRACE);
359	    break;
360	case L_BRACK:
361	case L_CEL:
362	case L_ECL:
363	case L_CCL:
364	    FAILW(REG_EBRACK);
365	    break;
366	}
367	assert(NOTREACHED);
368    }
369
370    /*
371     * Okay, time to actually get a character.
372     */
373
374    c = *v->now++;
375
376    /*
377     * Deal with the easy contexts, punt EREs to code below.
378     */
379
380    switch (v->lexcon) {
381    case L_BRE:			/* punt BREs to separate function */
382	return brenext(v, c);
383	break;
384    case L_ERE:			/* see below */
385	break;
386    case L_Q:			/* literal strings are easy */
387	RETV(PLAIN, c);
388	break;
389    case L_BBND:		/* bounds are fairly simple */
390    case L_EBND:
391	switch (c) {
392	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
393	case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
394	case CHR('8'): case CHR('9'):
395	    RETV(DIGIT, (chr)DIGITVAL(c));
396	    break;
397	case CHR(','):
398	    RET(',');
399	    break;
400	case CHR('}'):		/* ERE bound ends with } */
401	    if (INCON(L_EBND)) {
402		INTOCON(L_ERE);
403		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
404		    v->now++;
405		    NOTE(REG_UNONPOSIX);
406		    RETV('}', 0);
407		}
408		RETV('}', 1);
409	    } else {
410		FAILW(REG_BADBR);
411	    }
412	    break;
413	case CHR('\\'):		/* BRE bound ends with \} */
414	    if (INCON(L_BBND) && NEXT1('}')) {
415		v->now++;
416		INTOCON(L_BRE);
417		RET('}');
418	    } else {
419		FAILW(REG_BADBR);
420	    }
421	    break;
422	default:
423	    FAILW(REG_BADBR);
424	    break;
425	}
426	assert(NOTREACHED);
427	break;
428    case L_BRACK:		/* brackets are not too hard */
429	switch (c) {
430	case CHR(']'):
431	    if (LASTTYPE('[')) {
432		RETV(PLAIN, c);
433	    } else {
434		INTOCON((v->cflags&REG_EXTENDED) ? L_ERE : L_BRE);
435		RET(']');
436	    }
437	    break;
438	case CHR('\\'):
439	    NOTE(REG_UBBS);
440	    if (!(v->cflags&REG_ADVF)) {
441		RETV(PLAIN, c);
442	    }
443	    NOTE(REG_UNONPOSIX);
444	    if (ATEOS()) {
445		FAILW(REG_EESCAPE);
446	    }
447	    (DISCARD)lexescape(v);
448	    switch (v->nexttype) {	/* not all escapes okay here */
449	    case PLAIN:
450		return 1;
451		break;
452	    case CCLASS:
453		switch (v->nextvalue) {
454		case 'd':
455		    lexnest(v, brbackd, ENDOF(brbackd));
456		    break;
457		case 's':
458		    lexnest(v, brbacks, ENDOF(brbacks));
459		    break;
460		case 'w':
461		    lexnest(v, brbackw, ENDOF(brbackw));
462		    break;
463		default:
464		    FAILW(REG_EESCAPE);
465		    break;
466		}
467
468		/*
469		 * lexnest() done, back up and try again.
470		 */
471
472		v->nexttype = v->lasttype;
473		return next(v);
474		break;
475	    }
476
477	    /*
478	     * Not one of the acceptable escapes.
479	     */
480
481	    FAILW(REG_EESCAPE);
482	    break;
483	case CHR('-'):
484	    if (LASTTYPE('[') || NEXT1(']')) {
485		RETV(PLAIN, c);
486	    } else {
487		RETV(RANGE, c);
488	    }
489	    break;
490	case CHR('['):
491	    if (ATEOS()) {
492		FAILW(REG_EBRACK);
493	    }
494	    switch (*v->now++) {
495	    case CHR('.'):
496		INTOCON(L_CEL);
497
498		/*
499		 * Might or might not be locale-specific.
500		 */
501
502		RET(COLLEL);
503		break;
504	    case CHR('='):
505		INTOCON(L_ECL);
506		NOTE(REG_ULOCALE);
507		RET(ECLASS);
508		break;
509	    case CHR(':'):
510		INTOCON(L_CCL);
511		NOTE(REG_ULOCALE);
512		RET(CCLASS);
513		break;
514	    default:		/* oops */
515		v->now--;
516		RETV(PLAIN, c);
517		break;
518	    }
519	    assert(NOTREACHED);
520	    break;
521	default:
522	    RETV(PLAIN, c);
523	    break;
524	}
525	assert(NOTREACHED);
526	break;
527    case L_CEL:			/* collating elements are easy */
528	if (c == CHR('.') && NEXT1(']')) {
529	    v->now++;
530	    INTOCON(L_BRACK);
531	    RETV(END, '.');
532	} else {
533	    RETV(PLAIN, c);
534	}
535	break;
536    case L_ECL:			/* ditto equivalence classes */
537	if (c == CHR('=') && NEXT1(']')) {
538	    v->now++;
539	    INTOCON(L_BRACK);
540	    RETV(END, '=');
541	} else {
542	    RETV(PLAIN, c);
543	}
544	break;
545    case L_CCL:			/* ditto character classes */
546	if (c == CHR(':') && NEXT1(']')) {
547	    v->now++;
548	    INTOCON(L_BRACK);
549	    RETV(END, ':');
550	} else {
551	    RETV(PLAIN, c);
552	}
553	break;
554    default:
555	assert(NOTREACHED);
556	break;
557    }
558
559    /*
560     * That got rid of everything except EREs and AREs.
561     */
562
563    assert(INCON(L_ERE));
564
565    /*
566     * Deal with EREs and AREs, except for backslashes.
567     */
568
569    switch (c) {
570    case CHR('|'):
571	RET('|');
572	break;
573    case CHR('*'):
574	if ((v->cflags&REG_ADVF) && NEXT1('?')) {
575	    v->now++;
576	    NOTE(REG_UNONPOSIX);
577	    RETV('*', 0);
578	}
579	RETV('*', 1);
580	break;
581    case CHR('+'):
582	if ((v->cflags&REG_ADVF) && NEXT1('?')) {
583	    v->now++;
584	    NOTE(REG_UNONPOSIX);
585	    RETV('+', 0);
586	}
587	RETV('+', 1);
588	break;
589    case CHR('?'):
590	if ((v->cflags&REG_ADVF) && NEXT1('?')) {
591	    v->now++;
592	    NOTE(REG_UNONPOSIX);
593	    RETV('?', 0);
594	}
595	RETV('?', 1);
596	break;
597    case CHR('{'):		/* bounds start or plain character */
598	if (v->cflags&REG_EXPANDED) {
599	    skip(v);
600	}
601	if (ATEOS() || !iscdigit(*v->now)) {
602	    NOTE(REG_UBRACES);
603	    NOTE(REG_UUNSPEC);
604	    RETV(PLAIN, c);
605	} else {
606	    NOTE(REG_UBOUNDS);
607	    INTOCON(L_EBND);
608	    RET('{');
609	}
610	assert(NOTREACHED);
611	break;
612    case CHR('('):		/* parenthesis, or advanced extension */
613	if ((v->cflags&REG_ADVF) && NEXT1('?')) {
614	    NOTE(REG_UNONPOSIX);
615	    v->now++;
616	    switch (*v->now++) {
617	    case CHR(':'):	/* non-capturing paren */
618		RETV('(', 0);
619		break;
620	    case CHR('#'):	/* comment */
621		while (!ATEOS() && *v->now != CHR(')')) {
622		    v->now++;
623		}
624		if (!ATEOS()) {
625		    v->now++;
626		}
627		assert(v->nexttype == v->lasttype);
628		return next(v);
629		break;
630	    case CHR('='):	/* positive lookahead */
631		NOTE(REG_ULOOKAHEAD);
632		RETV(LACON, 1);
633		break;
634	    case CHR('!'):	/* negative lookahead */
635		NOTE(REG_ULOOKAHEAD);
636		RETV(LACON, 0);
637		break;
638	    default:
639		FAILW(REG_BADRPT);
640		break;
641	    }
642	    assert(NOTREACHED);
643	}
644	if (v->cflags&REG_NOSUB) {
645	    RETV('(', 0);	/* all parens non-capturing */
646	} else {
647	    RETV('(', 1);
648	}
649	break;
650    case CHR(')'):
651	if (LASTTYPE('(')) {
652	    NOTE(REG_UUNSPEC);
653	}
654	RETV(')', c);
655	break;
656    case CHR('['):		/* easy except for [[:<:]] and [[:>:]] */
657	if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
658		*(v->now+1) == CHR(':') &&
659		(*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) &&
660		*(v->now+3) == CHR(':') &&
661		*(v->now+4) == CHR(']') &&
662		*(v->now+5) == CHR(']')) {
663	    c = *(v->now+2);
664	    v->now += 6;
665	    NOTE(REG_UNONPOSIX);
666	    RET((c == CHR('<')) ? '<' : '>');
667	}
668	INTOCON(L_BRACK);
669	if (NEXT1('^')) {
670	    v->now++;
671	    RETV('[', 0);
672	}
673	RETV('[', 1);
674	break;
675    case CHR('.'):
676	RET('.');
677	break;
678    case CHR('^'):
679	RET('^');
680	break;
681    case CHR('$'):
682	RET('$');
683	break;
684    case CHR('\\'):		/* mostly punt backslashes to code below */
685	if (ATEOS()) {
686	    FAILW(REG_EESCAPE);
687	}
688	break;
689    default:		/* ordinary character */
690	RETV(PLAIN, c);
691	break;
692    }
693
694    /*
695     * ERE/ARE backslash handling; backslash already eaten.
696     */
697
698    assert(!ATEOS());
699    if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
700	if (iscalnum(*v->now)) {
701	    NOTE(REG_UBSALNUM);
702	    NOTE(REG_UUNSPEC);
703	}
704	RETV(PLAIN, *v->now++);
705    }
706    (DISCARD)lexescape(v);
707    if (ISERR()) {
708	FAILW(REG_EESCAPE);
709    }
710    if (v->nexttype == CCLASS) {/* fudge at lexical level */
711	switch (v->nextvalue) {
712	case 'd':	lexnest(v, backd, ENDOF(backd)); break;
713	case 'D':	lexnest(v, backD, ENDOF(backD)); break;
714	case 's':	lexnest(v, backs, ENDOF(backs)); break;
715	case 'S':	lexnest(v, backS, ENDOF(backS)); break;
716	case 'w':	lexnest(v, backw, ENDOF(backw)); break;
717	case 'W':	lexnest(v, backW, ENDOF(backW)); break;
718	default:
719	    assert(NOTREACHED);
720	    FAILW(REG_ASSERT);
721	    break;
722	}
723	/* lexnest done, back up and try again */
724	v->nexttype = v->lasttype;
725	return next(v);
726    }
727
728    /*
729     * Otherwise, lexescape has already done the work.
730     */
731
732    return !ISERR();
733}
734
735/*
736 - lexescape - parse an ARE backslash escape (backslash already eaten)
737 * Note slightly nonstandard use of the CCLASS type code.
738 ^ static int lexescape(struct vars *);
739 */
740static int			/* not actually used, but convenient for RETV */
741lexescape(
742    struct vars *v)
743{
744    chr c;
745    static chr alert[] = {
746	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
747    };
748    static chr esc[] = {
749	CHR('E'), CHR('S'), CHR('C')
750    };
751    const chr *save;
752
753    assert(v->cflags&REG_ADVF);
754
755    assert(!ATEOS());
756    c = *v->now++;
757    if (!iscalnum(c)) {
758	RETV(PLAIN, c);
759    }
760
761    NOTE(REG_UNONPOSIX);
762    switch (c) {
763    case CHR('a'):
764	RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
765	break;
766    case CHR('A'):
767	RETV(SBEGIN, 0);
768	break;
769    case CHR('b'):
770	RETV(PLAIN, CHR('\b'));
771	break;
772    case CHR('B'):
773	RETV(PLAIN, CHR('\\'));
774	break;
775    case CHR('c'):
776	NOTE(REG_UUNPORT);
777	if (ATEOS()) {
778	    FAILW(REG_EESCAPE);
779	}
780	RETV(PLAIN, (chr)(*v->now++ & 037));
781	break;
782    case CHR('d'):
783	NOTE(REG_ULOCALE);
784	RETV(CCLASS, 'd');
785	break;
786    case CHR('D'):
787	NOTE(REG_ULOCALE);
788	RETV(CCLASS, 'D');
789	break;
790    case CHR('e'):
791	NOTE(REG_UUNPORT);
792	RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
793	break;
794    case CHR('f'):
795	RETV(PLAIN, CHR('\f'));
796	break;
797    case CHR('m'):
798	RET('<');
799	break;
800    case CHR('M'):
801	RET('>');
802	break;
803    case CHR('n'):
804	RETV(PLAIN, CHR('\n'));
805	break;
806    case CHR('r'):
807	RETV(PLAIN, CHR('\r'));
808	break;
809    case CHR('s'):
810	NOTE(REG_ULOCALE);
811	RETV(CCLASS, 's');
812	break;
813    case CHR('S'):
814	NOTE(REG_ULOCALE);
815	RETV(CCLASS, 'S');
816	break;
817    case CHR('t'):
818	RETV(PLAIN, CHR('\t'));
819	break;
820    case CHR('u'):
821	c = lexdigits(v, 16, 4, 4);
822	if (ISERR()) {
823	    FAILW(REG_EESCAPE);
824	}
825	RETV(PLAIN, c);
826	break;
827    case CHR('U'):
828	c = lexdigits(v, 16, 8, 8);
829	if (ISERR()) {
830	    FAILW(REG_EESCAPE);
831	}
832	RETV(PLAIN, c);
833	break;
834    case CHR('v'):
835	RETV(PLAIN, CHR('\v'));
836	break;
837    case CHR('w'):
838	NOTE(REG_ULOCALE);
839	RETV(CCLASS, 'w');
840	break;
841    case CHR('W'):
842	NOTE(REG_ULOCALE);
843	RETV(CCLASS, 'W');
844	break;
845    case CHR('x'):
846	NOTE(REG_UUNPORT);
847	c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
848	if (ISERR()) {
849	    FAILW(REG_EESCAPE);
850	}
851	RETV(PLAIN, c);
852	break;
853    case CHR('y'):
854	NOTE(REG_ULOCALE);
855	RETV(WBDRY, 0);
856	break;
857    case CHR('Y'):
858	NOTE(REG_ULOCALE);
859	RETV(NWBDRY, 0);
860	break;
861    case CHR('Z'):
862	RETV(SEND, 0);
863	break;
864    case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
865    case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
866    case CHR('9'):
867	save = v->now;
868	v->now--;		/* put first digit back */
869	c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
870	if (ISERR()) {
871	    FAILW(REG_EESCAPE);
872	}
873
874	/*
875	 * Ugly heuristic (first test is "exactly 1 digit?")
876	 */
877
878	if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) {
879	    NOTE(REG_UBACKREF);
880	    RETV(BACKREF, (chr)c);
881	}
882
883	/*
884	 * Oops, doesn't look like it's a backref after all...
885	 */
886
887	v->now = save;
888
889	/*
890	 * And fall through into octal number.
891	 */
892
893    case CHR('0'):
894	NOTE(REG_UUNPORT);
895	v->now--;		/* put first digit back */
896	c = lexdigits(v, 8, 1, 3);
897	if (ISERR()) {
898	    FAILW(REG_EESCAPE);
899	}
900	RETV(PLAIN, c);
901	break;
902    default:
903	assert(iscalpha(c));
904	FAILW(REG_EESCAPE);	/* unknown alphabetic escape */
905	break;
906    }
907    assert(NOTREACHED);
908}
909
910/*
911 - lexdigits - slurp up digits and return chr value
912 ^ static chr lexdigits(struct vars *, int, int, int);
913 */
914static chr			/* chr value; errors signalled via ERR */
915lexdigits(
916    struct vars *v,
917    int base,
918    int minlen,
919    int maxlen)
920{
921    uchr n;			/* unsigned to avoid overflow misbehavior */
922    int len;
923    chr c;
924    int d;
925    CONST uchr ub = (uchr) base;
926
927    n = 0;
928    for (len = 0; len < maxlen && !ATEOS(); len++) {
929	c = *v->now++;
930	switch (c) {
931	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
932	case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
933	case CHR('8'): case CHR('9'):
934	    d = DIGITVAL(c);
935	    break;
936	case CHR('a'): case CHR('A'): d = 10; break;
937	case CHR('b'): case CHR('B'): d = 11; break;
938	case CHR('c'): case CHR('C'): d = 12; break;
939	case CHR('d'): case CHR('D'): d = 13; break;
940	case CHR('e'): case CHR('E'): d = 14; break;
941	case CHR('f'): case CHR('F'): d = 15; break;
942	default:
943	    v->now--;		/* oops, not a digit at all */
944	    d = -1;
945	    break;
946	}
947
948	if (d >= base) {	/* not a plausible digit */
949	    v->now--;
950	    d = -1;
951	}
952	if (d < 0) {
953	    break;		/* NOTE BREAK OUT */
954	}
955	n = n*ub + (uchr)d;
956    }
957    if (len < minlen) {
958	ERR(REG_EESCAPE);
959    }
960
961    return (chr)n;
962}
963
964/*
965 - brenext - get next BRE token
966 * This is much like EREs except for all the stupid backslashes and the
967 * context-dependency of some things.
968 ^ static int brenext(struct vars *, pchr);
969 */
970static int			/* 1 normal, 0 failure */
971brenext(
972    struct vars *v,
973    pchr pc)
974{
975    chr c = (chr)pc;
976
977    switch (c) {
978    case CHR('*'):
979	if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) {
980	    RETV(PLAIN, c);
981	}
982	RET('*');
983	break;
984    case CHR('['):
985	if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
986		*(v->now+1) == CHR(':') &&
987		(*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) &&
988		*(v->now+3) == CHR(':') &&
989		*(v->now+4) == CHR(']') &&
990		*(v->now+5) == CHR(']')) {
991	    c = *(v->now+2);
992	    v->now += 6;
993	    NOTE(REG_UNONPOSIX);
994	    RET((c == CHR('<')) ? '<' : '>');
995	}
996	INTOCON(L_BRACK);
997	if (NEXT1('^')) {
998	    v->now++;
999	    RETV('[', 0);
1000	}
1001	RETV('[', 1);
1002	break;
1003    case CHR('.'):
1004	RET('.');
1005	break;
1006    case CHR('^'):
1007	if (LASTTYPE(EMPTY)) {
1008	    RET('^');
1009	}
1010	if (LASTTYPE('(')) {
1011	    NOTE(REG_UUNSPEC);
1012	    RET('^');
1013	}
1014	RETV(PLAIN, c);
1015	break;
1016    case CHR('$'):
1017	if (v->cflags&REG_EXPANDED) {
1018	    skip(v);
1019	}
1020	if (ATEOS()) {
1021	    RET('$');
1022	}
1023	if (NEXT2('\\', ')')) {
1024	    NOTE(REG_UUNSPEC);
1025	    RET('$');
1026	}
1027	RETV(PLAIN, c);
1028	break;
1029    case CHR('\\'):
1030	break;			/* see below */
1031    default:
1032	RETV(PLAIN, c);
1033	break;
1034    }
1035
1036    assert(c == CHR('\\'));
1037
1038    if (ATEOS()) {
1039	FAILW(REG_EESCAPE);
1040    }
1041
1042    c = *v->now++;
1043    switch (c) {
1044    case CHR('{'):
1045	INTOCON(L_BBND);
1046	NOTE(REG_UBOUNDS);
1047	RET('{');
1048	break;
1049    case CHR('('):
1050	RETV('(', 1);
1051	break;
1052    case CHR(')'):
1053	RETV(')', c);
1054	break;
1055    case CHR('<'):
1056	NOTE(REG_UNONPOSIX);
1057	RET('<');
1058	break;
1059    case CHR('>'):
1060	NOTE(REG_UNONPOSIX);
1061	RET('>');
1062	break;
1063    case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
1064    case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
1065    case CHR('9'):
1066	NOTE(REG_UBACKREF);
1067	RETV(BACKREF, (chr)DIGITVAL(c));
1068	break;
1069    default:
1070	if (iscalnum(c)) {
1071	    NOTE(REG_UBSALNUM);
1072	    NOTE(REG_UUNSPEC);
1073	}
1074	RETV(PLAIN, c);
1075	break;
1076    }
1077
1078    assert(NOTREACHED);
1079}
1080
1081/*
1082 - skip - skip white space and comments in expanded form
1083 ^ static VOID skip(struct vars *);
1084 */
1085static void
1086skip(
1087    struct vars *v)
1088{
1089    const chr *start = v->now;
1090
1091    assert(v->cflags&REG_EXPANDED);
1092
1093    for (;;) {
1094	while (!ATEOS() && iscspace(*v->now)) {
1095	    v->now++;
1096	}
1097	if (ATEOS() || *v->now != CHR('#')) {
1098	    break;		/* NOTE BREAK OUT */
1099	}
1100	assert(NEXT1('#'));
1101	while (!ATEOS() && *v->now != CHR('\n')) {
1102	    v->now++;
1103	}
1104
1105	/*
1106	 * Leave the newline to be picked up by the iscspace loop.
1107	 */
1108    }
1109
1110    if (v->now != start) {
1111	NOTE(REG_UNONPOSIX);
1112    }
1113}
1114
1115/*
1116 - newline - return the chr for a newline
1117 * This helps confine use of CHR to this source file.
1118 ^ static chr newline(NOPARMS);
1119 */
1120static chr
1121newline(void)
1122{
1123    return CHR('\n');
1124}
1125
1126/*
1127 - ch - return the chr sequence for regc_locale.c's fake collating element ch
1128 * This helps confine use of CHR to this source file.  Beware that the caller
1129 * knows how long the sequence is.
1130 ^ #ifdef REG_DEBUG
1131 ^ static const chr *ch(NOPARMS);
1132 ^ #endif
1133 */
1134#ifdef REG_DEBUG
1135static const chr *
1136ch(void)
1137{
1138    static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };
1139
1140    return chstr;
1141}
1142#endif
1143
1144/*
1145 - chrnamed - return the chr known by a given (chr string) name
1146 * The code is a bit clumsy, but this routine gets only such specialized
1147 * use that it hardly matters.
1148 ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
1149 */
1150static chr
1151chrnamed(
1152    struct vars *v,
1153    const chr *startp,		/* start of name */
1154    const chr *endp,		/* just past end of name */
1155    pchr lastresort)		/* what to return if name lookup fails */
1156{
1157    celt c;
1158    int errsave;
1159    int e;
1160    struct cvec *cv;
1161
1162    errsave = v->err;
1163    v->err = 0;
1164    c = element(v, startp, endp);
1165    e = v->err;
1166    v->err = errsave;
1167
1168    if (e != 0) {
1169	return (chr)lastresort;
1170    }
1171
1172    cv = range(v, c, c, 0);
1173    if (cv->nchrs == 0) {
1174	return (chr)lastresort;
1175    }
1176    return cv->chrs[0];
1177}
1178
1179/*
1180 * Local Variables:
1181 * mode: c
1182 * c-basic-offset: 4
1183 * fill-column: 78
1184 * End:
1185 */
1186