1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2012 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                 Eclipse Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*          http://www.eclipse.org/org/documents/epl-v10.html           *
11*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * posix regex compiler
26 */
27
28#include "reglib.h"
29
30#if _PACKAGE_ast
31#include "lclib.h"
32#endif
33
34#define serialize		re_serialize	/* hp.ia64 <unistd.h>! */
35
36#define C_ESC			(-1)
37#define C_MB			(-2)
38
39#if _AST_REGEX_DEBUG
40
41#define DEBUG_TEST(f,y,n)	((debug&(debug_flag=f))?(y):(n))
42#define DEBUG_CODE(f,y,n)	do if(debug&(f)){y}else{n} while(0)
43#define DEBUG_INIT()		do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
44
45static unsigned long	debug;
46static unsigned long	debug_flag;
47
48#else
49
50#define DEBUG_INIT()
51#define DEBUG_TEST(f,y,n)	(n)
52#define DEBUG_CODE(f,y,n)	do {n} while(0)
53
54#endif
55
56#if _PACKAGE_ast
57
58typedef struct Cchr_s
59{
60	Dtlink_t	lnk;
61	unsigned char	nam[2];
62	Ckey_t		key;
63} Cchr_t;
64
65#endif
66
67#define eat(p)		do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
68
69/*
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first.  such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
76 * else it's hard.
77 */
78
79typedef struct Stats_s
80{
81	unsigned long	l;	/* min length to left of x		*/
82	unsigned long	k;	/* min length to left of y		*/
83	unsigned long	m;	/* min length				*/
84	unsigned long	n;	/* max length				*/
85	unsigned short	a;	/* number of alternations		*/
86	unsigned short	b;	/* number of backrefs			*/
87	unsigned short	c;	/* number of closures			*/
88	unsigned short	e;	/* $					*/
89	unsigned short	i;	/* number of negations			*/
90	unsigned short	p;	/* number of named subexpressions	*/
91	unsigned short	s;	/* number of simple closures		*/
92	unsigned short	t;	/* number of tries			*/
93	unsigned short	u;	/* number of unnamed subexpressions	*/
94	Rex_t*		x;	/* max length REX_STRING		*/
95	Rex_t*		y;	/* max length REX_TRIE			*/
96} Stats_t;
97
98typedef struct Token_s
99{
100	unsigned long	min;
101	unsigned long	max;
102	short		lex;
103	short		len;
104	short		esc;
105	short		att;
106	short		push;
107} Token_t;
108
109typedef struct Cenv_s
110{
111	int		delimiter;	/* pattern delimiter		*/
112	int		error;		/* last error			*/
113	int		explicit;	/* explicit match on this char	*/
114	int		mappeddot;	/* inverse mapped '.'		*/
115	int		mappednewline;	/* inverse mapped '\n'		*/
116	int		mappedslash;	/* inverse mapped '/'		*/
117	regflags_t	flags;		/* flags arg to regcomp		*/
118	int		type;		/* BRE,ERE,ARE,SRE,KRE		*/
119	unsigned char*	cursor;		/* curent point in re		*/
120	unsigned char*	pattern;	/* the original pattern		*/
121	unsigned char*	literal;	/* literal restart pattern	*/
122	int		parno;		/* number of last open paren	*/
123	int		parnest;	/* paren nest count		*/
124	int		posixkludge; 	/* to make * nonspecial		*/
125	Token_t		token;		/* token lookahead		*/
126	Stats_t		stats;		/* RE statistics		*/
127	int		terminator;	/* pattern terminator		*/
128	Rex_t*		paren[2*(BACK_REF_MAX+2)];
129					/* paren[i]!=0 if \i defined	*/
130	regex_t*	regex;		/* user handle			*/
131	regdisc_t*	disc;		/* user discipline		*/
132	unsigned char*	map;		/* external to native ccode map	*/
133	unsigned char*	MAP;		/* fold and/or map		*/
134} Cenv_t;
135
136/*
137 * allocate a new Rex_t node
138 */
139
140#if _BLD_DEBUG
141#define node(a,b,c,d,e)	node(a,b,c,d,e, const char* file, unsigned int line)
142#endif
143
144static Rex_t*
145node(Cenv_t* env, int type, int lo, int hi, size_t extra)
146{
147	register Rex_t*	e;
148
149	DEBUG_TEST(0x0800,(sfprintf(sfstdout, "%s:%d node(%d,%d,%d,%u)\n", file, line, type, lo, hi, sizeof(Rex_t) + extra)),(0));
150	if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
151	{
152		memset(e, 0, sizeof(Rex_t) + extra);
153		e->type = type;
154		e->marked = 0;
155		e->lo = lo;
156		e->hi = hi;
157		e->flags = env->flags;
158		e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
159		e->explicit = env->explicit;
160		if (extra)
161			e->re.data = (char*)e + sizeof(Rex_t);
162	}
163	return e;
164}
165
166#if _BLD_DEBUG
167#undef	node
168#define node(a,b,c,d,e)	node(a,b,c,d,e,__FILE__,__LINE__)
169#endif
170
171/*
172 * free a Trie_node_t node
173 */
174
175static void
176triedrop(regdisc_t* disc, Trie_node_t* e)
177{
178	if (e)
179	{
180		triedrop(disc, e->son);
181		triedrop(disc, e->sib);
182		alloc(disc, e, 0);
183	}
184}
185
186/*
187 * free a Rex_t node
188 */
189
190void
191drop(regdisc_t* disc, Rex_t* e)
192{
193	int	i;
194	Rex_t*	x;
195
196	if (e && !(disc->re_flags & REG_NOFREE))
197		do
198		{
199			switch (e->type)
200			{
201			case REX_ALT:
202			case REX_CONJ:
203				drop(disc, e->re.group.expr.binary.left);
204				drop(disc, e->re.group.expr.binary.right);
205				break;
206			case REX_GROUP:
207			case REX_GROUP_AHEAD:
208			case REX_GROUP_AHEAD_NOT:
209			case REX_GROUP_BEHIND:
210			case REX_GROUP_BEHIND_NOT:
211			case REX_GROUP_CUT:
212			case REX_NEG:
213			case REX_REP:
214				drop(disc, e->re.group.expr.rex);
215				break;
216			case REX_TRIE:
217				for (i = 0; i <= UCHAR_MAX; i++)
218					triedrop(disc, e->re.trie.root[i]);
219				break;
220			}
221			x = e->next;
222			alloc(disc, e, 0);
223		} while (e = x);
224}
225
226/*
227 * mark e and descendants minimal
228 */
229
230static void
231mark(register Rex_t* e, int set)
232{
233	if (e && !e->marked)
234		do
235		{
236			e->marked = 1;
237			if (set)
238				e->flags |= REG_MINIMAL;
239			else
240				e->flags &= ~REG_MINIMAL;
241			switch (e->type)
242			{
243			case REX_ALT:
244			case REX_CONJ:
245			case REX_GROUP_COND:
246				if (e->re.group.expr.binary.left)
247					mark(e->re.group.expr.binary.left, set);
248				if (e->re.group.expr.binary.right)
249					mark(e->re.group.expr.binary.right, set);
250				break;
251			case REX_GROUP:
252			case REX_GROUP_AHEAD:
253			case REX_GROUP_AHEAD_NOT:
254			case REX_GROUP_BEHIND:
255			case REX_GROUP_BEHIND_NOT:
256			case REX_GROUP_CUT:
257			case REX_NEG:
258			case REX_REP:
259			case REX_TRIE:
260				mark(e->re.group.expr.rex, set);
261				break;
262			}
263		} while (e = e->next);
264}
265
266/*
267 * assign subexpression numbers by a preorder tree walk
268 */
269
270static int
271serialize(Cenv_t* env, Rex_t* e, int n)
272{
273	do
274	{
275		e->serial = n++;
276		switch (e->type)
277		{
278		case REX_ALT:
279		case REX_GROUP_COND:
280			if (e->re.group.expr.binary.left)
281				n = serialize(env, e->re.group.expr.binary.left, n);
282			e->re.group.expr.binary.serial = n++;
283			if (e->re.group.expr.binary.right)
284				n = serialize(env, e->re.group.expr.binary.right, n);
285			break;
286		case REX_CONJ:
287			n = serialize(env, e->re.group.expr.binary.left, n);
288			n = serialize(env, e->re.group.expr.binary.right, n);
289			break;
290		case REX_GROUP:
291		case REX_GROUP_AHEAD:
292		case REX_GROUP_AHEAD_NOT:
293		case REX_GROUP_BEHIND:
294		case REX_GROUP_BEHIND_NOT:
295		case REX_GROUP_CUT:
296		case REX_NEG:
297		case REX_REP:
298			n = serialize(env, e->re.group.expr.rex, n);
299			break;
300		}
301	} while (e = e->next);
302	return n;
303}
304
305/*
306 * catenate e and f into a sequence, collapsing them if possible
307 */
308
309static Rex_t*
310cat(Cenv_t* env, Rex_t* e, Rex_t* f)
311{
312	Rex_t*	g;
313
314	if (!f)
315	{
316		drop(env->disc, e);
317		return 0;
318	}
319	if (e->type == REX_NULL)
320	{
321		drop(env->disc, e);
322		return f;
323	}
324	if (f->type == REX_NULL)
325	{
326		g = f->next;
327		f->next = 0;
328		drop(env->disc, f);
329		f = g;
330	}
331	else if (e->type == REX_DOT && f->type == REX_DOT)
332	{
333		unsigned int	m = e->lo + f->lo;
334		unsigned int	n = e->hi + f->hi;
335
336		if (m <= RE_DUP_MAX)
337		{
338			if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
339			{
340				n = RE_DUP_INF;
341				goto combine;
342			}
343			else if (n <= RE_DUP_MAX)
344			{
345			combine:
346				e->lo = m;
347				e->hi = n;
348				g = f->next;
349				f->next = 0;
350				drop(env->disc, f);
351				f = g;
352			}
353		}
354	}
355	e->next = f;
356	return e;
357}
358
359/*
360 * collect re statistics
361 */
362
363static int
364stats(register Cenv_t* env, register Rex_t* e)
365{
366	register unsigned long	n;
367	register unsigned long	m;
368	unsigned long		cm;
369	unsigned long		nm;
370	unsigned long		cn;
371	unsigned long		nn;
372	unsigned long		l;
373	unsigned long		k;
374	unsigned long		t;
375	Rex_t*			q;
376	Rex_t*			x;
377	Rex_t*			y;
378	unsigned char		c;
379	unsigned char		b;
380
381	do
382	{
383		switch (e->type)
384		{
385		case REX_ALT:
386			x = env->stats.x;
387			l = env->stats.l;
388			y = env->stats.y;
389			k = env->stats.k;
390			t = env->stats.t;
391			if (++env->stats.a <= 0)
392				return 1;
393			cm = env->stats.m;
394			env->stats.m = 0;
395			cn = env->stats.n;
396			env->stats.n = 0;
397			if (stats(env, e->re.group.expr.binary.left))
398				return 1;
399			m = env->stats.m;
400			env->stats.m = 0;
401			n = env->stats.n;
402			env->stats.n = 0;
403			if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
404				return 1;
405			if (env->stats.m > m)
406				env->stats.m = m;
407			else
408				m = env->stats.m;
409			if ((env->stats.m += cm) < m)
410				return 1;
411			if (env->stats.n < n)
412				env->stats.n = n;
413			else
414				n = env->stats.n;
415			if ((env->stats.n += cn) < n)
416				return 1;
417			env->stats.x = x;
418			env->stats.l = l;
419			env->stats.y = y;
420			env->stats.k = k;
421			env->stats.t = t;
422			break;
423		case REX_BACK:
424			if (++env->stats.b <= 0)
425				return 1;
426			break;
427		case REX_CLASS:
428		case REX_COLL_CLASS:
429		case REX_DOT:
430		case REX_ONECHAR:
431			n = env->stats.m;
432			if ((env->stats.m += e->lo) < n)
433				return 1;
434			if (e->hi != RE_DUP_INF)
435			{
436				n = env->stats.n;
437				if ((env->stats.n += e->hi) < n)
438					return 1;
439			}
440			if (e->lo != e->hi)
441			{
442				if (++env->stats.c <= 0)
443					return 1;
444				if (++env->stats.s <= 0)
445					return 1;
446			}
447			break;
448		case REX_CONJ:
449			cm = env->stats.m;
450			env->stats.m = 0;
451			cn = env->stats.n;
452			env->stats.n = 0;
453			if (stats(env, e->re.group.expr.binary.left))
454				return 1;
455			nm = env->stats.m;
456			env->stats.m = 0;
457			nn = env->stats.n;
458			env->stats.n = 0;
459			if (stats(env, e->re.group.expr.binary.right))
460				return 1;
461			if (env->stats.m < nm)
462				env->stats.m = nm;
463			else
464				nm = env->stats.m;
465			if ((env->stats.m += cm) < nm)
466				return 1;
467			if (env->stats.n < nn)
468				env->stats.n = nn;
469			else
470				nn = env->stats.n;
471			if ((env->stats.n += cn) < nn)
472				return 1;
473			break;
474		case REX_END:
475			env->stats.e = 1;
476			break;
477		case REX_GROUP:
478			if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
479				return 1;
480			if (stats(env, e->re.group.expr.rex))
481				return 1;
482			break;
483		case REX_GROUP_AHEAD:
484		case REX_GROUP_AHEAD_NOT:
485		case REX_GROUP_BEHIND:
486		case REX_GROUP_BEHIND_NOT:
487			m = env->stats.m;
488			n = env->stats.n;
489			x = env->stats.x;
490			y = env->stats.y;
491			if (stats(env, e->re.group.expr.rex))
492				return 1;
493			env->stats.m = m;
494			env->stats.n = n;
495			env->stats.x = x;
496			env->stats.y = y;
497			switch (e->type)
498			{
499			case REX_GROUP_AHEAD:
500			case REX_GROUP_BEHIND:
501				if (++env->stats.u <= 0)
502					return 1;
503				break;
504			}
505			break;
506		case REX_GROUP_COND:
507			if (++env->stats.u <= 0)
508				return 1;
509			m = env->stats.m;
510			n = env->stats.n;
511			x = env->stats.x;
512			y = env->stats.y;
513			if (e->re.group.size > 0 && ++env->stats.b <= 0)
514				return 1;
515			if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
516				return 1;
517			if (q = e->re.group.expr.binary.right)
518			{
519				if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
520					return 1;
521				if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
522					return 1;
523			}
524			env->stats.m = m;
525			env->stats.n = n;
526			env->stats.x = x;
527			env->stats.y = y;
528			break;
529		case REX_GROUP_CUT:
530			if (++env->stats.u <= 0)
531				return 1;
532			m = env->stats.m;
533			n = env->stats.n;
534			x = env->stats.x;
535			y = env->stats.y;
536			if (stats(env, e->re.group.expr.rex))
537				return 1;
538			env->stats.m = m;
539			env->stats.n = n;
540			env->stats.x = x;
541			env->stats.y = y;
542			break;
543		case REX_NEG:
544			env->stats.i++;
545			x = env->stats.x;
546			l = env->stats.l;
547			y = env->stats.y;
548			k = env->stats.k;
549			t = env->stats.t;
550			cm = env->stats.m;
551			env->stats.m = 0;
552			if (stats(env, e->re.group.expr.rex))
553				return 1;
554			env->stats.m = !env->stats.m;
555			if ((env->stats.m += cm) < cm)
556				return 1;
557			env->stats.x = x;
558			env->stats.l = l;
559			env->stats.y = y;
560			env->stats.k = k;
561			env->stats.t = t;
562			break;
563		case REX_REP:
564			x = env->stats.x;
565			l = env->stats.l;
566			y = env->stats.y;
567			k = env->stats.k;
568			t = env->stats.t;
569			if (++env->stats.c <= 0)
570				return 1;
571			b = env->stats.b;
572			c = env->stats.c;
573			cm = env->stats.m;
574			env->stats.m = 0;
575			if (stats(env, e->re.group.expr.rex))
576				return 1;
577			if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
578				return 1;
579			if (e->lo < 1)
580			{
581				env->stats.x = x;
582				env->stats.l = l;
583				env->stats.y = y;
584				env->stats.k = k;
585				env->stats.t = t;
586				env->stats.m = cm;
587			}
588			else
589			{
590				m = env->stats.m;
591				if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
592					return 1;
593				m = env->stats.m;
594				if ((env->stats.m += cm) < m)
595					return 1;
596				if (env->stats.x != x)
597					env->stats.l = cm;
598				if (env->stats.y != y)
599					env->stats.k = cm;
600			}
601			break;
602		case REX_STRING:
603			if (!e->map)
604			{
605				cm = env->stats.m;
606				if ((env->stats.m += e->re.string.size) < cm)
607					return 1;
608				cn = env->stats.n;
609				if ((env->stats.n += e->re.string.size) < cn)
610					return 1;
611				if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
612				{
613					env->stats.x = e;
614					env->stats.l = cm;
615				}
616			}
617			break;
618		case REX_TRIE:
619			if (++env->stats.s <= 0)
620				return 1;
621			cm = env->stats.m;
622			if ((env->stats.m += e->re.trie.min) < cm)
623				return 1;
624			cn = env->stats.n;
625			if ((env->stats.n += e->re.trie.max) < cn)
626				return 1;
627			env->stats.t++;
628			if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
629			{
630				env->stats.y = e;
631				env->stats.k = cm;
632			}
633			break;
634		}
635	} while (e = e->next);
636	return 0;
637}
638
639static int	token(Cenv_t*);
640
641static int
642magic(register Cenv_t* env, register int c, int escaped)
643{
644	register char*	sp;
645	register int	n;
646	int		o = c;
647	int		e = env->error;
648	int		l = env->token.len;
649	short*		mp;
650	char*		ep;
651
652	if (mp = state.magic[c])
653	{
654		c = mp[env->type+escaped];
655		if (c >= T_META)
656		{
657			sp = (char*)env->cursor + env->token.len;
658			switch (c)
659			{
660			case T_LEFT:
661				n = 0;
662				ep = sp;
663				while (*sp >= '0' && *sp <= '9')
664				{
665					if (n > (INT_MAX / 10))
666					{
667						env->error = REG_BADBR;
668						goto bad;
669					}
670					n = n * 10 + *sp++ - '0';
671				}
672				if (sp == ep)
673				{
674					if (env->type < SRE || *sp != ',')
675					{
676						env->error = *sp ? REG_BADBR : REG_EBRACE;
677						goto bad;
678					}
679				}
680				else if (n > RE_DUP_MAX)
681				{
682					env->error = REG_BADBR;
683					goto bad;
684				}
685				env->token.min = n;
686				if (*sp == ',')
687				{
688					n = 0;
689					ep = ++sp;
690					while (*sp >= '0' && *sp <= '9')
691					{
692						if (n > (INT_MAX / 10))
693						{
694							env->error = REG_BADBR;
695							goto bad;
696						}
697						n = n * 10 + *sp++ - '0';
698					}
699					if (sp == ep)
700						n = RE_DUP_INF;
701					else if (n < env->token.min)
702					{
703						env->error = REG_BADBR;
704						goto bad;
705					}
706				}
707				env->token.max = n;
708				switch (*sp)
709				{
710				case 0:
711					env->error = REG_EBRACE;
712					goto bad;
713				case '\\':
714					if (!escaped)
715					{
716						env->error = REG_BADBR;
717						goto bad;
718					}
719					sp++;
720					break;
721				default:
722					if (escaped)
723					{
724						env->error = REG_BADBR;
725						goto bad;
726					}
727					break;
728				}
729				switch (*sp++)
730				{
731				case 0:
732					env->error = REG_EBRACE;
733					goto bad;
734				case '}':
735					break;
736				default:
737					env->error = REG_BADBR;
738					goto bad;
739				}
740				env->token.len = sp - (char*)env->cursor;
741				break;
742			case T_RIGHT:
743				env->error = REG_EBRACE;
744				goto bad;
745			case T_OPEN:
746				if (env->type < SRE && *sp == '?')
747				{
748					env->token.len++;
749					env->token.lex = 0;
750					goto group;
751				}
752				break;
753			case T_ESCAPE:
754				c = chresc(sp - 2, &ep);
755				if (ep < sp)
756					goto bad;
757				env->token.len += ep - sp;
758				if (c >= T_META)
759				{
760					env->token.lex = c;
761					c = C_ESC;
762				}
763				return c;
764			case T_BACK+0:
765			case T_BACK+1:
766			case T_BACK+2:
767			case T_BACK+3:
768			case T_BACK+4:
769			case T_BACK+5:
770			case T_BACK+6:
771			case T_BACK+7:
772				n = chresc(sp - 2, &ep);
773				if (ep > sp + 1)
774				{
775					env->token.len += ep - sp;
776					return n;
777				}
778				/*FALLTHROUGH*/
779			case T_BACK+8:
780			case T_BACK+9:
781				if (env->type == SRE || c == T_BACK && !(env->flags & (REG_LENIENT|REG_REGEXP)))
782				{
783					env->error = REG_BADESC;
784					goto bad;
785				}
786				if ((env->flags & REG_MULTIREF) && isdigit(*sp))
787				{
788					c = (c - T_BACK) * 10 + (*sp - '0');
789					if (c > 0 && c <= env->parno && env->paren[c])
790						c += T_BACK;
791					else
792						c = chresc(sp - 2, &ep);
793					env->token.len++;
794				}
795				if (c == T_BACK)
796					c = 0;
797				break;
798			case T_BAD:
799				if (escaped == 1 && (env->flags & (REG_LENIENT|REG_REGEXP)) && (c = mp[env->type+escaped+2]) >= T_META)
800					return c;
801				goto bad;
802			}
803			if (env->type >= SRE)
804			{
805				if (c == T_DOT)
806					c = '.';
807				else if (c < T_OPEN)
808				{
809					if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
810					{
811						env->token.len++;
812						env->token.att = 1;
813					}
814					if (env->type == KRE && *(env->cursor + env->token.len) == '(')
815					{
816						env->token.len++;
817						switch (c)
818						{
819						case T_AT:
820							break;
821						case T_PERCENT:
822							env->token.lex = c;
823							goto group;
824						case T_TILDE:
825							env->token.lex = 0;
826							goto group;
827						default:
828							env->token.lex = c;
829							break;
830						}
831						c = T_OPEN;
832					}
833					else if (c == T_STAR)
834						c = T_DOTSTAR;
835					else if (c == T_QUES)
836						c = T_DOT;
837					else
838					{
839						c = o;
840						env->token.len = l;
841					}
842				}
843				else if (c > T_BACK)
844				{
845					c = (c - T_BACK) * 2 - 1;
846					c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
847				}
848				else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
849				{
850					if (c == T_AND)
851						c = '&';
852					else if (c == T_BAR)
853						c = '|';
854					else if (c == T_OPEN)
855						c = '(';
856				}
857			}
858		}
859	}
860	else if (escaped == 2)
861	{
862		if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
863			/*ok*/;
864		else
865		{
866			env->error = REG_BADESC;
867			goto bad;
868		}
869	}
870	else if (escaped && !(env->flags & (REG_LENIENT|REG_REGEXP)) && c != ']')
871	{
872		env->error = REG_BADESC;
873		goto bad;
874	}
875	return c;
876 group:
877	sp = (char*)env->cursor + env->token.len;
878	switch (*sp++)
879	{
880	case ')':
881		break;
882	case '#':
883		for (;;)
884		{
885			switch (*sp++)
886			{
887			case 0:
888				env->error = REG_EPAREN;
889				return T_BAD;
890			case ')':
891				break;
892			default:
893				continue;
894			}
895			break;
896		}
897		break;
898	default:
899		return T_GROUP;
900	}
901	env->cursor = (unsigned char*)sp;
902	return token(env);
903 bad:
904	if (escaped == 2)
905		env->error = e;
906	else if (env->flags & (REG_LENIENT|REG_REGEXP))
907		return o;
908	else if (escaped == 1 && !env->error)
909	{
910		if (mp || o == ']')
911			return o;
912		env->error = REG_BADESC;
913	}
914	return T_BAD;
915}
916
917static int
918token(register Cenv_t* env)
919{
920	int	c;
921	int	posixkludge;
922
923	if (env->token.push)
924		return env->token.lex;
925	env->token.att = env->token.esc = 0;
926	if ((env->token.len = MBSIZE(env->cursor)) > 1)
927		return env->token.lex = C_MB;
928	env->token.lex = 0;
929	for (;;)
930	{
931		c = *env->cursor;
932		if (c == 0 || c == env->delimiter || c == env->terminator)
933			return T_END;
934		if (!(env->flags & REG_COMMENT))
935			break;
936		if (c == '#')
937		{
938			do
939			{
940				c = *++env->cursor;
941				if (c == 0 || c == env->delimiter)
942					return T_END;
943			} while (c != '\n');
944		}
945		else if (!isspace(c))
946			break;
947		env->cursor++;
948	}
949	if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
950	{
951		if (env->parnest)
952		{
953			env->error = REG_EPAREN;
954			return T_BAD;
955		}
956		env->parno = 0;
957		env->pattern = env->cursor + 1;
958		return T_BAR;
959	}
960	if (env->flags & REG_LITERAL)
961		return c;
962	if (posixkludge = env->posixkludge)
963	{
964		env->posixkludge = 0;
965		if (c == '*')
966			return c;
967	}
968	if (c == '\\')
969	{
970		if (env->flags & REG_SHELL_ESCAPED)
971			return c;
972		if (!(c = *(env->cursor + 1)) || c == env->terminator)
973		{
974			if (env->flags & (REG_LENIENT|REG_REGEXP))
975			{
976				if (c)
977				{
978					env->token.esc = env->token.len;
979					env->token.len += MBSIZE(env->cursor + 1);
980					return c;
981				}
982				return '\\';
983			}
984			env->error = REG_EESCAPE;
985			return T_BAD;
986		}
987		env->token.esc = env->token.len;
988		env->token.len += MBSIZE(env->cursor + 1);
989		if (env->delimiter && c == 'n')
990			return '\n';
991		else if (c == env->delimiter)
992			return magic(env, c, 0);
993		else if (c == '(' && env->type == BRE)
994			env->posixkludge = 1;
995		else if (c == ')' && env->type == BRE && env->parnest <= 0)
996		{
997			env->error = REG_EPAREN;
998			return T_BAD;
999		}
1000		else if (isspace(c) && (env->flags & REG_COMMENT))
1001			return c;
1002		return magic(env, c, 1);
1003	}
1004	else if (c == '$')
1005	{
1006		if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
1007			return T_DOLL;
1008	}
1009	else if (c == '^')
1010	{
1011		if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1012		{
1013			env->posixkludge = 2;
1014			return T_CFLX;
1015		}
1016	}
1017	else if (c == ')')
1018	{
1019		if (env->type != BRE && env->parnest <= 0)
1020			return c;
1021	}
1022	else if (c == '/' && env->explicit == env->mappedslash)
1023	{
1024		while (*(env->cursor + env->token.len) == c)
1025			env->token.len++;
1026		return T_SLASHPLUS;
1027	}
1028	return magic(env, c, 0);
1029}
1030
1031static Celt_t*
1032col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1033{
1034	register char*		s;
1035	register unsigned char*	k;
1036	register unsigned char*	e;
1037	register int		c;
1038	register int		cc;
1039	int			bt;
1040	int			et;
1041	Ckey_t			key;
1042
1043	cc = 0;
1044	for (;;)
1045	{
1046		k = key;
1047		if (bw == 1)
1048		{
1049			c = bc;
1050			if (ic)
1051			{
1052				if (isupper(c))
1053				{
1054					c = tolower(c);
1055					cc = -1;
1056				}
1057				else if (islower(c))
1058				{
1059					c = toupper(c);
1060					cc = -1;
1061				}
1062			}
1063			*k++ = c;
1064		}
1065		else if (bw < COLL_KEY_MAX)
1066		{
1067			s = (char*)bp;
1068			if (ic)
1069			{
1070				c = mbchar(s);
1071				if (iswupper(c))
1072				{
1073					c = towlower(c);
1074					cc = 1;
1075				}
1076				else if (iswlower(c))
1077				{
1078					c = towupper(c);
1079					cc = 1;
1080				}
1081			}
1082			if (cc > 0)
1083			{
1084				cc = -1;
1085				k += mbconv((char*)k, c);
1086			}
1087			else
1088				for (e = k + bw; k < e; *k++ = *s++);
1089		}
1090		*k = 0;
1091		mbxfrm(ce->beg, key, COLL_KEY_MAX);
1092		if (ep)
1093		{
1094			k = key;
1095			c = mbchar(k);
1096			if (iswupper(c))
1097				bt = COLL_range_uc;
1098			else if (iswlower(c))
1099				bt = COLL_range_lc;
1100			else
1101				bt = COLL_range;
1102			k = key;
1103			if (ew == 1)
1104			{
1105				c = ec;
1106				if (ic)
1107				{
1108					if (isupper(c))
1109					{
1110						c = tolower(c);
1111						cc = -1;
1112					}
1113					else if (islower(c))
1114					{
1115						c = toupper(c);
1116						cc = -1;
1117					}
1118				}
1119				*k++ = c;
1120			}
1121			else if (ew < COLL_KEY_MAX)
1122			{
1123				s = (char*)ep;
1124				if (ic)
1125				{
1126					c = mbchar(s);
1127					if (iswupper(c))
1128					{
1129						c = towlower(c);
1130						cc = 1;
1131					}
1132					else if (iswlower(c))
1133					{
1134						c = towupper(c);
1135						cc = 1;
1136					}
1137				}
1138				if (cc > 0)
1139				{
1140					cc = -1;
1141					k += mbconv((char*)k, c);
1142				}
1143				else
1144					for (e = k + ew; k < e; *k++ = *s++);
1145			}
1146			*k = 0;
1147			mbxfrm(ce->end, key, COLL_KEY_MAX);
1148			k = key;
1149			c = mbchar(k);
1150			if (iswupper(c))
1151				et = COLL_range_uc;
1152			else if (iswlower(c))
1153				et = COLL_range_lc;
1154			else
1155				et = COLL_range;
1156			ce->typ = bt == et ? bt : COLL_range;
1157		}
1158		else
1159			ce->typ = COLL_char;
1160		ce++;
1161		if (!ic || !cc)
1162			break;
1163		ic = 0;
1164	}
1165	return ce;
1166}
1167
1168static Rex_t*
1169bra(Cenv_t* env)
1170{
1171	Rex_t*		e;
1172	int		c;
1173	int		i;
1174	int		w;
1175	int		neg;
1176	int		last;
1177	int		inrange;
1178	int		complicated;
1179	int		collate;
1180	int		elements;
1181	unsigned char*	first;
1182	unsigned char*	start;
1183	unsigned char*	begin;
1184	unsigned char*	s;
1185	regclass_t	f;
1186	unsigned char	buf[4 * (COLL_KEY_MAX + 1)];
1187#if _PACKAGE_ast
1188	int		ic;
1189	char		mbc[COLL_KEY_MAX + 1];
1190#endif
1191
1192	if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1193		return 0;
1194	collate = complicated = elements = 0;
1195	if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1196	{
1197		env->cursor++;
1198		complicated = neg = 1;
1199	}
1200	else
1201		neg = 0;
1202	first = env->cursor;
1203	start = first + MBSIZE(first);
1204	if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1205		goto error;
1206	begin = env->cursor + MBSIZE(env->cursor);
1207
1208	/*
1209	 * inrange: 0=no, 1=possibly, 2=definitely
1210	 */
1211
1212	inrange = 0;
1213	for (;;)
1214	{
1215		if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1216			goto error;
1217		env->cursor += (w = MBSIZE(env->cursor));
1218		if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1219		{
1220			if (*env->cursor)
1221			{
1222				if (*env->cursor == 'n')
1223				{
1224					env->cursor++;
1225					c = '\n';
1226				}
1227				else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1228				{
1229					env->token.len = 1;
1230					w = magic(env, *env->cursor, 2);
1231					if (env->token.len > 1 || w != T_BAD)
1232					{
1233						if (env->token.len == 1 && (f = classfun(w)))
1234						{
1235							if (inrange > 1)
1236							{
1237								if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1238									goto erange;
1239								inrange = 0;
1240							}
1241							env->cursor++;
1242							for (c = 0; c <= UCHAR_MAX; c++)
1243								if ((*f)(c))
1244									setadd(e->re.charclass, c);
1245							complicated++;
1246							elements++;
1247							continue;
1248						}
1249						if (env->token.len > 1 || w >= 0 && w < T_META)
1250						{
1251							c = w;
1252							if (c > UCHAR_MAX)
1253							{
1254								if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)) && !mbwide())
1255									goto erange;
1256								c = UCHAR_MAX;
1257							}
1258							env->cursor += env->token.len;
1259						}
1260					}
1261				}
1262			}
1263		}
1264		else if (c == ']')
1265		{
1266			if (env->cursor == begin)
1267			{
1268				last = c;
1269				inrange = 1;
1270				continue;
1271			}
1272			if (inrange != 0)
1273			{
1274				setadd(e->re.charclass, last);
1275				elements++;
1276				if (inrange == 2)
1277				{
1278					setadd(e->re.charclass, '-');
1279					elements++;
1280				}
1281			}
1282			break;
1283		}
1284		else if (c == '-')
1285		{
1286			if (!inrange && env->cursor != begin && *env->cursor != ']')
1287			{
1288				if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1289					goto erange;
1290				continue;
1291			}
1292			else if (inrange == 1)
1293			{
1294				inrange = 2;
1295				complicated++;
1296				continue;
1297			}
1298		}
1299		else if (c == '[')
1300		{
1301			switch (*env->cursor)
1302			{
1303			case 0:
1304				goto error;
1305			case ':':
1306				if (env->flags & REG_REGEXP)
1307					goto normal;
1308				if (inrange == 1)
1309				{
1310					setadd(e->re.charclass, last);
1311					elements++;
1312				}
1313				if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1314				{
1315					if (env->cursor == start && (c = *(env->cursor + 1)))
1316					{
1317						s = start = env->cursor + 1;
1318						while (*++s && *s != ':');
1319						if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1320						{
1321							if ((i = (s - start)) == 1)
1322							{
1323								switch (c)
1324								{
1325								case '<':
1326									i = REX_WBEG;
1327									break;
1328								case '>':
1329									i = REX_WEND;
1330									break;
1331								default:
1332									i = 0;
1333									break;
1334								}
1335								if (i)
1336								{
1337									env->cursor = s + 3;
1338									drop(env->disc, e);
1339									return node(env, i, 0, 0, 0);
1340								}
1341							}
1342						}
1343					}
1344					env->error = REG_ECTYPE;
1345					goto error;
1346				}
1347				for (c = 0; c <= UCHAR_MAX; c++)
1348					if ((*f)(c))
1349						setadd(e->re.charclass, c);
1350				inrange = 0;
1351				complicated++;
1352				elements++;
1353				continue;
1354			case '=':
1355				if (env->flags & REG_REGEXP)
1356					goto normal;
1357				if (inrange == 2)
1358					goto erange;
1359				if (inrange == 1)
1360				{
1361					setadd(e->re.charclass, last);
1362					elements++;
1363				}
1364				if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1365					goto ecollate;
1366				if (c > 1)
1367					collate++;
1368				else
1369					setadd(e->re.charclass, buf[0]);
1370				c = buf[0];
1371				inrange = 0;
1372				complicated++;
1373				elements++;
1374				continue;
1375			case '.':
1376				if (env->flags & REG_REGEXP)
1377					goto normal;
1378				if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1379					goto ecollate;
1380				if (c > 1)
1381					collate++;
1382				c = buf[0];
1383				complicated++;
1384				break;
1385			default:
1386			normal:
1387				if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1388					goto error;
1389				break;
1390			}
1391		}
1392		else if (w > 1)
1393			complicated++;
1394		if (inrange == 2)
1395		{
1396			if (last <= c)
1397			{
1398				for (i = last; i <= c; i++)
1399					setadd(e->re.charclass, i);
1400				inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1401				elements += 2;
1402			}
1403			else if (env->type >= SRE)
1404			{
1405				setadd(e->re.charclass, last);
1406				setadd(e->re.charclass, c);
1407				elements += 2;
1408				inrange = 1;
1409			}
1410			else if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
1411				goto erange;
1412			else
1413				inrange = 0;
1414		}
1415		else if (inrange == 1)
1416		{
1417			setadd(e->re.charclass, last);
1418			elements++;
1419		}
1420		else
1421			inrange = 1;
1422		last = c;
1423	}
1424#if _PACKAGE_ast
1425	if (complicated && mbcoll())
1426	{
1427		Dt_t*			dt;
1428		Cchr_t*			cc;
1429		Cchr_t*			tc;
1430		Cchr_t*			xc;
1431		Celt_t*			ce;
1432		Cchr_t			key;
1433		int			rw;
1434		int			rc;
1435		wchar_t			wc;
1436		unsigned char*		rp;
1437		unsigned char*		pp;
1438		char			cb[2][COLL_KEY_MAX+1];
1439
1440		static Dtdisc_t		disc;
1441
1442		static const char	primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1443
1444		if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1445		{
1446			disc.key = offsetof(Cchr_t, key);
1447			if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dtoset)))
1448			{
1449				for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1450				{
1451					cc->nam[0] = primary[i];
1452					mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1453					dtinsert(dt, cc);
1454				}
1455				for (i = 0; i < elementsof(cc->key); i++)
1456					cc->key[i] = ~0;
1457				dtinsert(dt, cc);
1458				LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1459			}
1460			else
1461			{
1462				if (cc)
1463					free(cc);
1464				drop(env->disc, e);
1465				return 0;
1466			}
1467		}
1468		if (dt)
1469		{
1470			drop(env->disc, e);
1471			if (ic = env->flags & REG_ICASE)
1472				elements *= 2;
1473			if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 3) * sizeof(Celt_t))))
1474				return 0;
1475			ce = (Celt_t*)e->re.data;
1476			e->re.collate.invert = neg;
1477			e->re.collate.elements = ce;
1478			env->cursor = first;
1479			inrange = 0;
1480			for (;;)
1481			{
1482				if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1483					goto error;
1484				pp = env->cursor;
1485				env->cursor += (w = MBSIZE(env->cursor));
1486				if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1487				{
1488					if (*env->cursor)
1489					{
1490						if (*env->cursor == 'n')
1491						{
1492							pp = env->cursor++;
1493							c = '\n';
1494						}
1495						else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1496						{
1497							env->token.len = 1;
1498							w = magic(env, *env->cursor, 2);
1499							if (env->token.len > 1 || w != T_BAD)
1500							{
1501								if (env->token.len == 1 && (f = classfun(w)))
1502								{
1503									if (inrange > 1)
1504									{
1505										if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1506											goto erange;
1507										inrange = 0;
1508									}
1509									env->cursor++;
1510									ce->fun = f;
1511									ce->typ = COLL_call;
1512									ce++;
1513									continue;
1514								}
1515								if (env->token.len > 1 || w >= 0 && w < T_META)
1516								{
1517									c = w;
1518									w = mbconv(mbc, c);
1519									pp = (unsigned char*)mbc;
1520									env->cursor += env->token.len;
1521								}
1522							}
1523						}
1524					}
1525				}
1526				else if (c == ']')
1527				{
1528					if (env->cursor == begin)
1529					{
1530						rp = pp;
1531						rw = w;
1532						inrange = 1;
1533						continue;
1534					}
1535					if (inrange != 0)
1536					{
1537						ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1538						if (inrange == 2)
1539							ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1540					}
1541					break;
1542				}
1543				else if (c == '-')
1544				{
1545					if (!inrange && env->cursor != begin && *env->cursor != ']')
1546					{
1547						if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1548							goto erange;
1549						continue;
1550					}
1551					else if (inrange == 1)
1552					{
1553						inrange = 2;
1554						continue;
1555					}
1556				}
1557				else if (c == '[')
1558				{
1559					switch (*env->cursor)
1560					{
1561					case 0:
1562						goto error;
1563					case ':':
1564						if (env->flags & REG_REGEXP)
1565							goto complicated_normal;
1566						if (inrange == 1)
1567							ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1568						if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1569						{
1570							if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1571							{
1572								switch (c)
1573								{
1574								case '<':
1575									i = REX_WBEG;
1576									break;
1577								case '>':
1578									i = REX_WEND;
1579									break;
1580								default:
1581									i = 0;
1582									break;
1583								}
1584								if (i)
1585								{
1586									env->cursor += 5;
1587									drop(env->disc, e);
1588									return node(env, i, 0, 0, 0);
1589								}
1590							}
1591							env->error = REG_ECTYPE;
1592							goto error;
1593						}
1594						ce->fun = f;
1595						ce->typ = COLL_call;
1596						ce++;
1597						inrange = 0;
1598						continue;
1599					case '=':
1600						if (env->flags & REG_REGEXP)
1601							goto complicated_normal;
1602						if (inrange == 2)
1603							goto erange;
1604						if (inrange == 1)
1605							ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1606						pp = (unsigned char*)cb[inrange];
1607						rp = env->cursor + 1;
1608						if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, &wc)) < 0)
1609							goto ecollate;
1610						c = 0;
1611						if (ic)
1612						{
1613							if (iswupper(wc))
1614							{
1615								wc = towlower(wc);
1616								rw = mbconv((char*)pp, wc);
1617								c = 'u';
1618							}
1619							else if (iswlower(wc))
1620								c = 'l';
1621						}
1622						i = 1;
1623						for (;;)
1624						{
1625							mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1626							if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1627							{
1628								if (i)
1629								{
1630									c = *pp;
1631									goto singleton;
1632								}
1633								goto ecollate;
1634							}
1635							xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1636							if (c == 'l' || c == 'L' && !(c = 0))
1637								ce->typ = COLL_range_lc;
1638							else if (c == 'u' || c == 'U' && !(c = 0))
1639								ce->typ = COLL_range_uc;
1640							else
1641								ce->typ = COLL_range;
1642							strcpy((char*)ce->beg, (char*)xc->key);
1643							if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1644							{
1645								if (i)
1646								{
1647									c = *pp;
1648									goto singleton;
1649								}
1650								goto ecollate;
1651							}
1652							if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1653								cc = tc;
1654							strcpy((char*)ce->end, (char*)cc->key);
1655							ce->max = -1;
1656							ce++;
1657							if (!c)
1658								break;
1659							if (c == 'u')
1660							{
1661								wc = towlower(wc);
1662								c = 'L';
1663							}
1664							else
1665							{
1666								wc = towupper(wc);
1667								c = 'U';
1668							}
1669							rw = mbconv((char*)pp, wc);
1670							i = 0;
1671						}
1672						inrange = 0;
1673						c = *pp;
1674						continue;
1675					case '.':
1676						if (env->flags & REG_REGEXP)
1677							goto complicated_normal;
1678						pp = (unsigned char*)cb[inrange];
1679						if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, NiL)) < 0)
1680							goto ecollate;
1681						c = *pp;
1682						break;
1683					default:
1684					complicated_normal:
1685						if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1686							goto error;
1687						break;
1688					}
1689				}
1690			singleton:
1691				if (inrange == 2)
1692				{
1693					ce = col(ce, ic, rp, rw, rc, pp, w, c);
1694					if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1695					{
1696						if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1697							goto erange;
1698						(ce-1)->typ = COLL_char;
1699						strcpy((char*)ce->beg, (char*)(ce-1)->end);
1700						ce->typ = COLL_char;
1701						ce++;
1702					}
1703					inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1704				}
1705				else if (inrange == 1)
1706					ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1707				else
1708					inrange = 1;
1709				rp = pp;
1710				rw = w;
1711				rc = c;
1712			}
1713			ce->typ = COLL_end;
1714			return e;
1715		}
1716	}
1717#endif
1718	if (collate)
1719		goto ecollate;
1720	if (env->flags & REG_ICASE)
1721		for (i = 0; i <= UCHAR_MAX; i++)
1722			if (settst(e->re.charclass, i))
1723			{
1724				if (isupper(i))
1725					c = tolower(i);
1726				else if (islower(i))
1727					c = toupper(i);
1728				else
1729					continue;
1730				setadd(e->re.charclass, c);
1731			}
1732	if (neg)
1733	{
1734		for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1735			e->re.charclass->bits[i] ^= ~0;
1736		if (env->explicit >= 0)
1737			setclr(e->re.charclass, env->explicit);
1738	}
1739	return e;
1740 ecollate:
1741	env->error = REG_ECOLLATE;
1742	goto error;
1743 erange:
1744	env->error = REG_ERANGE;
1745 error:
1746	drop(env->disc, e);
1747	if (!env->error)
1748		env->error = REG_EBRACK;
1749	return 0;
1750}
1751
1752static Rex_t*
1753ccl(Cenv_t* env, int type)
1754{
1755	int		i;
1756	Rex_t*		e;
1757	Celt_t*		ce;
1758	regclass_t	f;
1759
1760	if (!(f = classfun(type)))
1761	{
1762		env->error = REG_BADESC;
1763		return 0;
1764	}
1765	if (!mbcoll())
1766	{
1767		if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1768			return 0;
1769		for (i = 0; i <= UCHAR_MAX; i++)
1770			if ((*f)(i))
1771				setadd(e->re.charclass, i);
1772		if (env->explicit >= 0)
1773			setclr(e->re.charclass, env->explicit);
1774	}
1775	else
1776	{
1777		if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1778			return 0;
1779		ce = (Celt_t*)e->re.data;
1780		e->re.collate.invert = 0;
1781		e->re.collate.elements = ce;
1782		ce->fun = f;
1783		ce->typ = COLL_call;
1784		ce++;
1785		ce->typ = COLL_end;
1786	}
1787	return e;
1788}
1789
1790static Rex_t*
1791rep(Cenv_t* env, Rex_t* e, int number, int last)
1792{
1793	Rex_t*		f;
1794	unsigned long	m = 0;
1795	unsigned long	n = RE_DUP_INF;
1796	int		minimal = -1;
1797
1798	if (!e)
1799		return 0;
1800	switch (token(env))
1801	{
1802	case T_BANG:
1803		eat(env);
1804		if (!(f = node(env, REX_NEG, m, n, 0)))
1805		{
1806			drop(env->disc, e);
1807			return 0;
1808		}
1809		f->re.group.expr.rex = e;
1810		return f;
1811	case T_QUES:
1812		eat(env);
1813		n = 1;
1814		break;
1815	case T_STAR:
1816		eat(env);
1817		break;
1818	case T_PLUS:
1819		eat(env);
1820		m = 1;
1821		break;
1822	case T_LEFT:
1823		eat(env);
1824		m = env->token.min;
1825		n = env->token.max;
1826		break;
1827	default:
1828		return e;
1829	}
1830	if (env->token.att)
1831		minimal = 1;
1832	else if (env->type < SRE)
1833		switch (token(env))
1834		{
1835		case T_QUES:
1836			eat(env);
1837			minimal = !(env->flags & REG_MINIMAL);
1838			break;
1839		case T_STAR: /*AST*/
1840			eat(env);
1841			minimal = !!(env->flags & REG_MINIMAL);
1842			break;
1843		}
1844	switch (e->type)
1845	{
1846	case REX_DOT:
1847	case REX_CLASS:
1848	case REX_COLL_CLASS:
1849	case REX_ONECHAR:
1850		e->lo = m;
1851		e->hi = n;
1852		if (minimal >= 0)
1853			mark(e, minimal);
1854		return e;
1855#if HUH_2002_08_07
1856	case REX_BEG:
1857#endif
1858	case REX_BEG_STR:
1859	case REX_END_STR:
1860	case REX_FIN_STR:
1861	case REX_WBEG:
1862	case REX_WEND:
1863	case REX_WORD:
1864	case REX_WORD_NOT:
1865		env->error = REG_BADRPT;
1866		drop(env->disc, e);
1867		return 0;
1868	}
1869	if (m == 1 && n == 1)
1870	{
1871		if (minimal >= 0)
1872			mark(e, minimal);
1873		return e;
1874	}
1875	if (!(f = node(env, REX_REP, m, n, 0)))
1876	{
1877		drop(env->disc, e);
1878		return 0;
1879	}
1880	f->re.group.expr.rex = e;
1881	f->re.group.number = number;
1882	f->re.group.last = last;
1883	if (minimal >= 0)
1884		mark(f, minimal);
1885	if (m <= n && n)
1886	{
1887		for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1888		if (e && e->type == REX_NEG)
1889			f->type = REX_GROUP;
1890	}
1891	return f;
1892}
1893
1894static int
1895isstring(Rex_t* e)
1896{
1897	switch (e->type)
1898	{
1899	case REX_ONECHAR:
1900		return e->lo == 1 && e->hi == 1;
1901	case REX_STRING:
1902		return 1;
1903	}
1904	return 0;
1905}
1906
1907static Trie_node_t*
1908trienode(Cenv_t* env, int c)
1909{
1910	Trie_node_t*	t;
1911
1912	if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1913	{
1914		memset(t, 0, sizeof(Trie_node_t));
1915		t->c = c;
1916	}
1917	return t;
1918}
1919
1920static int
1921insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1922{
1923	unsigned char*	s;
1924	unsigned char*	e;
1925	Trie_node_t*	t;
1926	int		len;
1927	unsigned char	tmp[2];
1928
1929	switch (f->type)
1930	{
1931	case REX_ONECHAR:
1932		*(s = tmp) = f->re.onechar;
1933		e = s + 1;
1934		break;
1935	case REX_STRING:
1936		s = f->re.string.base;
1937		e = s + f->re.string.size;
1938		break;
1939	default:
1940		return 1;
1941	}
1942	if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1943		return 1;
1944	for (len = 1;;)
1945	{
1946		if (t->c == *s)
1947		{
1948			if (++s >= e)
1949				break;
1950			if (!t->son && !(t->son = trienode(env, *s)))
1951				return 1;
1952			t = t->son;
1953			len++;
1954		}
1955		else
1956		{
1957			if (!t->sib && !(t->sib = trienode(env, *s)))
1958				return 1;
1959			t = t->sib;
1960		}
1961	}
1962	if (g->re.trie.min > len)
1963		g->re.trie.min = len;
1964	if (g->re.trie.max < len)
1965		g->re.trie.max = len;
1966	t->end = 1;
1967	return 0;
1968}
1969
1970/*
1971 * trie() tries to combine nontrivial e and f into a REX_TRIE
1972 * unless 0 is returned, e and f are deleted as far as possible
1973 * also called by regcomb
1974 */
1975
1976static Rex_t*
1977trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1978{
1979	Rex_t*	g;
1980
1981	if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1982		return 0;
1983	if (isstring(f))
1984	{
1985		if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1986			return 0;
1987		g->re.trie.min = INT_MAX;
1988		if (insert(env, f, g))
1989			goto nospace;
1990		drop(env->disc, f);
1991	}
1992	else if (f->type != REX_TRIE)
1993		return 0;
1994	else
1995		g = f;
1996	if (insert(env, e, g))
1997		goto nospace;
1998	drop(env->disc, e);
1999	return g;
2000 nospace:
2001	if (g != f)
2002		drop(env->disc, g);
2003	return 0;
2004}
2005
2006static Rex_t*		alt(Cenv_t*, int, int);
2007
2008static int
2009chr(register Cenv_t* env, int* escaped)
2010{
2011	unsigned char*	p;
2012	int		c;
2013
2014	*escaped = 0;
2015	if (!(c = *env->cursor))
2016		return -1;
2017	env->cursor++;
2018	if (c == '\\')
2019	{
2020		if (env->flags & REG_SHELL_ESCAPED)
2021			return c;
2022		if (!(c = *(env->cursor + 1)) || c == env->terminator)
2023		{
2024			if (env->flags & (REG_LENIENT|REG_REGEXP))
2025				return c ? c : '\\';
2026			env->error = REG_EESCAPE;
2027			return -1;
2028		}
2029		p = env->cursor;
2030		c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2031		*escaped = env->cursor - p;
2032	}
2033	return c;
2034}
2035
2036/*
2037 * open the perly gates
2038 */
2039
2040static Rex_t*
2041grp(Cenv_t* env, int parno)
2042{
2043	Rex_t*		e;
2044	Rex_t*		f;
2045	int		c;
2046	int		g;
2047	int		i;
2048	int		n;
2049	int		x;
2050	int		esc;
2051	int		typ;
2052	int		beg;
2053	unsigned char*	p;
2054
2055	g = env->flags;
2056	beg = env->pattern == env->cursor - env->token.len;
2057	if (!(c = env->token.lex) && (c = *env->cursor))
2058		env->cursor++;
2059	env->token.len = 0;
2060	env->parnest++;
2061	typ = -1;
2062	switch (c)
2063	{
2064	case '-':
2065	case '+':
2066	case 'a':
2067	case 'g':
2068	case 'i':
2069	case 'l':
2070	case 'm':
2071	case 'p':
2072	case 'r':
2073	case 's':
2074	case 'x':
2075	case 'A':
2076	case 'B':
2077	case 'E':
2078	case 'F':
2079	case 'G':
2080	case 'I':
2081	case 'K':
2082	case 'L':
2083	case 'M':	/* glob(3) */
2084	case 'N':	/* glob(3) */
2085	case 'O':	/* glob(3) */
2086	case 'P':
2087	case 'R':	/* pcre */
2088	case 'S':
2089	case 'U':	/* pcre */
2090	case 'V':
2091	case 'X':	/* pcre */
2092		x = REX_GROUP;
2093		i = 1;
2094		env->token.push = 1;
2095		for (;;)
2096		{
2097			switch (c)
2098			{
2099			case ')':
2100				if (!(env->flags & REG_LITERAL))
2101				{
2102					env->error = REG_BADRPT;
2103					return 0;
2104				}
2105				/*FALLTHROUGH*/
2106			case 0:
2107			case T_CLOSE:
2108				x = 0;
2109				goto done;
2110			case ':':
2111				eat(env);
2112				if (token(env) == T_CLOSE)
2113					x = 0;
2114				goto done;
2115			case '-':
2116				i = 0;
2117				break;
2118			case '+':
2119				i = 1;
2120				break;
2121			case 'a':
2122				if (i)
2123					env->flags |= (REG_LEFT|REG_RIGHT);
2124				else
2125					env->flags &= ~(REG_LEFT|REG_RIGHT);
2126				break;
2127			case 'g':
2128				if (i)
2129					env->flags &= ~REG_MINIMAL;
2130				else
2131					env->flags |= REG_MINIMAL;
2132				break;
2133			case 'i':
2134				if (i)
2135					env->flags |= REG_ICASE;
2136				else
2137					env->flags &= ~REG_ICASE;
2138				break;
2139			case 'l':
2140				if (i)
2141					env->flags |= REG_LEFT;
2142				else
2143					env->flags &= ~REG_LEFT;
2144				break;
2145			case 'm':
2146				if (i)
2147					env->flags |= REG_NEWLINE;
2148				else
2149					env->flags &= ~REG_NEWLINE;
2150				env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2151				break;
2152			case 'p':
2153				if (i)
2154					env->flags &= ~REG_LENIENT;
2155				else
2156					env->flags |= REG_LENIENT;
2157				break;
2158			case 'r':
2159				if (i)
2160					env->flags |= REG_RIGHT;
2161				else
2162					env->flags &= ~REG_RIGHT;
2163				break;
2164			case 's':
2165				if (i)
2166					env->flags |= REG_SPAN;
2167				else
2168					env->flags &= ~REG_SPAN;
2169				env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2170				break;
2171			case 'x':
2172				if (i)
2173					env->flags |= REG_COMMENT;
2174				else
2175					env->flags &= ~REG_COMMENT;
2176				break;
2177			case 'X':
2178				if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2179					break; /* PCRE_EXTRA */
2180				/*FALLTHROUGH*/
2181			case 'A':
2182				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2183				env->flags |= REG_AUGMENTED|REG_EXTENDED;
2184				typ = ARE;
2185				break;
2186			case 'B':
2187			case 'G':
2188				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2189				typ = BRE;
2190				break;
2191			case 'E':
2192				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2193				env->flags |= REG_EXTENDED;
2194				typ = ERE;
2195				break;
2196			case 'F':
2197			case 'L':
2198				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2199				env->flags |= REG_LITERAL;
2200				typ = ERE;
2201				break;
2202			case 'K':
2203				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2204				env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2205				typ = KRE;
2206				break;
2207			case 'M':
2208				/* used by caller to disable glob(3) GLOB_BRACE */
2209				break;
2210			case 'N':
2211				/* used by caller to disable glob(3) GLOB_NOCHECK */
2212				break;
2213			case 'O':
2214				/* used by caller to disable glob(3) GLOB_STARSTAR */
2215				break;
2216			case 'P':
2217				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2218				env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2219				typ = ERE;
2220				break;
2221			case 'S':
2222				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2223				env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2224				typ = SRE;
2225				break;
2226			case 'U': /* PCRE_UNGREEDY */
2227				if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2228				{
2229					if (i)
2230						env->flags |= REG_MINIMAL;
2231					else
2232						env->flags &= ~REG_MINIMAL;
2233				}
2234				break;
2235			case 'V':
2236				env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2237				env->flags |= REG_REGEXP;
2238				typ = BRE;
2239				break;
2240			default:
2241				env->error = REG_BADRPT;
2242				return 0;
2243			}
2244			eat(env);
2245			c = token(env);
2246		}
2247	done:
2248		break;
2249	case ':':
2250		x = REX_GROUP;
2251		break;
2252	case '=':
2253		x = REX_GROUP_AHEAD;
2254		break;
2255	case '!':
2256		x = REX_GROUP_AHEAD_NOT;
2257		break;
2258	case '<':
2259		switch (token(env))
2260		{
2261		case '=':
2262			x = REX_GROUP_BEHIND;
2263			break;
2264		case '!':
2265		case T_BANG:
2266			x = REX_GROUP_BEHIND_NOT;
2267			break;
2268		default:
2269			env->error = REG_BADRPT;
2270			return 0;
2271		}
2272		eat(env);
2273		break;
2274	case '>':
2275		x = REX_GROUP_CUT;
2276		break;
2277	case '%':
2278	case T_PERCENT:
2279		e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2280		e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2281		n = 1;
2282		for (;;)
2283		{
2284			switch (i = chr(env, &esc))
2285			{
2286			case -1:
2287			case 0:
2288			invalid:
2289				env->cursor -= esc + 1;
2290				env->error = REG_EPAREN;
2291				return 0;
2292			case 'D':
2293				x = REX_NEST_delimiter;
2294				/*FALLTHROUGH*/
2295			delimiter:
2296				if ((i = chr(env, &esc)) < 0)
2297					goto invalid;
2298				if (e->re.nest.type[i] & ~x)
2299					goto invalid;
2300				e->re.nest.type[i] = x;
2301				continue;
2302			case 'E':
2303				x = REX_NEST_escape;
2304				goto delimiter;
2305			case 'L':
2306				x = REX_NEST_literal;
2307				goto quote;
2308			case 'O':
2309				switch (i = chr(env, &esc))
2310				{
2311				case 'T':
2312					e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2313					break;
2314				default:
2315					goto invalid;
2316				}
2317				continue;
2318			case 'Q':
2319				x = REX_NEST_quote;
2320				/*FALLTHROUGH*/
2321			quote:
2322				if ((i = chr(env, &esc)) < 0)
2323					goto invalid;
2324				if (e->re.nest.type[i] & ~x)
2325					goto invalid;
2326				e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2327				continue;
2328			case 'S':
2329				x = REX_NEST_separator;
2330				goto delimiter;
2331			case 'T':
2332				x = REX_NEST_terminator;
2333				goto delimiter;
2334			case '|':
2335			case '&':
2336				if (!esc)
2337					goto invalid;
2338				goto nesting;
2339			case '(':
2340				if (!esc)
2341					n++;
2342				goto nesting;
2343			case ')':
2344				if (!esc && !--n)
2345					break;
2346				goto nesting;
2347			default:
2348			nesting:
2349				if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2350					goto invalid;
2351				e->re.nest.type[i] = REX_NEST_open;
2352				if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2353					goto invalid;
2354				if (!esc)
2355				{
2356					if (x == ')' && !--n)
2357						goto invalid;
2358					else if (x == '(')
2359						n++;
2360				}
2361				e->re.nest.type[x] |= REX_NEST_close;
2362				e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2363				continue;
2364			}
2365			break;
2366		}
2367		env->parnest--;
2368		if (c == T_PERCENT)
2369			for (n = 0; n < 2; n++)
2370			{
2371				parno = ++env->parno;
2372				if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2373				{
2374					drop(env->disc, e);
2375					return 0;
2376				}
2377				if (parno < elementsof(env->paren))
2378					env->paren[parno] = f;
2379				f->re.group.back = 0;
2380				f->re.group.number = parno;
2381				f->re.group.expr.rex = e;
2382				e = f;
2383			}
2384		return e;
2385	case '(':
2386		c = 0;
2387		if (isdigit(*env->cursor))
2388		{
2389			f = 0;
2390			do
2391			{
2392				if (c > (INT_MAX / 10))
2393				{
2394					env->error = REG_BADRPT;
2395					return 0;
2396				}
2397				c = c * 10 + (*env->cursor++ - '0');
2398			} while (isdigit(*env->cursor));
2399			if (*env->cursor++ != ')')
2400			{
2401				env->error = REG_BADRPT;
2402				return 0;
2403			}
2404			if (c && env->type >= SRE)
2405				c = c * 2 - 1;
2406			if (!c || c > env->parno || !env->paren[c])
2407			{
2408				if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
2409				{
2410					env->error = REG_ESUBREG;
2411					return 0;
2412				}
2413				if (c)
2414					c = -1;
2415			}
2416		}
2417		else
2418		{
2419			if (env->type < SRE && *env->cursor++ != '?')
2420			{
2421				env->error = REG_BADRPT;
2422				return 0;
2423			}
2424			if (!(f = grp(env, parno + 1)) && env->error)
2425				return 0;
2426		}
2427		if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2428		{
2429			drop(env->disc, f);
2430			return 0;
2431		}
2432		e->re.group.size = c;
2433		e->re.group.expr.binary.left = f;
2434		if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2435		{
2436			drop(env->disc, e);
2437			return 0;
2438		}
2439		if (token(env) != T_CLOSE)
2440		{
2441			env->error = REG_EPAREN;
2442			return 0;
2443		}
2444		eat(env);
2445		env->parnest--;
2446		return rep(env, e, parno, parno);
2447	case '{':
2448		p = env->cursor;
2449		n = 1;
2450		while (c = *env->cursor)
2451		{
2452			if (c == '\\' && *(env->cursor + 1))
2453				env->cursor++;
2454			else if (c == '{')
2455				n++;
2456			else if (c == '}' && !--n)
2457				break;
2458			else if (c == env->delimiter || c == env->terminator)
2459				break;
2460			env->cursor++;
2461		}
2462		if (c != '}')
2463		{
2464			env->error = REG_EBRACE;
2465			return 0;
2466		}
2467		if (*++env->cursor != ')')
2468		{
2469			env->error = REG_EPAREN;
2470			return 0;
2471		}
2472		env->cursor++;
2473		env->parnest--;
2474		if (env->disc->re_version < REG_VERSION_EXEC)
2475		{
2476			env->error = REG_BADRPT;
2477			return 0;
2478		}
2479		if (!env->disc->re_execf)
2480			return 0;
2481		if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2482			return 0;
2483		e->re.exec.text = (const char*)p;
2484		e->re.exec.size = env->cursor - p - 2;
2485		if (!env->disc->re_compf)
2486			e->re.exec.data = 0;
2487		else
2488			e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2489		return e;
2490	case '0': case '1': case '2': case '3': case '4':
2491	case '5': case '6': case '7': case '8': case '9':
2492		c -= '0';
2493		while (isdigit(*env->cursor))
2494		{
2495			if (c > (INT_MAX / 10))
2496			{
2497				env->error = REG_ESUBREG;
2498				return 0;
2499			}
2500			c = c * 10 + *env->cursor++ - '0';
2501		}
2502		if (*env->cursor == ')')
2503		{
2504			env->cursor++;
2505			env->parnest--;
2506			env->token.len = 1;
2507			if (c > env->parno || !env->paren[c])
2508			{
2509				env->error = REG_ESUBREG;
2510				return 0;
2511			}
2512			env->paren[c]->re.group.back = 1;
2513			return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2514		}
2515		/*FALLTHROUGH*/
2516	default:
2517		env->error = REG_BADRPT;
2518		return 0;
2519	}
2520	p = env->pattern;
2521	i = env->type;
2522	if (x)
2523	{
2524		if (typ >= 0)
2525			env->type = typ;
2526		if (!(e = alt(env, parno, 0)))
2527			goto nope;
2528		env->flags = g;
2529		env->type = i;
2530	}
2531	c = token(env);
2532	env->parnest--;
2533	if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2534	{
2535		env->error = REG_EPAREN;
2536		goto nope;
2537	}
2538	eat(env);
2539	if (typ >= 0 && beg)
2540		env->pattern = env->cursor;
2541	if (!x)
2542	{
2543		if (typ >= 0)
2544			env->type = typ;
2545		return 0;
2546	}
2547	if (!(f = node(env, x, 0, 0, 0)))
2548	{
2549		drop(env->disc, e);
2550		goto nope;
2551	}
2552	f->re.group.expr.rex = e;
2553	if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2554	{
2555		if (stats(env, e))
2556		{
2557			drop(env->disc, f);
2558			if (!env->error)
2559				env->error = REG_ECOUNT;
2560			goto nope;
2561		}
2562		f->re.group.size = env->stats.m;
2563		memset(&env->stats, 0, sizeof(env->stats));
2564	}
2565	switch (x)
2566	{
2567	case REX_GROUP:
2568	case REX_GROUP_CUT:
2569		f = rep(env, f, parno, env->parno);
2570		break;
2571	}
2572	if (f)
2573		return f;
2574 nope:
2575	env->flags = g;
2576	env->pattern = p;
2577	env->type = i;
2578	return 0;
2579}
2580
2581static Rex_t*
2582seq(Cenv_t* env)
2583{
2584	Rex_t*		e;
2585	Rex_t*		f;
2586	Token_t		tok;
2587	int		c;
2588	int		i;
2589	int		n;
2590	int		x;
2591	int		parno;
2592	int		type;
2593	regflags_t	flags;
2594	unsigned char*	s;
2595	unsigned char*	p;
2596	unsigned char*	t;
2597	unsigned char*	u;
2598	unsigned char	buf[256];
2599
2600	for (;;)
2601	{
2602		s = buf;
2603		while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2604		{
2605			x = c;
2606			p = env->cursor;
2607			if (c >= 0)
2608			{
2609				n = 1;
2610				*s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2611			}
2612			else if (c == C_ESC || (env->flags & REG_ICASE))
2613			{
2614				c = (c == C_ESC) ? env->token.lex : mbchar(p);
2615				if (env->flags & REG_ICASE)
2616					c = towupper(c);
2617				if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2618					break;
2619				if ((n = mbconv((char*)s, c)) < 0)
2620					*s++ = c;
2621				else if (n)
2622					s += n;
2623				else
2624				{
2625					n = 1;
2626					*s++ = 0;
2627				}
2628			}
2629			else
2630			{
2631				n = env->token.len - env->token.esc;
2632				for (t = p, u = s + n; s < u; *s++ = *t++);
2633			}
2634			eat(env);
2635		}
2636		if (c == T_BAD)
2637			return 0;
2638		if (s > buf)
2639			switch (c)
2640			{
2641			case T_STAR:
2642			case T_PLUS:
2643			case T_LEFT:
2644			case T_QUES:
2645			case T_BANG:
2646				if ((s -= n) == buf)
2647					e = 0;
2648				else
2649				{
2650					i = s - buf;
2651					if (!(e = node(env, REX_STRING, 0, 0, i)))
2652						return 0;
2653					memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2654					e->re.string.size = i;
2655				}
2656				if (x >= 0)
2657				{
2658					if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2659					{
2660						drop(env->disc, e);
2661						return 0;
2662					}
2663					f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2664				}
2665				else
2666				{
2667					if (!(f = node(env, REX_STRING, 0, 0, n)))
2668						return 0;
2669					memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2670					f->re.string.size = n;
2671				}
2672				if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2673				{
2674					drop(env->disc, e);
2675					return 0;
2676				}
2677				if (e)
2678					f = cat(env, e, f);
2679				return f;
2680			default:
2681				c = s - buf;
2682				if (!(e = node(env, REX_STRING, 0, 0, c)))
2683					return 0;
2684				memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2685				e->re.string.size = c;
2686				return cat(env, e, seq(env));
2687			}
2688		else if (c > T_BACK)
2689		{
2690			eat(env);
2691			c -= T_BACK;
2692			if (c > env->parno || !env->paren[c])
2693			{
2694				env->error = REG_ESUBREG;
2695				return 0;
2696			}
2697			env->paren[c]->re.group.back = 1;
2698			e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2699		}
2700		else
2701			switch (c)
2702			{
2703			case T_AND:
2704			case T_CLOSE:
2705			case T_BAR:
2706			case T_END:
2707				return node(env, REX_NULL, 0, 0, 0);
2708			case T_DOLL:
2709				eat(env);
2710				e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2711				break;
2712			case T_CFLX:
2713				eat(env);
2714				if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2715					e = rep(env, e, 0, 0);
2716				break;
2717			case T_OPEN:
2718				tok = env->token;
2719				eat(env);
2720				flags = env->flags;
2721				type = env->type;
2722				if (env->token.att)
2723					env->flags |= REG_MINIMAL;
2724				env->parnest++;
2725				if (env->type == KRE)
2726					++env->parno;
2727				parno = ++env->parno;
2728				if (!(e = alt(env, parno + 1, 0)))
2729					break;
2730				if (e->type == REX_NULL && env->type == ERE && !(env->flags & (REG_NULL|REG_REGEXP)))
2731				{
2732					drop(env->disc, e);
2733					env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2734					return 0;
2735				}
2736				if (token(env) != T_CLOSE)
2737				{
2738					drop(env->disc, e);
2739					env->error = REG_EPAREN;
2740					return 0;
2741				}
2742				env->parnest--;
2743				eat(env);
2744				if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2745				{
2746					drop(env->disc, e);
2747					return 0;
2748				}
2749				if (parno < elementsof(env->paren))
2750					env->paren[parno] = f;
2751				f->re.group.back = 0;
2752				f->re.group.number = parno;
2753				f->re.group.expr.rex = e;
2754				if (tok.lex)
2755				{
2756					tok.push = 1;
2757					env->token = tok;
2758				}
2759				if (!(e = rep(env, f, parno, env->parno)))
2760					return 0;
2761				if (env->type == KRE)
2762				{
2763					if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2764					{
2765						drop(env->disc, e);
2766						return 0;
2767					}
2768					if (--parno < elementsof(env->paren))
2769						env->paren[parno] = f;
2770					f->re.group.back = 0;
2771					f->re.group.number = parno;
2772					f->re.group.expr.rex = e;
2773					e = f;
2774				}
2775				env->flags = flags;
2776				env->type = type;
2777				break;
2778			case T_GROUP:
2779				p = env->cursor;
2780				eat(env);
2781				flags = env->flags;
2782				type = env->type;
2783				if (!(e = grp(env, env->parno + 1)))
2784				{
2785					if (env->error)
2786						return 0;
2787					if (env->literal == env->pattern && env->literal == p)
2788						env->literal = env->cursor;
2789					continue;
2790				}
2791				env->flags = flags;
2792				env->type = type;
2793				break;
2794			case T_BRA:
2795				eat(env);
2796				if (e = bra(env))
2797					e = rep(env, e, 0, 0);
2798				break;
2799			case T_ALNUM:
2800			case T_ALNUM_NOT:
2801			case T_DIGIT:
2802			case T_DIGIT_NOT:
2803			case T_SPACE:
2804			case T_SPACE_NOT:
2805				eat(env);
2806				if (e = ccl(env, c))
2807					e = rep(env, e, 0, 0);
2808				break;
2809			case T_LT:
2810				eat(env);
2811				e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2812				break;
2813			case T_GT:
2814				eat(env);
2815				e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2816				break;
2817			case T_DOT:
2818				eat(env);
2819				e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2820				break;
2821			case T_DOTSTAR:
2822				eat(env);
2823				env->token.lex = T_STAR;
2824				env->token.push = 1;
2825				e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2826				break;
2827			case T_SLASHPLUS:
2828				eat(env);
2829				env->token.lex = T_PLUS;
2830				env->token.push = 1;
2831				if (e = node(env, REX_ONECHAR, 1, 1, 0))
2832				{
2833					e->re.onechar = '/';
2834					e = rep(env, e, 0, 0);
2835				}
2836				break;
2837			case T_WORD:
2838				eat(env);
2839				e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2840				break;
2841			case T_WORD_NOT:
2842				eat(env);
2843				e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2844				break;
2845			case T_BEG_STR:
2846				eat(env);
2847				e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2848				break;
2849			case T_END_STR:
2850				eat(env);
2851				e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2852				break;
2853			case T_FIN_STR:
2854				eat(env);
2855				e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2856				break;
2857			default:
2858				env->error = REG_BADRPT;
2859				return 0;
2860			}
2861		if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2862			e = cat(env, e, seq(env));
2863		return e;
2864	}
2865}
2866
2867static Rex_t*
2868con(Cenv_t* env)
2869{
2870	Rex_t*	e;
2871	Rex_t*	f;
2872	Rex_t*	g;
2873
2874	if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2875		return e;
2876	eat(env);
2877	if (!(f = con(env)))
2878	{
2879		drop(env->disc, e);
2880		return 0;
2881	}
2882	if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2883	{
2884		drop(env->disc, e);
2885		drop(env->disc, f);
2886		return 0;
2887	}
2888	g->re.group.expr.binary.left = e;
2889	g->re.group.expr.binary.right = f;
2890	return g;
2891}
2892
2893static Rex_t*
2894alt(Cenv_t* env, int number, int cond)
2895{
2896	Rex_t*	e;
2897	Rex_t*	f;
2898	Rex_t*	g;
2899
2900	if (!(e = con(env)))
2901		return 0;
2902	else if (token(env) != T_BAR)
2903	{
2904		if (!cond)
2905			return e;
2906		f = 0;
2907		if (e->type == REX_NULL)
2908			goto bad;
2909	}
2910	else
2911	{
2912		eat(env);
2913		if (!(f = alt(env, number, 0)))
2914		{
2915			drop(env->disc, e);
2916			return 0;
2917		}
2918		if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & (REG_NULL|REG_REGEXP)))
2919			goto bad;
2920		if (!cond && (g = trie(env, e, f)))
2921			return g;
2922	}
2923	if (!(g = node(env, REX_ALT, 0, 0, 0)))
2924	{
2925		env->error = REG_ESPACE;
2926		goto bad;
2927	}
2928	g->re.group.number = number;
2929	g->re.group.last = env->parno;
2930	g->re.group.expr.binary.left = e;
2931	g->re.group.expr.binary.right = f;
2932	return g;
2933 bad:
2934	drop(env->disc, e);
2935	drop(env->disc, f);
2936	if (!env->error)
2937		env->error = REG_ENULL;
2938	return 0;
2939}
2940
2941/*
2942 * add v to REX_BM tables
2943 */
2944
2945static void
2946bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2947{
2948	int	c;
2949	int	m;
2950	size_t	z;
2951
2952	for (m = 0; m < n; m++)
2953	{
2954		if (!(z = n - m - 1))
2955			z = HIT;
2956		c = v[m];
2957		a->re.bm.mask[m][c] |= b;
2958		if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2959			a->re.bm.skip[c] = z;
2960		if (a->flags & REG_ICASE)
2961		{
2962			if (isupper(c))
2963				c = tolower(c);
2964			else if (islower(c))
2965				c = toupper(c);
2966			else
2967				continue;
2968			a->re.bm.mask[m][c] |= b;
2969			if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2970				a->re.bm.skip[c] = z;
2971		}
2972	}
2973}
2974
2975/*
2976 * set up BM table from trie
2977 */
2978
2979static int
2980bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2981{
2982	do
2983	{
2984		v[m] = x->c;
2985		if (m >= (n - 1))
2986		{
2987			bmstr(env, a, v, n, b);
2988			if (!(b <<= 1))
2989			{
2990				b = 1;
2991				a->re.bm.complete = 0;
2992			}
2993			else if (x->son)
2994				a->re.bm.complete = 0;
2995		}
2996		else if (x->son)
2997			b = bmtrie(env, a, v, x->son, n, m + 1, b);
2998	} while (x = x->sib);
2999	return b;
3000}
3001
3002/*
3003 * rewrite the expression tree for some special cases
3004 * 1. it is a null expression - illegal
3005 * 2. max length fixed string found -- use BM algorithm
3006 * 3. it begins with an unanchored string - use KMP algorithm
3007 * 0 returned on success
3008 */
3009
3010static int
3011special(Cenv_t* env, regex_t* p)
3012{
3013	Rex_t*		a;
3014	Rex_t*		e;
3015	Rex_t*		t;
3016	Rex_t*		x;
3017	Rex_t*		y;
3018	unsigned char*	s;
3019	int*		f;
3020	int		n;
3021	int		m;
3022	int		k;
3023
3024	DEBUG_INIT();
3025	if (e = p->env->rex)
3026	{
3027		if ((x = env->stats.x) && x->re.string.size < 3)
3028			x = 0;
3029		if ((t = env->stats.y) && t->re.trie.min < 3)
3030			t = 0;
3031		if (x && t)
3032		{
3033			if (x->re.string.size >= t->re.trie.min)
3034				t = 0;
3035			else
3036				x = 0;
3037		}
3038		if (x || t)
3039		{
3040			Bm_mask_t**	mask;
3041			Bm_mask_t*	h;
3042			unsigned char*	v;
3043			size_t*		q;
3044			size_t		l;
3045			int		i;
3046			int		j;
3047
3048			if (x)
3049			{
3050				y = x;
3051				n = m = x->re.string.size;
3052				l = env->stats.l;
3053			}
3054			else
3055			{
3056				y = t;
3057				n = t->re.trie.min;
3058				m = t->re.trie.max;
3059				l = env->stats.k;
3060			}
3061			if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3062				return 1;
3063			if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3064			{
3065				alloc(env->disc, q, 0);
3066				return 1;
3067			}
3068			a->flags = y->flags;
3069			a->map = y->map;
3070			a->re.bm.size = n;
3071			a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3072			a->re.bm.left = l - 1;
3073			a->re.bm.right = env->stats.m - l - n;
3074			a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3075			h = (Bm_mask_t*)&a->re.bm.mask[n];
3076			a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3077			a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3078			for (m = 0; m <= UCHAR_MAX; m++)
3079				a->re.bm.skip[m] = n;
3080			a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3081			for (i = 1; i <= n; i++)
3082				a->re.bm.fail[i] = 2 * n - i;
3083			mask = a->re.bm.mask;
3084			for (m = 0; m < n; m++)
3085			{
3086				mask[m] = h;
3087				h += UCHAR_MAX + 1;
3088			}
3089			if (x)
3090				bmstr(env, a, x->re.string.base, n, 1);
3091			else
3092			{
3093				v = (unsigned char*)q;
3094				memset(v, 0, n);
3095				m = 1;
3096				for (i = 0; i <= UCHAR_MAX; i++)
3097					if (t->re.trie.root[i])
3098						m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3099			}
3100			mask--;
3101			memset(q, 0, n * sizeof(*q));
3102			for (k = (j = n) + 1; j > 0; j--, k--)
3103			{
3104				DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3105				for (q[j] = k; k <= n; k = q[k])
3106				{
3107					for (m = 0; m <= UCHAR_MAX; m++)
3108						if (mask[k][m] == mask[j][m])
3109						{
3110							DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3111							goto cut;
3112						}
3113					DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3114					if (a->re.bm.fail[k] > n - j)
3115						a->re.bm.fail[k] = n - j;
3116				}
3117			cut:	;
3118			}
3119			for (i = 1; i <= n; i++)
3120				if (a->re.bm.fail[i] > n + k - i)
3121				{
3122					DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3123					a->re.bm.fail[i] = n + k - i;
3124				}
3125#if _AST_REGEX_DEBUG
3126			if (DEBUG_TEST(0x0020,(1),(0)))
3127			{
3128				sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3129				for (m = 0; m < n; m++)
3130					for (i = 1; i <= UCHAR_MAX; i++)
3131						if (a->re.bm.mask[m][i])
3132							sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3133				for (i = ' '; i <= UCHAR_MAX; i++)
3134					if (a->re.bm.skip[i] >= HIT)
3135						sfprintf(sfstderr, "SKIP: ['%c'] =   *\n", i);
3136					else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3137						sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3138				for (j = 31; j >= 0; j--)
3139				{
3140					sfprintf(sfstderr, "      ");
3141				next:
3142					for (m = 0; m < n; m++)
3143					{
3144						for (i = 0040; i < 0177; i++)
3145							if (a->re.bm.mask[m][i] & (1 << j))
3146							{
3147								sfprintf(sfstderr, "  %c", i);
3148								break;
3149							}
3150						if (i >= 0177)
3151						{
3152							if (j > 0)
3153							{
3154								j--;
3155								goto next;
3156							}
3157							sfprintf(sfstderr, "  ?");
3158						}
3159					}
3160					sfprintf(sfstderr, "\n");
3161				}
3162				sfprintf(sfstderr, "FAIL: ");
3163				for (m = 1; m <= n; m++)
3164					sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3165				sfprintf(sfstderr, "\n");
3166			}
3167#endif
3168			alloc(env->disc, q, 0);
3169			a->next = e;
3170			p->env->rex = a;
3171			return 0;
3172		}
3173		switch (e->type)
3174		{
3175		case REX_BEG:
3176			if (env->flags & REG_NEWLINE)
3177				return 0;
3178			break;
3179		case REX_GROUP:
3180			if (env->stats.b)
3181				return 0;
3182			e = e->re.group.expr.rex;
3183			if (e->type != REX_DOT)
3184				return 0;
3185			/*FALLTHROUGH*/
3186		case REX_DOT:
3187			if (e->lo == 0 && e->hi == RE_DUP_INF)
3188				break;
3189			return 0;
3190		case REX_NULL:
3191			if (env->flags & (REG_NULL|REG_REGEXP))
3192				break;
3193			env->error = REG_ENULL;
3194			return 1;
3195		case REX_STRING:
3196			if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3197				return 0;
3198			s = e->re.string.base;
3199			n = e->re.string.size;
3200			if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3201				return 1;
3202			a->flags = e->flags;
3203			a->map = e->map;
3204			f = a->re.string.fail;
3205			memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3206			s = a->re.string.base;
3207			a->re.string.size = n;
3208			f[0] = m = -1;
3209			for (k = 1; k < n; k++)
3210			{
3211				while (m >= 0 && s[m+1] != s[k])
3212					m = f[m];
3213				if (s[m+1] == s[k])
3214					m++;
3215				f[k] = m;
3216			}
3217			a->next = e->next;
3218			p->env->rex = a;
3219			e->next = 0;
3220			drop(env->disc, e);
3221			break;
3222		default:
3223			return 0;
3224		}
3225	}
3226	p->env->once = 1;
3227	return 0;
3228}
3229
3230int
3231regcomp(regex_t* p, const char* pattern, regflags_t flags)
3232{
3233	Rex_t*			e;
3234	Rex_t*			f;
3235	regdisc_t*		disc;
3236	unsigned char*		fold;
3237	int			i;
3238	Cenv_t			env;
3239
3240	if (!p)
3241		return REG_BADPAT;
3242	if (flags & REG_DISCIPLINE)
3243	{
3244		flags &= ~REG_DISCIPLINE;
3245		disc = p->re_disc;
3246	}
3247	else
3248		disc = &state.disc;
3249	if (!disc->re_errorlevel)
3250		disc->re_errorlevel = 2;
3251	p->env = 0;
3252	if (!pattern)
3253		return fatal(disc, REG_BADPAT, pattern);
3254	if (!state.initialized)
3255	{
3256		state.initialized = 1;
3257		for (i = 0; i < elementsof(state.escape); i++)
3258			state.magic[state.escape[i].key] = state.escape[i].val;
3259	}
3260	if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3261	{
3262		if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3263			return fatal(disc, REG_ESPACE, pattern);
3264		for (i = 0; i <= UCHAR_MAX; i++)
3265			fold[i] = toupper(i);
3266		LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3267	}
3268 again:
3269	if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3270		return fatal(disc, REG_ESPACE, pattern);
3271	memset(p->env, 0, sizeof(*p->env));
3272	memset(&env, 0, sizeof(env));
3273	env.regex = p;
3274	env.flags = flags;
3275	env.disc = p->env->disc = disc;
3276	if (env.flags & REG_AUGMENTED)
3277		env.flags |= REG_EXTENDED;
3278	env.mappeddot = '.';
3279	env.mappednewline = '\n';
3280	env.mappedslash = '/';
3281	if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3282	{
3283		env.map = disc->re_map;
3284		env.MAP = p->env->fold;
3285		for (i = 0; i <= UCHAR_MAX; i++)
3286		{
3287			env.MAP[i] = fold[env.map[i]];
3288			if (env.map[i] == '.')
3289				env.mappeddot = i;
3290			if (env.map[i] == '\n')
3291				env.mappednewline = i;
3292			if (env.map[i] == '/')
3293				env.mappedslash = i;
3294		}
3295	}
3296	else
3297		env.MAP = fold;
3298	env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3299	env.explicit = -1;
3300	if (env.flags & REG_SHELL)
3301	{
3302		if (env.flags & REG_SHELL_PATH)
3303			env.explicit = env.mappedslash;
3304		if (!(env.flags & REG_SHELL_ESCAPED))
3305			env.flags |= REG_CLASS_ESCAPE;
3306		env.flags |= REG_LENIENT|REG_NULL;
3307		env.type = env.type == BRE ? SRE : KRE;
3308	}
3309	else
3310		env.flags &= ~(REG_SHELL_DOT|REG_SHELL_ESCAPED|REG_SHELL_GROUP|REG_SHELL_PATH);
3311	if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3312		env.explicit = env.mappednewline;
3313	p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3314	env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3315	env.token.lex = 0;
3316	env.token.push = 0;
3317	if (env.flags & REG_DELIMITED)
3318	{
3319		switch (env.delimiter = *pattern++)
3320		{
3321		case 0:
3322		case '\\':
3323		case '\n':
3324		case '\r':
3325			env.error = REG_EDELIM;
3326			goto bad;
3327		}
3328		env.terminator = '\n';
3329	}
3330	env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3331	if (!(p->env->rex = alt(&env, 1, 0)))
3332		goto bad;
3333	if (env.parnest)
3334	{
3335		env.error = REG_EPAREN;
3336		goto bad;
3337	}
3338	if ((env.flags & REG_LEFT) && p->env->rex->type != REX_BEG)
3339	{
3340		if (p->env->rex->type == REX_ALT)
3341			env.flags &= ~REG_FIRST;
3342		if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3343		{
3344			regfree(p);
3345			return fatal(disc, REG_ESPACE, pattern);
3346		}
3347		e->next = p->env->rex;
3348		p->env->rex = e;
3349		p->env->once = 1;
3350	}
3351	for (e = p->env->rex; e->next; e = e->next);
3352	p->env->done.type = REX_DONE;
3353	p->env->done.flags = e->flags;
3354	if ((env.flags & REG_RIGHT) && e->type != REX_END)
3355	{
3356		if (p->env->rex->type == REX_ALT)
3357			env.flags &= ~REG_FIRST;
3358		if (!(f = node(&env, REX_END, 0, 0, 0)))
3359		{
3360			regfree(p);
3361			return fatal(disc, REG_ESPACE, pattern);
3362		}
3363		f->flags = e->flags;
3364		f->map = e->map;
3365		e->next = f;
3366	}
3367	if (stats(&env, p->env->rex))
3368	{
3369		if (!env.error)
3370			env.error = REG_ECOUNT;
3371		goto bad;
3372	}
3373	if (env.stats.b)
3374		p->env->hard = p->env->separate = 1;
3375	else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3376		p->env->hard = 1;
3377	if (p->env->hard || env.stats.c || env.stats.i)
3378		p->env->stats.re_min = p->env->stats.re_max = -1;
3379	else
3380	{
3381		if (!(p->env->stats.re_min = env.stats.m))
3382			p->env->stats.re_min = -1;
3383		if (!(p->env->stats.re_max = env.stats.n))
3384			p->env->stats.re_max = -1;
3385	}
3386	if (special(&env, p))
3387		goto bad;
3388	serialize(&env, p->env->rex, 1);
3389	p->re_nsub = env.stats.p;
3390	if (env.type == KRE)
3391		p->re_nsub /= 2;
3392	if (env.flags & REG_DELIMITED)
3393	{
3394		p->re_npat = env.cursor - env.pattern + 1;
3395		if (*env.cursor == env.delimiter)
3396			p->re_npat++;
3397		else if (env.flags & REG_MUSTDELIM)
3398		{
3399			env.error = REG_EDELIM;
3400			goto bad;
3401		}
3402		else
3403			env.flags &= ~REG_DELIMITED;
3404	}
3405	p->env->explicit = env.explicit;
3406	p->env->flags = env.flags & REG_COMP;
3407	p->env->min = env.stats.m;
3408	p->env->nsub = env.stats.p + env.stats.u;
3409	p->env->refs = 1;
3410	return 0;
3411 bad:
3412	regfree(p);
3413	if (!env.error)
3414		env.error = REG_ESPACE;
3415	if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3416	{
3417		flags |= REG_LITERAL;
3418		pattern = (const char*)env.literal;
3419		goto again;
3420	}
3421	return fatal(disc, env.error, pattern);
3422}
3423
3424/*
3425 * regcomp() on sized pattern
3426 * the lazy way around adding and checking env.end
3427 */
3428
3429int
3430regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3431{
3432	char*	s;
3433	int	r;
3434
3435	if (!(s = malloc(size + 1)))
3436		return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3437	memcpy(s, pattern, size);
3438	s[size] = 0;
3439	r = regcomp(p, s, flags);
3440	free(s);
3441	return r;
3442}
3443
3444/*
3445 * combine two compiled regular expressions if possible,
3446 * replacing first with the combination and freeing second.
3447 * return 0 on success.
3448 * the only combinations handled are building a Trie
3449 * from String|Kmp|Trie and String|Kmp
3450 */
3451
3452int
3453regcomb(regex_t* p, regex_t* q)
3454{
3455	Rex_t*	e = p->env->rex;
3456	Rex_t*	f = q->env->rex;
3457	Rex_t*	g;
3458	Rex_t*	h;
3459	Cenv_t	env;
3460
3461	if (!e || !f)
3462		return fatal(p->env->disc, REG_BADPAT, NiL);
3463	if (p->env->separate || q->env->separate)
3464		return REG_ESUBREG;
3465	memset(&env, 0, sizeof(env));
3466	env.disc = p->env->disc;
3467	if (e->type == REX_BM)
3468	{
3469		p->env->rex = e->next;
3470		e->next = 0;
3471		drop(env.disc, e);
3472		e = p->env->rex;
3473	}
3474	if (f->type == REX_BM)
3475	{
3476		q->env->rex = f->next;
3477		f->next = 0;
3478		drop(env.disc, f);
3479		f = q->env->rex;
3480	}
3481	if (e->type == REX_BEG && f->type == REX_BEG)
3482	{
3483		p->env->flags |= REG_LEFT;
3484		p->env->rex = e->next;
3485		e->next = 0;
3486		drop(env.disc, e);
3487		e = p->env->rex;
3488		q->env->rex = f->next;
3489		f->next = 0;
3490		drop(env.disc, f);
3491		f = q->env->rex;
3492	}
3493	for (g = e; g->next; g = g->next);
3494	for (h = f; h->next; h = h->next);
3495	if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3496	{
3497		p->env->flags |= REG_RIGHT;
3498		drop(env.disc, g->next);
3499		g->next = 0;
3500		drop(env.disc, h->next);
3501		h->next = 0;
3502	}
3503	if (!(g = trie(&env, f, e)))
3504		return fatal(p->env->disc, REG_BADPAT, NiL);
3505	p->env->rex = g;
3506	if (!q->env->once)
3507		p->env->once = 0;
3508	q->env->rex = 0;
3509	if (p->env->flags & REG_LEFT)
3510	{
3511		if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3512		{
3513			regfree(p);
3514			return fatal(p->env->disc, REG_ESPACE, NiL);
3515		}
3516		e->next = p->env->rex;
3517		p->env->rex = e;
3518		p->env->once = 1;
3519	}
3520	if (p->env->flags & REG_RIGHT)
3521	{
3522		for (f = p->env->rex; f->next; f = f->next);
3523		if (f->type != REX_END)
3524		{
3525			if (!(e = node(&env, REX_END, 0, 0, 0)))
3526			{
3527				regfree(p);
3528				return fatal(p->env->disc, REG_ESPACE, NiL);
3529			}
3530			f->next = e;
3531		}
3532	}
3533	env.explicit = p->env->explicit;
3534	env.flags = p->env->flags;
3535	env.disc = p->env->disc;
3536	if (stats(&env, p->env->rex))
3537	{
3538		regfree(p);
3539		return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3540	}
3541	if (special(&env, p))
3542	{
3543		regfree(p);
3544		return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3545	}
3546	p->env->min = g->re.trie.min;
3547	return 0;
3548}
3549
3550/*
3551 * copy a reference of p into q
3552 * p and q may then have separate regsubcomp() instantiations
3553 */
3554
3555int
3556regdup(regex_t* p, regex_t* q)
3557{
3558	if (!p || !q)
3559		return REG_BADPAT;
3560	*q = *p;
3561	p->env->refs++;
3562	q->re_sub = 0;
3563	return 0;
3564}
3565