1/*
2  regcomp.c - TRE POSIX compatible regex compilation functions.
3
4  Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5  All rights reserved.
6
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10
11    1. Redistributions of source code must retain the above copyright
12       notice, this list of conditions and the following disclaimer.
13
14    2. Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17
18  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30*/
31
32#include <string.h>
33#include <stdlib.h>
34#include <regex.h>
35#include <limits.h>
36#include <stdint.h>
37#include <ctype.h>
38
39#include "tre.h"
40
41#include <assert.h>
42
43/***********************************************************************
44 from tre-compile.h
45***********************************************************************/
46
47typedef struct {
48  int position;
49  int code_min;
50  int code_max;
51  int *tags;
52  int assertions;
53  tre_ctype_t class;
54  tre_ctype_t *neg_classes;
55  int backref;
56} tre_pos_and_tags_t;
57
58
59/***********************************************************************
60 from tre-ast.c and tre-ast.h
61***********************************************************************/
62
63/* The different AST node types. */
64typedef enum {
65  LITERAL,
66  CATENATION,
67  ITERATION,
68  UNION
69} tre_ast_type_t;
70
71/* Special subtypes of TRE_LITERAL. */
72#define EMPTY	  -1   /* Empty leaf (denotes empty string). */
73#define ASSERTION -2   /* Assertion leaf. */
74#define TAG	  -3   /* Tag leaf. */
75#define BACKREF	  -4   /* Back reference leaf. */
76
77#define IS_SPECIAL(x)	((x)->code_min < 0)
78#define IS_EMPTY(x)	((x)->code_min == EMPTY)
79#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80#define IS_TAG(x)	((x)->code_min == TAG)
81#define IS_BACKREF(x)	((x)->code_min == BACKREF)
82
83
84/* A generic AST node.  All AST nodes consist of this node on the top
85   level with `obj' pointing to the actual content. */
86typedef struct {
87  tre_ast_type_t type;   /* Type of the node. */
88  void *obj;             /* Pointer to actual node. */
89  int nullable;
90  int submatch_id;
91  int num_submatches;
92  int num_tags;
93  tre_pos_and_tags_t *firstpos;
94  tre_pos_and_tags_t *lastpos;
95} tre_ast_node_t;
96
97
98/* A "literal" node.  These are created for assertions, back references,
99   tags, matching parameter settings, and all expressions that match one
100   character. */
101typedef struct {
102  long code_min;
103  long code_max;
104  int position;
105  tre_ctype_t class;
106  tre_ctype_t *neg_classes;
107} tre_literal_t;
108
109/* A "catenation" node.	 These are created when two regexps are concatenated.
110   If there are more than one subexpressions in sequence, the `left' part
111   holds all but the last, and `right' part holds the last subexpression
112   (catenation is left associative). */
113typedef struct {
114  tre_ast_node_t *left;
115  tre_ast_node_t *right;
116} tre_catenation_t;
117
118/* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}"
119   operators. */
120typedef struct {
121  /* Subexpression to match. */
122  tre_ast_node_t *arg;
123  /* Minimum number of consecutive matches. */
124  int min;
125  /* Maximum number of consecutive matches. */
126  int max;
127  /* If 0, match as many characters as possible, if 1 match as few as
128     possible.	Note that this does not always mean the same thing as
129     matching as many/few repetitions as possible. */
130  unsigned int minimal:1;
131} tre_iteration_t;
132
133/* An "union" node.  These are created for the "|" operator. */
134typedef struct {
135  tre_ast_node_t *left;
136  tre_ast_node_t *right;
137} tre_union_t;
138
139
140static tre_ast_node_t *
141tre_ast_new_node(tre_mem_t mem, int type, void *obj)
142{
143	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144	if (!node || !obj)
145		return 0;
146	node->obj = obj;
147	node->type = type;
148	node->nullable = -1;
149	node->submatch_id = -1;
150	return node;
151}
152
153static tre_ast_node_t *
154tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155{
156	tre_ast_node_t *node;
157	tre_literal_t *lit;
158
159	lit = tre_mem_calloc(mem, sizeof *lit);
160	node = tre_ast_new_node(mem, LITERAL, lit);
161	if (!node)
162		return 0;
163	lit->code_min = code_min;
164	lit->code_max = code_max;
165	lit->position = position;
166	return node;
167}
168
169static tre_ast_node_t *
170tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171{
172	tre_ast_node_t *node;
173	tre_iteration_t *iter;
174
175	iter = tre_mem_calloc(mem, sizeof *iter);
176	node = tre_ast_new_node(mem, ITERATION, iter);
177	if (!node)
178		return 0;
179	iter->arg = arg;
180	iter->min = min;
181	iter->max = max;
182	iter->minimal = minimal;
183	node->num_submatches = arg->num_submatches;
184	return node;
185}
186
187static tre_ast_node_t *
188tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189{
190	tre_ast_node_t *node;
191	tre_union_t *un;
192
193	if (!left)
194		return right;
195	un = tre_mem_calloc(mem, sizeof *un);
196	node = tre_ast_new_node(mem, UNION, un);
197	if (!node || !right)
198		return 0;
199	un->left = left;
200	un->right = right;
201	node->num_submatches = left->num_submatches + right->num_submatches;
202	return node;
203}
204
205static tre_ast_node_t *
206tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207{
208	tre_ast_node_t *node;
209	tre_catenation_t *cat;
210
211	if (!left)
212		return right;
213	cat = tre_mem_calloc(mem, sizeof *cat);
214	node = tre_ast_new_node(mem, CATENATION, cat);
215	if (!node)
216		return 0;
217	cat->left = left;
218	cat->right = right;
219	node->num_submatches = left->num_submatches + right->num_submatches;
220	return node;
221}
222
223
224/***********************************************************************
225 from tre-stack.c and tre-stack.h
226***********************************************************************/
227
228typedef struct tre_stack_rec tre_stack_t;
229
230/* Creates a new stack object.	`size' is initial size in bytes, `max_size'
231   is maximum size, and `increment' specifies how much more space will be
232   allocated with realloc() if all space gets used up.	Returns the stack
233   object or NULL if out of memory. */
234static tre_stack_t *
235tre_stack_new(int size, int max_size, int increment);
236
237/* Frees the stack object. */
238static void
239tre_stack_destroy(tre_stack_t *s);
240
241/* Returns the current number of objects in the stack. */
242static int
243tre_stack_num_objects(tre_stack_t *s);
244
245/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246   `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
247   This tries to realloc() more space before failing if maximum size
248   has not yet been reached.  Returns REG_OK if successful. */
249#define declare_pushf(typetag, type)					      \
250  static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251
252declare_pushf(voidptr, void *);
253declare_pushf(int, int);
254
255/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256   element off of stack `s' and returns it.  The stack must not be
257   empty. */
258#define declare_popf(typetag, type)		  \
259  static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260
261declare_popf(voidptr, void *);
262declare_popf(int, int);
263
264/* Just to save some typing. */
265#define STACK_PUSH(s, typetag, value)					      \
266  do									      \
267    {									      \
268      status = tre_stack_push_ ## typetag(s, value);			      \
269    }									      \
270  while (/*CONSTCOND*/0)
271
272#define STACK_PUSHX(s, typetag, value)					      \
273  {									      \
274    status = tre_stack_push_ ## typetag(s, value);			      \
275    if (status != REG_OK)						      \
276      break;								      \
277  }
278
279#define STACK_PUSHR(s, typetag, value)					      \
280  {									      \
281    reg_errcode_t _status;						      \
282    _status = tre_stack_push_ ## typetag(s, value);			      \
283    if (_status != REG_OK)						      \
284      return _status;							      \
285  }
286
287union tre_stack_item {
288  void *voidptr_value;
289  int int_value;
290};
291
292struct tre_stack_rec {
293  int size;
294  int max_size;
295  int increment;
296  int ptr;
297  union tre_stack_item *stack;
298};
299
300
301static tre_stack_t *
302tre_stack_new(int size, int max_size, int increment)
303{
304  tre_stack_t *s;
305
306  s = xmalloc(sizeof(*s));
307  if (s != NULL)
308    {
309      s->stack = xmalloc(sizeof(*s->stack) * size);
310      if (s->stack == NULL)
311	{
312	  xfree(s);
313	  return NULL;
314	}
315      s->size = size;
316      s->max_size = max_size;
317      s->increment = increment;
318      s->ptr = 0;
319    }
320  return s;
321}
322
323static void
324tre_stack_destroy(tre_stack_t *s)
325{
326  xfree(s->stack);
327  xfree(s);
328}
329
330static int
331tre_stack_num_objects(tre_stack_t *s)
332{
333  return s->ptr;
334}
335
336static reg_errcode_t
337tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338{
339  if (s->ptr < s->size)
340    {
341      s->stack[s->ptr] = value;
342      s->ptr++;
343    }
344  else
345    {
346      if (s->size >= s->max_size)
347	{
348	  return REG_ESPACE;
349	}
350      else
351	{
352	  union tre_stack_item *new_buffer;
353	  int new_size;
354	  new_size = s->size + s->increment;
355	  if (new_size > s->max_size)
356	    new_size = s->max_size;
357	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358	  if (new_buffer == NULL)
359	    {
360	      return REG_ESPACE;
361	    }
362	  assert(new_size > s->size);
363	  s->size = new_size;
364	  s->stack = new_buffer;
365	  tre_stack_push(s, value);
366	}
367    }
368  return REG_OK;
369}
370
371#define define_pushf(typetag, type)  \
372  declare_pushf(typetag, type) {     \
373    union tre_stack_item item;	     \
374    item.typetag ## _value = value;  \
375    return tre_stack_push(s, item);  \
376}
377
378define_pushf(int, int)
379define_pushf(voidptr, void *)
380
381#define define_popf(typetag, type)		    \
382  declare_popf(typetag, type) {			    \
383    return s->stack[--s->ptr].typetag ## _value;    \
384  }
385
386define_popf(int, int)
387define_popf(voidptr, void *)
388
389
390/***********************************************************************
391 from tre-parse.c and tre-parse.h
392***********************************************************************/
393
394/* Parse context. */
395typedef struct {
396	/* Memory allocator. The AST is allocated using this. */
397	tre_mem_t mem;
398	/* Stack used for keeping track of regexp syntax. */
399	tre_stack_t *stack;
400	/* The parsed node after a parse function returns. */
401	tre_ast_node_t *n;
402	/* Position in the regexp pattern after a parse function returns. */
403	const char *s;
404	/* The first character of the last subexpression parsed. */
405	const char *start;
406	/* Current submatch ID. */
407	int submatch_id;
408	/* Current position (number of literal). */
409	int position;
410	/* The highest back reference or -1 if none seen so far. */
411	int max_backref;
412	/* Compilation flags. */
413	int cflags;
414} tre_parse_ctx_t;
415
416/* Some macros for expanding \w, \s, etc. */
417static const struct {
418	char c;
419	const char *expansion;
420} tre_macros[] = {
421	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425	{ 0, 0 }
426};
427
428/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429   must have at least `len' items.  Sets buf[0] to zero if the there
430   is no match in `tre_macros'. */
431static const char *tre_expand_macro(const char *s)
432{
433	int i;
434	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435	return tre_macros[i].expansion;
436}
437
438static int
439tre_compare_lit(const void *a, const void *b)
440{
441	const tre_literal_t *const *la = a;
442	const tre_literal_t *const *lb = b;
443	/* assumes the range of valid code_min is < INT_MAX */
444	return la[0]->code_min - lb[0]->code_min;
445}
446
447struct literals {
448	tre_mem_t mem;
449	tre_literal_t **a;
450	int len;
451	int cap;
452};
453
454static tre_literal_t *tre_new_lit(struct literals *p)
455{
456	tre_literal_t **a;
457	if (p->len >= p->cap) {
458		if (p->cap >= 1<<15)
459			return 0;
460		p->cap *= 2;
461		a = xrealloc(p->a, p->cap * sizeof *p->a);
462		if (!a)
463			return 0;
464		p->a = a;
465	}
466	a = p->a + p->len++;
467	*a = tre_mem_calloc(p->mem, sizeof **a);
468	return *a;
469}
470
471static int add_icase_literals(struct literals *ls, int min, int max)
472{
473	tre_literal_t *lit;
474	int b, e, c;
475	for (c=min; c<=max; ) {
476		/* assumes islower(c) and isupper(c) are exclusive
477		   and toupper(c)!=c if islower(c).
478		   multiple opposite case characters are not supported */
479		if (tre_islower(c)) {
480			b = e = tre_toupper(c);
481			for (c++, e++; c<=max; c++, e++)
482				if (tre_toupper(c) != e) break;
483		} else if (tre_isupper(c)) {
484			b = e = tre_tolower(c);
485			for (c++, e++; c<=max; c++, e++)
486				if (tre_tolower(c) != e) break;
487		} else {
488			c++;
489			continue;
490		}
491		lit = tre_new_lit(ls);
492		if (!lit)
493			return -1;
494		lit->code_min = b;
495		lit->code_max = e-1;
496		lit->position = -1;
497	}
498	return 0;
499}
500
501
502/* Maximum number of character classes in a negated bracket expression. */
503#define MAX_NEG_CLASSES 64
504
505struct neg {
506	int negate;
507	int len;
508	tre_ctype_t a[MAX_NEG_CLASSES];
509};
510
511// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512
513/*
514bracket grammar:
515Bracket  =  '[' List ']'  |  '[^' List ']'
516List     =  Term  |  List Term
517Term     =  Char  |  Range  |  Chclass  |  Eqclass
518Range    =  Char '-' Char  |  Char '-' '-'
519Char     =  Coll  |  coll_single
520Meta     =  ']'  |  '-'
521Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
522Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
523Chclass  =  '[:' class ':]'
524
525coll_single is a single char collating element but it can be
526 '-' only at the beginning or end of a List and
527 ']' only at the beginning of a List and
528 '^' anywhere except after the openning '['
529*/
530
531static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532{
533	const char *start = s;
534	tre_ctype_t class;
535	int min, max;
536	wchar_t wc;
537	int len;
538
539	for (;;) {
540		class = 0;
541		len = mbtowc(&wc, s, -1);
542		if (len <= 0)
543			return *s ? REG_BADPAT : REG_EBRACK;
544		if (*s == ']' && s != start) {
545			ctx->s = s+1;
546			return REG_OK;
547		}
548		if (*s == '-' && s != start && s[1] != ']' &&
549		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550		    (s[1] != '-' || s[2] == ']'))
551			return REG_ERANGE;
552		if (*s == '[' && (s[1] == '.' || s[1] == '='))
553			/* collating symbols and equivalence classes are not supported */
554			return REG_ECOLLATE;
555		if (*s == '[' && s[1] == ':') {
556			char tmp[CHARCLASS_NAME_MAX+1];
557			s += 2;
558			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559				if (s[len] == ':') {
560					memcpy(tmp, s, len);
561					tmp[len] = 0;
562					class = tre_ctype(tmp);
563					break;
564				}
565			}
566			if (!class || s[len+1] != ']')
567				return REG_ECTYPE;
568			min = 0;
569			max = TRE_CHAR_MAX;
570			s += len+2;
571		} else {
572			min = max = wc;
573			s += len;
574			if (*s == '-' && s[1] != ']') {
575				s++;
576				len = mbtowc(&wc, s, -1);
577				max = wc;
578				/* XXX - Should use collation order instead of
579				   encoding values in character ranges. */
580				if (len <= 0 || min > max)
581					return REG_ERANGE;
582				s += len;
583			}
584		}
585
586		if (class && neg->negate) {
587			if (neg->len >= MAX_NEG_CLASSES)
588				return REG_ESPACE;
589			neg->a[neg->len++] = class;
590		} else  {
591			tre_literal_t *lit = tre_new_lit(ls);
592			if (!lit)
593				return REG_ESPACE;
594			lit->code_min = min;
595			lit->code_max = max;
596			lit->class = class;
597			lit->position = -1;
598
599			/* Add opposite-case codepoints if REG_ICASE is present.
600			   It seems that POSIX requires that bracket negation
601			   should happen before case-folding, but most practical
602			   implementations do it the other way around. Changing
603			   the order would need efficient representation of
604			   case-fold ranges and bracket range sets even with
605			   simple patterns so this is ok for now. */
606			if (ctx->cflags & REG_ICASE && !class)
607				if (add_icase_literals(ls, min, max))
608					return REG_ESPACE;
609		}
610	}
611}
612
613static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614{
615	int i, max, min, negmax, negmin;
616	tre_ast_node_t *node = 0, *n;
617	tre_ctype_t *nc = 0;
618	tre_literal_t *lit;
619	struct literals ls;
620	struct neg neg;
621	reg_errcode_t err;
622
623	ls.mem = ctx->mem;
624	ls.len = 0;
625	ls.cap = 32;
626	ls.a = xmalloc(ls.cap * sizeof *ls.a);
627	if (!ls.a)
628		return REG_ESPACE;
629	neg.len = 0;
630	neg.negate = *s == '^';
631	if (neg.negate)
632		s++;
633
634	err = parse_bracket_terms(ctx, s, &ls, &neg);
635	if (err != REG_OK)
636		goto parse_bracket_done;
637
638	if (neg.negate) {
639		/* Sort the array if we need to negate it. */
640		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
641		/* extra lit for the last negated range */
642		lit = tre_new_lit(&ls);
643		if (!lit) {
644			err = REG_ESPACE;
645			goto parse_bracket_done;
646		}
647		lit->code_min = TRE_CHAR_MAX+1;
648		lit->code_max = TRE_CHAR_MAX+1;
649		lit->position = -1;
650		/* negated classes */
651		if (neg.len) {
652			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
653			if (!nc) {
654				err = REG_ESPACE;
655				goto parse_bracket_done;
656			}
657			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
658			nc[neg.len] = 0;
659		}
660	}
661
662	/* Build a union of the items in the array, negated if necessary. */
663	negmax = negmin = 0;
664	for (i = 0; i < ls.len; i++) {
665		lit = ls.a[i];
666		min = lit->code_min;
667		max = lit->code_max;
668		if (neg.negate) {
669			if (min <= negmin) {
670				/* Overlap. */
671				negmin = MAX(max + 1, negmin);
672				continue;
673			}
674			negmax = min - 1;
675			lit->code_min = negmin;
676			lit->code_max = negmax;
677			negmin = max + 1;
678		}
679		lit->position = ctx->position;
680		lit->neg_classes = nc;
681		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
682		node = tre_ast_new_union(ctx->mem, node, n);
683		if (!node) {
684			err = REG_ESPACE;
685			break;
686		}
687	}
688
689parse_bracket_done:
690	xfree(ls.a);
691	ctx->position++;
692	ctx->n = node;
693	return err;
694}
695
696static const char *parse_dup_count(const char *s, int *n)
697{
698	*n = -1;
699	if (!isdigit(*s))
700		return s;
701	*n = 0;
702	for (;;) {
703		*n = 10 * *n + (*s - '0');
704		s++;
705		if (!isdigit(*s) || *n > RE_DUP_MAX)
706			break;
707	}
708	return s;
709}
710
711static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
712{
713	int min, max;
714
715	s = parse_dup_count(s, &min);
716	if (*s == ',')
717		s = parse_dup_count(s+1, &max);
718	else
719		max = min;
720
721	if (
722		(max < min && max >= 0) ||
723		max > RE_DUP_MAX ||
724		min > RE_DUP_MAX ||
725		min < 0 ||
726		(!ere && *s++ != '\\') ||
727		*s++ != '}'
728	)
729		return 0;
730	*pmin = min;
731	*pmax = max;
732	return s;
733}
734
735static int hexval(unsigned c)
736{
737	if (c-'0'<10) return c-'0';
738	c |= 32;
739	if (c-'a'<6) return c-'a'+10;
740	return -1;
741}
742
743static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
744{
745	if (node->submatch_id >= 0) {
746		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
747		if (!n)
748			return REG_ESPACE;
749		n = tre_ast_new_catenation(ctx->mem, n, node);
750		if (!n)
751			return REG_ESPACE;
752		n->num_submatches = node->num_submatches;
753		node = n;
754	}
755	node->submatch_id = subid;
756	node->num_submatches++;
757	ctx->n = node;
758	return REG_OK;
759}
760
761/*
762BRE grammar:
763Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
764Branch =  Atom  |  Branch Atom
765Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
766Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
767
768(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
769
770ERE grammar:
771Regex  =  Branch  |  Regex '|' Branch
772Branch =  Atom  |  Branch Atom
773Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
774Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
775
776(a*+?, ^*, $+, \X, {, (|a) are unspecified)
777*/
778
779static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
780{
781	int len, ere = ctx->cflags & REG_EXTENDED;
782	const char *p;
783	tre_ast_node_t *node;
784	wchar_t wc;
785	switch (*s) {
786	case '[':
787		return parse_bracket(ctx, s+1);
788	case '\\':
789		p = tre_expand_macro(s+1);
790		if (p) {
791			/* assume \X expansion is a single atom */
792			reg_errcode_t err = parse_atom(ctx, p);
793			ctx->s = s+2;
794			return err;
795		}
796		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
797		switch (*++s) {
798		case 0:
799			return REG_EESCAPE;
800		case 'b':
801			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
802			break;
803		case 'B':
804			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
805			break;
806		case '<':
807			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
808			break;
809		case '>':
810			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
811			break;
812		case 'x':
813			s++;
814			int i, v = 0, c;
815			len = 2;
816			if (*s == '{') {
817				len = 8;
818				s++;
819			}
820			for (i=0; i<len && v<0x110000; i++) {
821				c = hexval(s[i]);
822				if (c < 0) break;
823				v = 16*v + c;
824			}
825			s += i;
826			if (len == 8) {
827				if (*s != '}')
828					return REG_EBRACE;
829				s++;
830			}
831			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
832			s--;
833			break;
834		case '{':
835		case '+':
836		case '?':
837			/* extension: treat \+, \? as repetitions in BRE */
838			/* reject repetitions after empty expression in BRE */
839			if (!ere)
840				return REG_BADRPT;
841		case '|':
842			/* extension: treat \| as alternation in BRE */
843			if (!ere) {
844				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
845				s--;
846				goto end;
847			}
848			/* fallthrough */
849		default:
850			if (!ere && (unsigned)*s-'1' < 9) {
851				/* back reference */
852				int val = *s - '0';
853				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
854				ctx->max_backref = MAX(val, ctx->max_backref);
855			} else {
856				/* extension: accept unknown escaped char
857				   as a literal */
858				goto parse_literal;
859			}
860		}
861		s++;
862		break;
863	case '.':
864		if (ctx->cflags & REG_NEWLINE) {
865			tre_ast_node_t *tmp1, *tmp2;
866			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
867			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
868			if (tmp1 && tmp2)
869				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
870			else
871				node = 0;
872		} else {
873			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
874		}
875		s++;
876		break;
877	case '^':
878		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
879		if (!ere && s != ctx->start)
880			goto parse_literal;
881		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
882		s++;
883		break;
884	case '$':
885		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
886		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
887			goto parse_literal;
888		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
889		s++;
890		break;
891	case '*':
892	case '{':
893	case '+':
894	case '?':
895		/* reject repetitions after empty expression in ERE */
896		if (ere)
897			return REG_BADRPT;
898	case '|':
899		if (!ere)
900			goto parse_literal;
901	case 0:
902		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
903		break;
904	default:
905parse_literal:
906		len = mbtowc(&wc, s, -1);
907		if (len < 0)
908			return REG_BADPAT;
909		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
910			tre_ast_node_t *tmp1, *tmp2;
911			/* multiple opposite case characters are not supported */
912			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
913			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
914			if (tmp1 && tmp2)
915				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
916			else
917				node = 0;
918		} else {
919			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
920		}
921		ctx->position++;
922		s += len;
923		break;
924	}
925end:
926	if (!node)
927		return REG_ESPACE;
928	ctx->n = node;
929	ctx->s = s;
930	return REG_OK;
931}
932
933#define PUSHPTR(err, s, v) do { \
934	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
935		return err; \
936} while(0)
937
938#define PUSHINT(err, s, v) do { \
939	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
940		return err; \
941} while(0)
942
943static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
944{
945	tre_ast_node_t *nbranch=0, *nunion=0;
946	int ere = ctx->cflags & REG_EXTENDED;
947	const char *s = ctx->start;
948	int subid = 0;
949	int depth = 0;
950	reg_errcode_t err;
951	tre_stack_t *stack = ctx->stack;
952
953	PUSHINT(err, stack, subid++);
954	for (;;) {
955		if ((!ere && *s == '\\' && s[1] == '(') ||
956		    (ere && *s == '(')) {
957			PUSHPTR(err, stack, nunion);
958			PUSHPTR(err, stack, nbranch);
959			PUSHINT(err, stack, subid++);
960			s++;
961			if (!ere)
962				s++;
963			depth++;
964			nbranch = nunion = 0;
965			ctx->start = s;
966			continue;
967		}
968		if ((!ere && *s == '\\' && s[1] == ')') ||
969		    (ere && *s == ')' && depth)) {
970			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
971			if (!ctx->n)
972				return REG_ESPACE;
973		} else {
974			err = parse_atom(ctx, s);
975			if (err != REG_OK)
976				return err;
977			s = ctx->s;
978		}
979
980	parse_iter:
981		for (;;) {
982			int min, max;
983
984			if (*s!='\\' && *s!='*') {
985				if (!ere)
986					break;
987				if (*s!='+' && *s!='?' && *s!='{')
988					break;
989			}
990			if (*s=='\\' && ere)
991				break;
992			/* extension: treat \+, \? as repetitions in BRE */
993			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
994				break;
995			if (*s=='\\')
996				s++;
997
998			/* handle ^* at the start of a BRE. */
999			if (!ere && s==ctx->start+1 && s[-1]=='^')
1000				break;
1001
1002			/* extension: multiple consecutive *+?{,} is unspecified,
1003			   but (a+)+ has to be supported so accepting a++ makes
1004			   sense, note however that the RE_DUP_MAX limit can be
1005			   circumvented: (a{255}){255} uses a lot of memory.. */
1006			if (*s=='{') {
1007				s = parse_dup(s+1, ere, &min, &max);
1008				if (!s)
1009					return REG_BADBR;
1010			} else {
1011				min=0;
1012				max=-1;
1013				if (*s == '+')
1014					min = 1;
1015				if (*s == '?')
1016					max = 1;
1017				s++;
1018			}
1019			if (max == 0)
1020				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1021			else
1022				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1023			if (!ctx->n)
1024				return REG_ESPACE;
1025		}
1026
1027		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1028		if ((ere && *s == '|') ||
1029		    (ere && *s == ')' && depth) ||
1030		    (!ere && *s == '\\' && s[1] == ')') ||
1031		    /* extension: treat \| as alternation in BRE */
1032		    (!ere && *s == '\\' && s[1] == '|') ||
1033		    !*s) {
1034			/* extension: empty branch is unspecified (), (|a), (a|)
1035			   here they are not rejected but match on empty string */
1036			int c = *s;
1037			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1038			nbranch = 0;
1039
1040			if (c == '\\' && s[1] == '|') {
1041				s+=2;
1042				ctx->start = s;
1043			} else if (c == '|') {
1044				s++;
1045				ctx->start = s;
1046			} else {
1047				if (c == '\\') {
1048					if (!depth) return REG_EPAREN;
1049					s+=2;
1050				} else if (c == ')')
1051					s++;
1052				depth--;
1053				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1054				if (err != REG_OK)
1055					return err;
1056				if (!c && depth<0) {
1057					ctx->submatch_id = subid;
1058					return REG_OK;
1059				}
1060				if (!c || depth<0)
1061					return REG_EPAREN;
1062				nbranch = tre_stack_pop_voidptr(stack);
1063				nunion = tre_stack_pop_voidptr(stack);
1064				goto parse_iter;
1065			}
1066		}
1067	}
1068}
1069
1070
1071/***********************************************************************
1072 from tre-compile.c
1073***********************************************************************/
1074
1075
1076/*
1077  TODO:
1078   - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1079     function calls.
1080*/
1081
1082/*
1083  Algorithms to setup tags so that submatch addressing can be done.
1084*/
1085
1086
1087/* Inserts a catenation node to the root of the tree given in `node'.
1088   As the left child a new tag with number `tag_id' to `node' is added,
1089   and the right child is the old root. */
1090static reg_errcode_t
1091tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1092{
1093  tre_catenation_t *c;
1094
1095  c = tre_mem_alloc(mem, sizeof(*c));
1096  if (c == NULL)
1097    return REG_ESPACE;
1098  c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1099  if (c->left == NULL)
1100    return REG_ESPACE;
1101  c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1102  if (c->right == NULL)
1103    return REG_ESPACE;
1104
1105  c->right->obj = node->obj;
1106  c->right->type = node->type;
1107  c->right->nullable = -1;
1108  c->right->submatch_id = -1;
1109  c->right->firstpos = NULL;
1110  c->right->lastpos = NULL;
1111  c->right->num_tags = 0;
1112  c->right->num_submatches = 0;
1113  node->obj = c;
1114  node->type = CATENATION;
1115  return REG_OK;
1116}
1117
1118/* Inserts a catenation node to the root of the tree given in `node'.
1119   As the right child a new tag with number `tag_id' to `node' is added,
1120   and the left child is the old root. */
1121static reg_errcode_t
1122tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1123{
1124  tre_catenation_t *c;
1125
1126  c = tre_mem_alloc(mem, sizeof(*c));
1127  if (c == NULL)
1128    return REG_ESPACE;
1129  c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1130  if (c->right == NULL)
1131    return REG_ESPACE;
1132  c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1133  if (c->left == NULL)
1134    return REG_ESPACE;
1135
1136  c->left->obj = node->obj;
1137  c->left->type = node->type;
1138  c->left->nullable = -1;
1139  c->left->submatch_id = -1;
1140  c->left->firstpos = NULL;
1141  c->left->lastpos = NULL;
1142  c->left->num_tags = 0;
1143  c->left->num_submatches = 0;
1144  node->obj = c;
1145  node->type = CATENATION;
1146  return REG_OK;
1147}
1148
1149typedef enum {
1150  ADDTAGS_RECURSE,
1151  ADDTAGS_AFTER_ITERATION,
1152  ADDTAGS_AFTER_UNION_LEFT,
1153  ADDTAGS_AFTER_UNION_RIGHT,
1154  ADDTAGS_AFTER_CAT_LEFT,
1155  ADDTAGS_AFTER_CAT_RIGHT,
1156  ADDTAGS_SET_SUBMATCH_END
1157} tre_addtags_symbol_t;
1158
1159
1160typedef struct {
1161  int tag;
1162  int next_tag;
1163} tre_tag_states_t;
1164
1165
1166/* Go through `regset' and set submatch data for submatches that are
1167   using this tag. */
1168static void
1169tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1170{
1171  int i;
1172
1173  for (i = 0; regset[i] >= 0; i++)
1174    {
1175      int id = regset[i] / 2;
1176      int start = !(regset[i] % 2);
1177      if (start)
1178	tnfa->submatch_data[id].so_tag = tag;
1179      else
1180	tnfa->submatch_data[id].eo_tag = tag;
1181    }
1182  regset[0] = -1;
1183}
1184
1185
1186/* Adds tags to appropriate locations in the parse tree in `tree', so that
1187   subexpressions marked for submatch addressing can be traced. */
1188static reg_errcode_t
1189tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1190	     tre_tnfa_t *tnfa)
1191{
1192  reg_errcode_t status = REG_OK;
1193  tre_addtags_symbol_t symbol;
1194  tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1195  int bottom = tre_stack_num_objects(stack);
1196  /* True for first pass (counting number of needed tags) */
1197  int first_pass = (mem == NULL || tnfa == NULL);
1198  int *regset, *orig_regset;
1199  int num_tags = 0; /* Total number of tags. */
1200  int num_minimals = 0;	 /* Number of special minimal tags. */
1201  int tag = 0;	    /* The tag that is to be added next. */
1202  int next_tag = 1; /* Next tag to use after this one. */
1203  int *parents;	    /* Stack of submatches the current submatch is
1204		       contained in. */
1205  int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1206  tre_tag_states_t *saved_states;
1207
1208  tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1209  if (!first_pass)
1210    {
1211      tnfa->end_tag = 0;
1212      tnfa->minimal_tags[0] = -1;
1213    }
1214
1215  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1216  if (regset == NULL)
1217    return REG_ESPACE;
1218  regset[0] = -1;
1219  orig_regset = regset;
1220
1221  parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1222  if (parents == NULL)
1223    {
1224      xfree(regset);
1225      return REG_ESPACE;
1226    }
1227  parents[0] = -1;
1228
1229  saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1230  if (saved_states == NULL)
1231    {
1232      xfree(regset);
1233      xfree(parents);
1234      return REG_ESPACE;
1235    }
1236  else
1237    {
1238      unsigned int i;
1239      for (i = 0; i <= tnfa->num_submatches; i++)
1240	saved_states[i].tag = -1;
1241    }
1242
1243  STACK_PUSH(stack, voidptr, node);
1244  STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1245
1246  while (tre_stack_num_objects(stack) > bottom)
1247    {
1248      if (status != REG_OK)
1249	break;
1250
1251      symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1252      switch (symbol)
1253	{
1254
1255	case ADDTAGS_SET_SUBMATCH_END:
1256	  {
1257	    int id = tre_stack_pop_int(stack);
1258	    int i;
1259
1260	    /* Add end of this submatch to regset. */
1261	    for (i = 0; regset[i] >= 0; i++);
1262	    regset[i] = id * 2 + 1;
1263	    regset[i + 1] = -1;
1264
1265	    /* Pop this submatch from the parents stack. */
1266	    for (i = 0; parents[i] >= 0; i++);
1267	    parents[i - 1] = -1;
1268	    break;
1269	  }
1270
1271	case ADDTAGS_RECURSE:
1272	  node = tre_stack_pop_voidptr(stack);
1273
1274	  if (node->submatch_id >= 0)
1275	    {
1276	      int id = node->submatch_id;
1277	      int i;
1278
1279
1280	      /* Add start of this submatch to regset. */
1281	      for (i = 0; regset[i] >= 0; i++);
1282	      regset[i] = id * 2;
1283	      regset[i + 1] = -1;
1284
1285	      if (!first_pass)
1286		{
1287		  for (i = 0; parents[i] >= 0; i++);
1288		  tnfa->submatch_data[id].parents = NULL;
1289		  if (i > 0)
1290		    {
1291		      int *p = xmalloc(sizeof(*p) * (i + 1));
1292		      if (p == NULL)
1293			{
1294			  status = REG_ESPACE;
1295			  break;
1296			}
1297		      assert(tnfa->submatch_data[id].parents == NULL);
1298		      tnfa->submatch_data[id].parents = p;
1299		      for (i = 0; parents[i] >= 0; i++)
1300			p[i] = parents[i];
1301		      p[i] = -1;
1302		    }
1303		}
1304
1305	      /* Add end of this submatch to regset after processing this
1306		 node. */
1307	      STACK_PUSHX(stack, int, node->submatch_id);
1308	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1309	    }
1310
1311	  switch (node->type)
1312	    {
1313	    case LITERAL:
1314	      {
1315		tre_literal_t *lit = node->obj;
1316
1317		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1318		  {
1319		    int i;
1320		    if (regset[0] >= 0)
1321		      {
1322			/* Regset is not empty, so add a tag before the
1323			   literal or backref. */
1324			if (!first_pass)
1325			  {
1326			    status = tre_add_tag_left(mem, node, tag);
1327			    tnfa->tag_directions[tag] = direction;
1328			    if (minimal_tag >= 0)
1329			      {
1330				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1331				tnfa->minimal_tags[i] = tag;
1332				tnfa->minimal_tags[i + 1] = minimal_tag;
1333				tnfa->minimal_tags[i + 2] = -1;
1334				minimal_tag = -1;
1335				num_minimals++;
1336			      }
1337			    tre_purge_regset(regset, tnfa, tag);
1338			  }
1339			else
1340			  {
1341			    node->num_tags = 1;
1342			  }
1343
1344			regset[0] = -1;
1345			tag = next_tag;
1346			num_tags++;
1347			next_tag++;
1348		      }
1349		  }
1350		else
1351		  {
1352		    assert(!IS_TAG(lit));
1353		  }
1354		break;
1355	      }
1356	    case CATENATION:
1357	      {
1358		tre_catenation_t *cat = node->obj;
1359		tre_ast_node_t *left = cat->left;
1360		tre_ast_node_t *right = cat->right;
1361		int reserved_tag = -1;
1362
1363
1364		/* After processing right child. */
1365		STACK_PUSHX(stack, voidptr, node);
1366		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1367
1368		/* Process right child. */
1369		STACK_PUSHX(stack, voidptr, right);
1370		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1371
1372		/* After processing left child. */
1373		STACK_PUSHX(stack, int, next_tag + left->num_tags);
1374		if (left->num_tags > 0 && right->num_tags > 0)
1375		  {
1376		    /* Reserve the next tag to the right child. */
1377		    reserved_tag = next_tag;
1378		    next_tag++;
1379		  }
1380		STACK_PUSHX(stack, int, reserved_tag);
1381		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1382
1383		/* Process left child. */
1384		STACK_PUSHX(stack, voidptr, left);
1385		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1386
1387		}
1388	      break;
1389	    case ITERATION:
1390	      {
1391		tre_iteration_t *iter = node->obj;
1392
1393		if (first_pass)
1394		  {
1395		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1396		  }
1397		else
1398		  {
1399		    STACK_PUSHX(stack, int, tag);
1400		    STACK_PUSHX(stack, int, iter->minimal);
1401		  }
1402		STACK_PUSHX(stack, voidptr, node);
1403		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1404
1405		STACK_PUSHX(stack, voidptr, iter->arg);
1406		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1407
1408		/* Regset is not empty, so add a tag here. */
1409		if (regset[0] >= 0 || iter->minimal)
1410		  {
1411		    if (!first_pass)
1412		      {
1413			int i;
1414			status = tre_add_tag_left(mem, node, tag);
1415			if (iter->minimal)
1416			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1417			else
1418			  tnfa->tag_directions[tag] = direction;
1419			if (minimal_tag >= 0)
1420			  {
1421			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1422			    tnfa->minimal_tags[i] = tag;
1423			    tnfa->minimal_tags[i + 1] = minimal_tag;
1424			    tnfa->minimal_tags[i + 2] = -1;
1425			    minimal_tag = -1;
1426			    num_minimals++;
1427			  }
1428			tre_purge_regset(regset, tnfa, tag);
1429		      }
1430
1431		    regset[0] = -1;
1432		    tag = next_tag;
1433		    num_tags++;
1434		    next_tag++;
1435		  }
1436		direction = TRE_TAG_MINIMIZE;
1437	      }
1438	      break;
1439	    case UNION:
1440	      {
1441		tre_union_t *uni = node->obj;
1442		tre_ast_node_t *left = uni->left;
1443		tre_ast_node_t *right = uni->right;
1444		int left_tag;
1445		int right_tag;
1446
1447		if (regset[0] >= 0)
1448		  {
1449		    left_tag = next_tag;
1450		    right_tag = next_tag + 1;
1451		  }
1452		else
1453		  {
1454		    left_tag = tag;
1455		    right_tag = next_tag;
1456		  }
1457
1458		/* After processing right child. */
1459		STACK_PUSHX(stack, int, right_tag);
1460		STACK_PUSHX(stack, int, left_tag);
1461		STACK_PUSHX(stack, voidptr, regset);
1462		STACK_PUSHX(stack, int, regset[0] >= 0);
1463		STACK_PUSHX(stack, voidptr, node);
1464		STACK_PUSHX(stack, voidptr, right);
1465		STACK_PUSHX(stack, voidptr, left);
1466		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1467
1468		/* Process right child. */
1469		STACK_PUSHX(stack, voidptr, right);
1470		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1471
1472		/* After processing left child. */
1473		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1474
1475		/* Process left child. */
1476		STACK_PUSHX(stack, voidptr, left);
1477		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1478
1479		/* Regset is not empty, so add a tag here. */
1480		if (regset[0] >= 0)
1481		  {
1482		    if (!first_pass)
1483		      {
1484			int i;
1485			status = tre_add_tag_left(mem, node, tag);
1486			tnfa->tag_directions[tag] = direction;
1487			if (minimal_tag >= 0)
1488			  {
1489			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1490			    tnfa->minimal_tags[i] = tag;
1491			    tnfa->minimal_tags[i + 1] = minimal_tag;
1492			    tnfa->minimal_tags[i + 2] = -1;
1493			    minimal_tag = -1;
1494			    num_minimals++;
1495			  }
1496			tre_purge_regset(regset, tnfa, tag);
1497		      }
1498
1499		    regset[0] = -1;
1500		    tag = next_tag;
1501		    num_tags++;
1502		    next_tag++;
1503		  }
1504
1505		if (node->num_submatches > 0)
1506		  {
1507		    /* The next two tags are reserved for markers. */
1508		    next_tag++;
1509		    tag = next_tag;
1510		    next_tag++;
1511		  }
1512
1513		break;
1514	      }
1515	    }
1516
1517	  if (node->submatch_id >= 0)
1518	    {
1519	      int i;
1520	      /* Push this submatch on the parents stack. */
1521	      for (i = 0; parents[i] >= 0; i++);
1522	      parents[i] = node->submatch_id;
1523	      parents[i + 1] = -1;
1524	    }
1525
1526	  break; /* end case: ADDTAGS_RECURSE */
1527
1528	case ADDTAGS_AFTER_ITERATION:
1529	  {
1530	    int minimal = 0;
1531	    int enter_tag;
1532	    node = tre_stack_pop_voidptr(stack);
1533	    if (first_pass)
1534	      {
1535		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1536		  + tre_stack_pop_int(stack);
1537		minimal_tag = -1;
1538	      }
1539	    else
1540	      {
1541		minimal = tre_stack_pop_int(stack);
1542		enter_tag = tre_stack_pop_int(stack);
1543		if (minimal)
1544		  minimal_tag = enter_tag;
1545	      }
1546
1547	    if (!first_pass)
1548	      {
1549		if (minimal)
1550		  direction = TRE_TAG_MINIMIZE;
1551		else
1552		  direction = TRE_TAG_MAXIMIZE;
1553	      }
1554	    break;
1555	  }
1556
1557	case ADDTAGS_AFTER_CAT_LEFT:
1558	  {
1559	    int new_tag = tre_stack_pop_int(stack);
1560	    next_tag = tre_stack_pop_int(stack);
1561	    if (new_tag >= 0)
1562	      {
1563		tag = new_tag;
1564	      }
1565	    break;
1566	  }
1567
1568	case ADDTAGS_AFTER_CAT_RIGHT:
1569	  node = tre_stack_pop_voidptr(stack);
1570	  if (first_pass)
1571	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1572	      + ((tre_catenation_t *)node->obj)->right->num_tags;
1573	  break;
1574
1575	case ADDTAGS_AFTER_UNION_LEFT:
1576	  /* Lift the bottom of the `regset' array so that when processing
1577	     the right operand the items currently in the array are
1578	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
1579	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1580	  while (*regset >= 0)
1581	    regset++;
1582	  break;
1583
1584	case ADDTAGS_AFTER_UNION_RIGHT:
1585	  {
1586	    int added_tags, tag_left, tag_right;
1587	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1588	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1589	    node = tre_stack_pop_voidptr(stack);
1590	    added_tags = tre_stack_pop_int(stack);
1591	    if (first_pass)
1592	      {
1593		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1594		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1595		  + ((node->num_submatches > 0) ? 2 : 0);
1596	      }
1597	    regset = tre_stack_pop_voidptr(stack);
1598	    tag_left = tre_stack_pop_int(stack);
1599	    tag_right = tre_stack_pop_int(stack);
1600
1601	    /* Add tags after both children, the left child gets a smaller
1602	       tag than the right child.  This guarantees that we prefer
1603	       the left child over the right child. */
1604	    /* XXX - This is not always necessary (if the children have
1605	       tags which must be seen for every match of that child). */
1606	    /* XXX - Check if this is the only place where tre_add_tag_right
1607	       is used.	 If so, use tre_add_tag_left (putting the tag before
1608	       the child as opposed after the child) and throw away
1609	       tre_add_tag_right. */
1610	    if (node->num_submatches > 0)
1611	      {
1612		if (!first_pass)
1613		  {
1614		    status = tre_add_tag_right(mem, left, tag_left);
1615		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1616		    if (status == REG_OK)
1617		      status = tre_add_tag_right(mem, right, tag_right);
1618		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1619		  }
1620		num_tags += 2;
1621	      }
1622	    direction = TRE_TAG_MAXIMIZE;
1623	    break;
1624	  }
1625
1626	default:
1627	  assert(0);
1628	  break;
1629
1630	} /* end switch(symbol) */
1631    } /* end while(tre_stack_num_objects(stack) > bottom) */
1632
1633  if (!first_pass)
1634    tre_purge_regset(regset, tnfa, tag);
1635
1636  if (!first_pass && minimal_tag >= 0)
1637    {
1638      int i;
1639      for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1640      tnfa->minimal_tags[i] = tag;
1641      tnfa->minimal_tags[i + 1] = minimal_tag;
1642      tnfa->minimal_tags[i + 2] = -1;
1643      minimal_tag = -1;
1644      num_minimals++;
1645    }
1646
1647  assert(tree->num_tags == num_tags);
1648  tnfa->end_tag = num_tags;
1649  tnfa->num_tags = num_tags;
1650  tnfa->num_minimals = num_minimals;
1651  xfree(orig_regset);
1652  xfree(parents);
1653  xfree(saved_states);
1654  return status;
1655}
1656
1657
1658
1659/*
1660  AST to TNFA compilation routines.
1661*/
1662
1663typedef enum {
1664  COPY_RECURSE,
1665  COPY_SET_RESULT_PTR
1666} tre_copyast_symbol_t;
1667
1668/* Flags for tre_copy_ast(). */
1669#define COPY_REMOVE_TAGS	 1
1670#define COPY_MAXIMIZE_FIRST_TAG	 2
1671
1672static reg_errcode_t
1673tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1674	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1675	     tre_ast_node_t **copy, int *max_pos)
1676{
1677  reg_errcode_t status = REG_OK;
1678  int bottom = tre_stack_num_objects(stack);
1679  int num_copied = 0;
1680  int first_tag = 1;
1681  tre_ast_node_t **result = copy;
1682  tre_copyast_symbol_t symbol;
1683
1684  STACK_PUSH(stack, voidptr, ast);
1685  STACK_PUSH(stack, int, COPY_RECURSE);
1686
1687  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1688    {
1689      tre_ast_node_t *node;
1690      if (status != REG_OK)
1691	break;
1692
1693      symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1694      switch (symbol)
1695	{
1696	case COPY_SET_RESULT_PTR:
1697	  result = tre_stack_pop_voidptr(stack);
1698	  break;
1699	case COPY_RECURSE:
1700	  node = tre_stack_pop_voidptr(stack);
1701	  switch (node->type)
1702	    {
1703	    case LITERAL:
1704	      {
1705		tre_literal_t *lit = node->obj;
1706		int pos = lit->position;
1707		int min = lit->code_min;
1708		int max = lit->code_max;
1709		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1710		  {
1711		    /* XXX - e.g. [ab] has only one position but two
1712		       nodes, so we are creating holes in the state space
1713		       here.  Not fatal, just wastes memory. */
1714		    pos += *pos_add;
1715		    num_copied++;
1716		  }
1717		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1718		  {
1719		    /* Change this tag to empty. */
1720		    min = EMPTY;
1721		    max = pos = -1;
1722		  }
1723		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1724			 && first_tag)
1725		  {
1726		    /* Maximize the first tag. */
1727		    tag_directions[max] = TRE_TAG_MAXIMIZE;
1728		    first_tag = 0;
1729		  }
1730		*result = tre_ast_new_literal(mem, min, max, pos);
1731		if (*result == NULL)
1732		  status = REG_ESPACE;
1733		else {
1734		  tre_literal_t *p = (*result)->obj;
1735		  p->class = lit->class;
1736		  p->neg_classes = lit->neg_classes;
1737		}
1738
1739		if (pos > *max_pos)
1740		  *max_pos = pos;
1741		break;
1742	      }
1743	    case UNION:
1744	      {
1745		tre_union_t *uni = node->obj;
1746		tre_union_t *tmp;
1747		*result = tre_ast_new_union(mem, uni->left, uni->right);
1748		if (*result == NULL)
1749		  {
1750		    status = REG_ESPACE;
1751		    break;
1752		  }
1753		tmp = (*result)->obj;
1754		result = &tmp->left;
1755		STACK_PUSHX(stack, voidptr, uni->right);
1756		STACK_PUSHX(stack, int, COPY_RECURSE);
1757		STACK_PUSHX(stack, voidptr, &tmp->right);
1758		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1759		STACK_PUSHX(stack, voidptr, uni->left);
1760		STACK_PUSHX(stack, int, COPY_RECURSE);
1761		break;
1762	      }
1763	    case CATENATION:
1764	      {
1765		tre_catenation_t *cat = node->obj;
1766		tre_catenation_t *tmp;
1767		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
1768		if (*result == NULL)
1769		  {
1770		    status = REG_ESPACE;
1771		    break;
1772		  }
1773		tmp = (*result)->obj;
1774		tmp->left = NULL;
1775		tmp->right = NULL;
1776		result = &tmp->left;
1777
1778		STACK_PUSHX(stack, voidptr, cat->right);
1779		STACK_PUSHX(stack, int, COPY_RECURSE);
1780		STACK_PUSHX(stack, voidptr, &tmp->right);
1781		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1782		STACK_PUSHX(stack, voidptr, cat->left);
1783		STACK_PUSHX(stack, int, COPY_RECURSE);
1784		break;
1785	      }
1786	    case ITERATION:
1787	      {
1788		tre_iteration_t *iter = node->obj;
1789		STACK_PUSHX(stack, voidptr, iter->arg);
1790		STACK_PUSHX(stack, int, COPY_RECURSE);
1791		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
1792					   iter->max, iter->minimal);
1793		if (*result == NULL)
1794		  {
1795		    status = REG_ESPACE;
1796		    break;
1797		  }
1798		iter = (*result)->obj;
1799		result = &iter->arg;
1800		break;
1801	      }
1802	    default:
1803	      assert(0);
1804	      break;
1805	    }
1806	  break;
1807	}
1808    }
1809  *pos_add += num_copied;
1810  return status;
1811}
1812
1813typedef enum {
1814  EXPAND_RECURSE,
1815  EXPAND_AFTER_ITER
1816} tre_expand_ast_symbol_t;
1817
1818/* Expands each iteration node that has a finite nonzero minimum or maximum
1819   iteration count to a catenated sequence of copies of the node. */
1820static reg_errcode_t
1821tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1822	       int *position, tre_tag_direction_t *tag_directions)
1823{
1824  reg_errcode_t status = REG_OK;
1825  int bottom = tre_stack_num_objects(stack);
1826  int pos_add = 0;
1827  int pos_add_total = 0;
1828  int max_pos = 0;
1829  int iter_depth = 0;
1830
1831  STACK_PUSHR(stack, voidptr, ast);
1832  STACK_PUSHR(stack, int, EXPAND_RECURSE);
1833  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1834    {
1835      tre_ast_node_t *node;
1836      tre_expand_ast_symbol_t symbol;
1837
1838      if (status != REG_OK)
1839	break;
1840
1841      symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1842      node = tre_stack_pop_voidptr(stack);
1843      switch (symbol)
1844	{
1845	case EXPAND_RECURSE:
1846	  switch (node->type)
1847	    {
1848	    case LITERAL:
1849	      {
1850		tre_literal_t *lit= node->obj;
1851		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1852		  {
1853		    lit->position += pos_add;
1854		    if (lit->position > max_pos)
1855		      max_pos = lit->position;
1856		  }
1857		break;
1858	      }
1859	    case UNION:
1860	      {
1861		tre_union_t *uni = node->obj;
1862		STACK_PUSHX(stack, voidptr, uni->right);
1863		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1864		STACK_PUSHX(stack, voidptr, uni->left);
1865		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1866		break;
1867	      }
1868	    case CATENATION:
1869	      {
1870		tre_catenation_t *cat = node->obj;
1871		STACK_PUSHX(stack, voidptr, cat->right);
1872		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1873		STACK_PUSHX(stack, voidptr, cat->left);
1874		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1875		break;
1876	      }
1877	    case ITERATION:
1878	      {
1879		tre_iteration_t *iter = node->obj;
1880		STACK_PUSHX(stack, int, pos_add);
1881		STACK_PUSHX(stack, voidptr, node);
1882		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1883		STACK_PUSHX(stack, voidptr, iter->arg);
1884		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1885		/* If we are going to expand this node at EXPAND_AFTER_ITER
1886		   then don't increase the `pos' fields of the nodes now, it
1887		   will get done when expanding. */
1888		if (iter->min > 1 || iter->max > 1)
1889		  pos_add = 0;
1890		iter_depth++;
1891		break;
1892	      }
1893	    default:
1894	      assert(0);
1895	      break;
1896	    }
1897	  break;
1898	case EXPAND_AFTER_ITER:
1899	  {
1900	    tre_iteration_t *iter = node->obj;
1901	    int pos_add_last;
1902	    pos_add = tre_stack_pop_int(stack);
1903	    pos_add_last = pos_add;
1904	    if (iter->min > 1 || iter->max > 1)
1905	      {
1906		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1907		int j;
1908		int pos_add_save = pos_add;
1909
1910		/* Create a catenated sequence of copies of the node. */
1911		for (j = 0; j < iter->min; j++)
1912		  {
1913		    tre_ast_node_t *copy;
1914		    /* Remove tags from all but the last copy. */
1915		    int flags = ((j + 1 < iter->min)
1916				 ? COPY_REMOVE_TAGS
1917				 : COPY_MAXIMIZE_FIRST_TAG);
1918		    pos_add_save = pos_add;
1919		    status = tre_copy_ast(mem, stack, iter->arg, flags,
1920					  &pos_add, tag_directions, &copy,
1921					  &max_pos);
1922		    if (status != REG_OK)
1923		      return status;
1924		    if (seq1 != NULL)
1925		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
1926		    else
1927		      seq1 = copy;
1928		    if (seq1 == NULL)
1929		      return REG_ESPACE;
1930		  }
1931
1932		if (iter->max == -1)
1933		  {
1934		    /* No upper limit. */
1935		    pos_add_save = pos_add;
1936		    status = tre_copy_ast(mem, stack, iter->arg, 0,
1937					  &pos_add, NULL, &seq2, &max_pos);
1938		    if (status != REG_OK)
1939		      return status;
1940		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1941		    if (seq2 == NULL)
1942		      return REG_ESPACE;
1943		  }
1944		else
1945		  {
1946		    for (j = iter->min; j < iter->max; j++)
1947		      {
1948			tre_ast_node_t *tmp, *copy;
1949			pos_add_save = pos_add;
1950			status = tre_copy_ast(mem, stack, iter->arg, 0,
1951					      &pos_add, NULL, &copy, &max_pos);
1952			if (status != REG_OK)
1953			  return status;
1954			if (seq2 != NULL)
1955			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
1956			else
1957			  seq2 = copy;
1958			if (seq2 == NULL)
1959			  return REG_ESPACE;
1960			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1961			if (tmp == NULL)
1962			  return REG_ESPACE;
1963			seq2 = tre_ast_new_union(mem, tmp, seq2);
1964			if (seq2 == NULL)
1965			  return REG_ESPACE;
1966		      }
1967		  }
1968
1969		pos_add = pos_add_save;
1970		if (seq1 == NULL)
1971		  seq1 = seq2;
1972		else if (seq2 != NULL)
1973		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1974		if (seq1 == NULL)
1975		  return REG_ESPACE;
1976		node->obj = seq1->obj;
1977		node->type = seq1->type;
1978	      }
1979
1980	    iter_depth--;
1981	    pos_add_total += pos_add - pos_add_last;
1982	    if (iter_depth == 0)
1983	      pos_add = pos_add_total;
1984
1985	    break;
1986	  }
1987	default:
1988	  assert(0);
1989	  break;
1990	}
1991    }
1992
1993  *position += pos_add_total;
1994
1995  /* `max_pos' should never be larger than `*position' if the above
1996     code works, but just an extra safeguard let's make sure
1997     `*position' is set large enough so enough memory will be
1998     allocated for the transition table. */
1999  if (max_pos > *position)
2000    *position = max_pos;
2001
2002  return status;
2003}
2004
2005static tre_pos_and_tags_t *
2006tre_set_empty(tre_mem_t mem)
2007{
2008  tre_pos_and_tags_t *new_set;
2009
2010  new_set = tre_mem_calloc(mem, sizeof(*new_set));
2011  if (new_set == NULL)
2012    return NULL;
2013
2014  new_set[0].position = -1;
2015  new_set[0].code_min = -1;
2016  new_set[0].code_max = -1;
2017
2018  return new_set;
2019}
2020
2021static tre_pos_and_tags_t *
2022tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2023	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2024{
2025  tre_pos_and_tags_t *new_set;
2026
2027  new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2028  if (new_set == NULL)
2029    return NULL;
2030
2031  new_set[0].position = position;
2032  new_set[0].code_min = code_min;
2033  new_set[0].code_max = code_max;
2034  new_set[0].class = class;
2035  new_set[0].neg_classes = neg_classes;
2036  new_set[0].backref = backref;
2037  new_set[1].position = -1;
2038  new_set[1].code_min = -1;
2039  new_set[1].code_max = -1;
2040
2041  return new_set;
2042}
2043
2044static tre_pos_and_tags_t *
2045tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2046	      int *tags, int assertions)
2047{
2048  int s1, s2, i, j;
2049  tre_pos_and_tags_t *new_set;
2050  int *new_tags;
2051  int num_tags;
2052
2053  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2054  for (s1 = 0; set1[s1].position >= 0; s1++);
2055  for (s2 = 0; set2[s2].position >= 0; s2++);
2056  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2057  if (!new_set )
2058    return NULL;
2059
2060  for (s1 = 0; set1[s1].position >= 0; s1++)
2061    {
2062      new_set[s1].position = set1[s1].position;
2063      new_set[s1].code_min = set1[s1].code_min;
2064      new_set[s1].code_max = set1[s1].code_max;
2065      new_set[s1].assertions = set1[s1].assertions | assertions;
2066      new_set[s1].class = set1[s1].class;
2067      new_set[s1].neg_classes = set1[s1].neg_classes;
2068      new_set[s1].backref = set1[s1].backref;
2069      if (set1[s1].tags == NULL && tags == NULL)
2070	new_set[s1].tags = NULL;
2071      else
2072	{
2073	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2074	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2075					 * (i + num_tags + 1)));
2076	  if (new_tags == NULL)
2077	    return NULL;
2078	  for (j = 0; j < i; j++)
2079	    new_tags[j] = set1[s1].tags[j];
2080	  for (i = 0; i < num_tags; i++)
2081	    new_tags[j + i] = tags[i];
2082	  new_tags[j + i] = -1;
2083	  new_set[s1].tags = new_tags;
2084	}
2085    }
2086
2087  for (s2 = 0; set2[s2].position >= 0; s2++)
2088    {
2089      new_set[s1 + s2].position = set2[s2].position;
2090      new_set[s1 + s2].code_min = set2[s2].code_min;
2091      new_set[s1 + s2].code_max = set2[s2].code_max;
2092      /* XXX - why not | assertions here as well? */
2093      new_set[s1 + s2].assertions = set2[s2].assertions;
2094      new_set[s1 + s2].class = set2[s2].class;
2095      new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2096      new_set[s1 + s2].backref = set2[s2].backref;
2097      if (set2[s2].tags == NULL)
2098	new_set[s1 + s2].tags = NULL;
2099      else
2100	{
2101	  for (i = 0; set2[s2].tags[i] >= 0; i++);
2102	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2103	  if (new_tags == NULL)
2104	    return NULL;
2105	  for (j = 0; j < i; j++)
2106	    new_tags[j] = set2[s2].tags[j];
2107	  new_tags[j] = -1;
2108	  new_set[s1 + s2].tags = new_tags;
2109	}
2110    }
2111  new_set[s1 + s2].position = -1;
2112  return new_set;
2113}
2114
2115/* Finds the empty path through `node' which is the one that should be
2116   taken according to POSIX.2 rules, and adds the tags on that path to
2117   `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2118   set to the number of tags seen on the path. */
2119static reg_errcode_t
2120tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2121		int *assertions, int *num_tags_seen)
2122{
2123  tre_literal_t *lit;
2124  tre_union_t *uni;
2125  tre_catenation_t *cat;
2126  tre_iteration_t *iter;
2127  int i;
2128  int bottom = tre_stack_num_objects(stack);
2129  reg_errcode_t status = REG_OK;
2130  if (num_tags_seen)
2131    *num_tags_seen = 0;
2132
2133  status = tre_stack_push_voidptr(stack, node);
2134
2135  /* Walk through the tree recursively. */
2136  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2137    {
2138      node = tre_stack_pop_voidptr(stack);
2139
2140      switch (node->type)
2141	{
2142	case LITERAL:
2143	  lit = (tre_literal_t *)node->obj;
2144	  switch (lit->code_min)
2145	    {
2146	    case TAG:
2147	      if (lit->code_max >= 0)
2148		{
2149		  if (tags != NULL)
2150		    {
2151		      /* Add the tag to `tags'. */
2152		      for (i = 0; tags[i] >= 0; i++)
2153			if (tags[i] == lit->code_max)
2154			  break;
2155		      if (tags[i] < 0)
2156			{
2157			  tags[i] = lit->code_max;
2158			  tags[i + 1] = -1;
2159			}
2160		    }
2161		  if (num_tags_seen)
2162		    (*num_tags_seen)++;
2163		}
2164	      break;
2165	    case ASSERTION:
2166	      assert(lit->code_max >= 1
2167		     || lit->code_max <= ASSERT_LAST);
2168	      if (assertions != NULL)
2169		*assertions |= lit->code_max;
2170	      break;
2171	    case EMPTY:
2172	      break;
2173	    default:
2174	      assert(0);
2175	      break;
2176	    }
2177	  break;
2178
2179	case UNION:
2180	  /* Subexpressions starting earlier take priority over ones
2181	     starting later, so we prefer the left subexpression over the
2182	     right subexpression. */
2183	  uni = (tre_union_t *)node->obj;
2184	  if (uni->left->nullable)
2185	    STACK_PUSHX(stack, voidptr, uni->left)
2186	  else if (uni->right->nullable)
2187	    STACK_PUSHX(stack, voidptr, uni->right)
2188	  else
2189	    assert(0);
2190	  break;
2191
2192	case CATENATION:
2193	  /* The path must go through both children. */
2194	  cat = (tre_catenation_t *)node->obj;
2195	  assert(cat->left->nullable);
2196	  assert(cat->right->nullable);
2197	  STACK_PUSHX(stack, voidptr, cat->left);
2198	  STACK_PUSHX(stack, voidptr, cat->right);
2199	  break;
2200
2201	case ITERATION:
2202	  /* A match with an empty string is preferred over no match at
2203	     all, so we go through the argument if possible. */
2204	  iter = (tre_iteration_t *)node->obj;
2205	  if (iter->arg->nullable)
2206	    STACK_PUSHX(stack, voidptr, iter->arg);
2207	  break;
2208
2209	default:
2210	  assert(0);
2211	  break;
2212	}
2213    }
2214
2215  return status;
2216}
2217
2218
2219typedef enum {
2220  NFL_RECURSE,
2221  NFL_POST_UNION,
2222  NFL_POST_CATENATION,
2223  NFL_POST_ITERATION
2224} tre_nfl_stack_symbol_t;
2225
2226
2227/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2228   the nodes of the AST `tree'. */
2229static reg_errcode_t
2230tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2231{
2232  int bottom = tre_stack_num_objects(stack);
2233
2234  STACK_PUSHR(stack, voidptr, tree);
2235  STACK_PUSHR(stack, int, NFL_RECURSE);
2236
2237  while (tre_stack_num_objects(stack) > bottom)
2238    {
2239      tre_nfl_stack_symbol_t symbol;
2240      tre_ast_node_t *node;
2241
2242      symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2243      node = tre_stack_pop_voidptr(stack);
2244      switch (symbol)
2245	{
2246	case NFL_RECURSE:
2247	  switch (node->type)
2248	    {
2249	    case LITERAL:
2250	      {
2251		tre_literal_t *lit = (tre_literal_t *)node->obj;
2252		if (IS_BACKREF(lit))
2253		  {
2254		    /* Back references: nullable = false, firstpos = {i},
2255		       lastpos = {i}. */
2256		    node->nullable = 0;
2257		    node->firstpos = tre_set_one(mem, lit->position, 0,
2258					     TRE_CHAR_MAX, 0, NULL, -1);
2259		    if (!node->firstpos)
2260		      return REG_ESPACE;
2261		    node->lastpos = tre_set_one(mem, lit->position, 0,
2262						TRE_CHAR_MAX, 0, NULL,
2263						(int)lit->code_max);
2264		    if (!node->lastpos)
2265		      return REG_ESPACE;
2266		  }
2267		else if (lit->code_min < 0)
2268		  {
2269		    /* Tags, empty strings, params, and zero width assertions:
2270		       nullable = true, firstpos = {}, and lastpos = {}. */
2271		    node->nullable = 1;
2272		    node->firstpos = tre_set_empty(mem);
2273		    if (!node->firstpos)
2274		      return REG_ESPACE;
2275		    node->lastpos = tre_set_empty(mem);
2276		    if (!node->lastpos)
2277		      return REG_ESPACE;
2278		  }
2279		else
2280		  {
2281		    /* Literal at position i: nullable = false, firstpos = {i},
2282		       lastpos = {i}. */
2283		    node->nullable = 0;
2284		    node->firstpos =
2285		      tre_set_one(mem, lit->position, (int)lit->code_min,
2286				  (int)lit->code_max, 0, NULL, -1);
2287		    if (!node->firstpos)
2288		      return REG_ESPACE;
2289		    node->lastpos = tre_set_one(mem, lit->position,
2290						(int)lit->code_min,
2291						(int)lit->code_max,
2292						lit->class, lit->neg_classes,
2293						-1);
2294		    if (!node->lastpos)
2295		      return REG_ESPACE;
2296		  }
2297		break;
2298	      }
2299
2300	    case UNION:
2301	      /* Compute the attributes for the two subtrees, and after that
2302		 for this node. */
2303	      STACK_PUSHR(stack, voidptr, node);
2304	      STACK_PUSHR(stack, int, NFL_POST_UNION);
2305	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2306	      STACK_PUSHR(stack, int, NFL_RECURSE);
2307	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2308	      STACK_PUSHR(stack, int, NFL_RECURSE);
2309	      break;
2310
2311	    case CATENATION:
2312	      /* Compute the attributes for the two subtrees, and after that
2313		 for this node. */
2314	      STACK_PUSHR(stack, voidptr, node);
2315	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2316	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2317	      STACK_PUSHR(stack, int, NFL_RECURSE);
2318	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2319	      STACK_PUSHR(stack, int, NFL_RECURSE);
2320	      break;
2321
2322	    case ITERATION:
2323	      /* Compute the attributes for the subtree, and after that for
2324		 this node. */
2325	      STACK_PUSHR(stack, voidptr, node);
2326	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2327	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2328	      STACK_PUSHR(stack, int, NFL_RECURSE);
2329	      break;
2330	    }
2331	  break; /* end case: NFL_RECURSE */
2332
2333	case NFL_POST_UNION:
2334	  {
2335	    tre_union_t *uni = (tre_union_t *)node->obj;
2336	    node->nullable = uni->left->nullable || uni->right->nullable;
2337	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
2338					   uni->right->firstpos, NULL, 0);
2339	    if (!node->firstpos)
2340	      return REG_ESPACE;
2341	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
2342					  uni->right->lastpos, NULL, 0);
2343	    if (!node->lastpos)
2344	      return REG_ESPACE;
2345	    break;
2346	  }
2347
2348	case NFL_POST_ITERATION:
2349	  {
2350	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2351
2352	    if (iter->min == 0 || iter->arg->nullable)
2353	      node->nullable = 1;
2354	    else
2355	      node->nullable = 0;
2356	    node->firstpos = iter->arg->firstpos;
2357	    node->lastpos = iter->arg->lastpos;
2358	    break;
2359	  }
2360
2361	case NFL_POST_CATENATION:
2362	  {
2363	    int num_tags, *tags, assertions;
2364	    reg_errcode_t status;
2365	    tre_catenation_t *cat = node->obj;
2366	    node->nullable = cat->left->nullable && cat->right->nullable;
2367
2368	    /* Compute firstpos. */
2369	    if (cat->left->nullable)
2370	      {
2371		/* The left side matches the empty string.  Make a first pass
2372		   with tre_match_empty() to get the number of tags and
2373		   parameters. */
2374		status = tre_match_empty(stack, cat->left,
2375					 NULL, NULL, &num_tags);
2376		if (status != REG_OK)
2377		  return status;
2378		/* Allocate arrays for the tags and parameters. */
2379		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2380		if (!tags)
2381		  return REG_ESPACE;
2382		tags[0] = -1;
2383		assertions = 0;
2384		/* Second pass with tre_mach_empty() to get the list of
2385		   tags and parameters. */
2386		status = tre_match_empty(stack, cat->left, tags,
2387					 &assertions, NULL);
2388		if (status != REG_OK)
2389		  {
2390		    xfree(tags);
2391		    return status;
2392		  }
2393		node->firstpos =
2394		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2395				tags, assertions);
2396		xfree(tags);
2397		if (!node->firstpos)
2398		  return REG_ESPACE;
2399	      }
2400	    else
2401	      {
2402		node->firstpos = cat->left->firstpos;
2403	      }
2404
2405	    /* Compute lastpos. */
2406	    if (cat->right->nullable)
2407	      {
2408		/* The right side matches the empty string.  Make a first pass
2409		   with tre_match_empty() to get the number of tags and
2410		   parameters. */
2411		status = tre_match_empty(stack, cat->right,
2412					 NULL, NULL, &num_tags);
2413		if (status != REG_OK)
2414		  return status;
2415		/* Allocate arrays for the tags and parameters. */
2416		tags = xmalloc(sizeof(int) * (num_tags + 1));
2417		if (!tags)
2418		  return REG_ESPACE;
2419		tags[0] = -1;
2420		assertions = 0;
2421		/* Second pass with tre_mach_empty() to get the list of
2422		   tags and parameters. */
2423		status = tre_match_empty(stack, cat->right, tags,
2424					 &assertions, NULL);
2425		if (status != REG_OK)
2426		  {
2427		    xfree(tags);
2428		    return status;
2429		  }
2430		node->lastpos =
2431		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2432				tags, assertions);
2433		xfree(tags);
2434		if (!node->lastpos)
2435		  return REG_ESPACE;
2436	      }
2437	    else
2438	      {
2439		node->lastpos = cat->right->lastpos;
2440	      }
2441	    break;
2442	  }
2443
2444	default:
2445	  assert(0);
2446	  break;
2447	}
2448    }
2449
2450  return REG_OK;
2451}
2452
2453
2454/* Adds a transition from each position in `p1' to each position in `p2'. */
2455static reg_errcode_t
2456tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2457	       tre_tnfa_transition_t *transitions,
2458	       int *counts, int *offs)
2459{
2460  tre_pos_and_tags_t *orig_p2 = p2;
2461  tre_tnfa_transition_t *trans;
2462  int i, j, k, l, dup, prev_p2_pos;
2463
2464  if (transitions != NULL)
2465    while (p1->position >= 0)
2466      {
2467	p2 = orig_p2;
2468	prev_p2_pos = -1;
2469	while (p2->position >= 0)
2470	  {
2471	    /* Optimization: if this position was already handled, skip it. */
2472	    if (p2->position == prev_p2_pos)
2473	      {
2474		p2++;
2475		continue;
2476	      }
2477	    prev_p2_pos = p2->position;
2478	    /* Set `trans' to point to the next unused transition from
2479	       position `p1->position'. */
2480	    trans = transitions + offs[p1->position];
2481	    while (trans->state != NULL)
2482	      {
2483#if 0
2484		/* If we find a previous transition from `p1->position' to
2485		   `p2->position', it is overwritten.  This can happen only
2486		   if there are nested loops in the regexp, like in "((a)*)*".
2487		   In POSIX.2 repetition using the outer loop is always
2488		   preferred over using the inner loop.	 Therefore the
2489		   transition for the inner loop is useless and can be thrown
2490		   away. */
2491		/* XXX - The same position is used for all nodes in a bracket
2492		   expression, so this optimization cannot be used (it will
2493		   break bracket expressions) unless I figure out a way to
2494		   detect it here. */
2495		if (trans->state_id == p2->position)
2496		  {
2497		    break;
2498		  }
2499#endif
2500		trans++;
2501	      }
2502
2503	    if (trans->state == NULL)
2504	      (trans + 1)->state = NULL;
2505	    /* Use the character ranges, assertions, etc. from `p1' for
2506	       the transition from `p1' to `p2'. */
2507	    trans->code_min = p1->code_min;
2508	    trans->code_max = p1->code_max;
2509	    trans->state = transitions + offs[p2->position];
2510	    trans->state_id = p2->position;
2511	    trans->assertions = p1->assertions | p2->assertions
2512	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
2513	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2514	    if (p1->backref >= 0)
2515	      {
2516		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2517		assert(p2->backref < 0);
2518		trans->u.backref = p1->backref;
2519		trans->assertions |= ASSERT_BACKREF;
2520	      }
2521	    else
2522	      trans->u.class = p1->class;
2523	    if (p1->neg_classes != NULL)
2524	      {
2525		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2526		trans->neg_classes =
2527		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2528		if (trans->neg_classes == NULL)
2529		  return REG_ESPACE;
2530		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2531		  trans->neg_classes[i] = p1->neg_classes[i];
2532		trans->neg_classes[i] = (tre_ctype_t)0;
2533	      }
2534	    else
2535	      trans->neg_classes = NULL;
2536
2537	    /* Find out how many tags this transition has. */
2538	    i = 0;
2539	    if (p1->tags != NULL)
2540	      while(p1->tags[i] >= 0)
2541		i++;
2542	    j = 0;
2543	    if (p2->tags != NULL)
2544	      while(p2->tags[j] >= 0)
2545		j++;
2546
2547	    /* If we are overwriting a transition, free the old tag array. */
2548	    if (trans->tags != NULL)
2549	      xfree(trans->tags);
2550	    trans->tags = NULL;
2551
2552	    /* If there were any tags, allocate an array and fill it. */
2553	    if (i + j > 0)
2554	      {
2555		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2556		if (!trans->tags)
2557		  return REG_ESPACE;
2558		i = 0;
2559		if (p1->tags != NULL)
2560		  while(p1->tags[i] >= 0)
2561		    {
2562		      trans->tags[i] = p1->tags[i];
2563		      i++;
2564		    }
2565		l = i;
2566		j = 0;
2567		if (p2->tags != NULL)
2568		  while (p2->tags[j] >= 0)
2569		    {
2570		      /* Don't add duplicates. */
2571		      dup = 0;
2572		      for (k = 0; k < i; k++)
2573			if (trans->tags[k] == p2->tags[j])
2574			  {
2575			    dup = 1;
2576			    break;
2577			  }
2578		      if (!dup)
2579			trans->tags[l++] = p2->tags[j];
2580		      j++;
2581		    }
2582		trans->tags[l] = -1;
2583	      }
2584
2585	    p2++;
2586	  }
2587	p1++;
2588      }
2589  else
2590    /* Compute a maximum limit for the number of transitions leaving
2591       from each state. */
2592    while (p1->position >= 0)
2593      {
2594	p2 = orig_p2;
2595	while (p2->position >= 0)
2596	  {
2597	    counts[p1->position]++;
2598	    p2++;
2599	  }
2600	p1++;
2601      }
2602  return REG_OK;
2603}
2604
2605/* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
2606   labelled with one character range (there are no transitions on empty
2607   strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2608   the regexp. */
2609static reg_errcode_t
2610tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2611		int *counts, int *offs)
2612{
2613  tre_union_t *uni;
2614  tre_catenation_t *cat;
2615  tre_iteration_t *iter;
2616  reg_errcode_t errcode = REG_OK;
2617
2618  /* XXX - recurse using a stack!. */
2619  switch (node->type)
2620    {
2621    case LITERAL:
2622      break;
2623    case UNION:
2624      uni = (tre_union_t *)node->obj;
2625      errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2626      if (errcode != REG_OK)
2627	return errcode;
2628      errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2629      break;
2630
2631    case CATENATION:
2632      cat = (tre_catenation_t *)node->obj;
2633      /* Add a transition from each position in cat->left->lastpos
2634	 to each position in cat->right->firstpos. */
2635      errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2636			       transitions, counts, offs);
2637      if (errcode != REG_OK)
2638	return errcode;
2639      errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2640      if (errcode != REG_OK)
2641	return errcode;
2642      errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2643      break;
2644
2645    case ITERATION:
2646      iter = (tre_iteration_t *)node->obj;
2647      assert(iter->max == -1 || iter->max == 1);
2648
2649      if (iter->max == -1)
2650	{
2651	  assert(iter->min == 0 || iter->min == 1);
2652	  /* Add a transition from each last position in the iterated
2653	     expression to each first position. */
2654	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2655				   transitions, counts, offs);
2656	  if (errcode != REG_OK)
2657	    return errcode;
2658	}
2659      errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2660      break;
2661    }
2662  return errcode;
2663}
2664
2665
2666#define ERROR_EXIT(err)		  \
2667  do				  \
2668    {				  \
2669      errcode = err;		  \
2670      if (/*CONSTCOND*/1)	  \
2671      	goto error_exit;	  \
2672    }				  \
2673 while (/*CONSTCOND*/0)
2674
2675
2676int
2677regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2678{
2679  tre_stack_t *stack;
2680  tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2681  tre_pos_and_tags_t *p;
2682  int *counts = NULL, *offs = NULL;
2683  int i, add = 0;
2684  tre_tnfa_transition_t *transitions, *initial;
2685  tre_tnfa_t *tnfa = NULL;
2686  tre_submatch_data_t *submatch_data;
2687  tre_tag_direction_t *tag_directions = NULL;
2688  reg_errcode_t errcode;
2689  tre_mem_t mem;
2690
2691  /* Parse context. */
2692  tre_parse_ctx_t parse_ctx;
2693
2694  /* Allocate a stack used throughout the compilation process for various
2695     purposes. */
2696  stack = tre_stack_new(512, 1024000, 128);
2697  if (!stack)
2698    return REG_ESPACE;
2699  /* Allocate a fast memory allocator. */
2700  mem = tre_mem_new();
2701  if (!mem)
2702    {
2703      tre_stack_destroy(stack);
2704      return REG_ESPACE;
2705    }
2706
2707  /* Parse the regexp. */
2708  memset(&parse_ctx, 0, sizeof(parse_ctx));
2709  parse_ctx.mem = mem;
2710  parse_ctx.stack = stack;
2711  parse_ctx.start = regex;
2712  parse_ctx.cflags = cflags;
2713  parse_ctx.max_backref = -1;
2714  errcode = tre_parse(&parse_ctx);
2715  if (errcode != REG_OK)
2716    ERROR_EXIT(errcode);
2717  preg->re_nsub = parse_ctx.submatch_id - 1;
2718  tree = parse_ctx.n;
2719
2720#ifdef TRE_DEBUG
2721  tre_ast_print(tree);
2722#endif /* TRE_DEBUG */
2723
2724  /* Referring to nonexistent subexpressions is illegal. */
2725  if (parse_ctx.max_backref > (int)preg->re_nsub)
2726    ERROR_EXIT(REG_ESUBREG);
2727
2728  /* Allocate the TNFA struct. */
2729  tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2730  if (tnfa == NULL)
2731    ERROR_EXIT(REG_ESPACE);
2732  tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2733  tnfa->have_approx = 0;
2734  tnfa->num_submatches = parse_ctx.submatch_id;
2735
2736  /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2737     regexp does not have back references, this can be skipped. */
2738  if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2739    {
2740
2741      /* Figure out how many tags we will need. */
2742      errcode = tre_add_tags(NULL, stack, tree, tnfa);
2743      if (errcode != REG_OK)
2744	ERROR_EXIT(errcode);
2745
2746      if (tnfa->num_tags > 0)
2747	{
2748	  tag_directions = xmalloc(sizeof(*tag_directions)
2749				   * (tnfa->num_tags + 1));
2750	  if (tag_directions == NULL)
2751	    ERROR_EXIT(REG_ESPACE);
2752	  tnfa->tag_directions = tag_directions;
2753	  memset(tag_directions, -1,
2754		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2755	}
2756      tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2757				   sizeof(*tnfa->minimal_tags));
2758      if (tnfa->minimal_tags == NULL)
2759	ERROR_EXIT(REG_ESPACE);
2760
2761      submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2762			      sizeof(*submatch_data));
2763      if (submatch_data == NULL)
2764	ERROR_EXIT(REG_ESPACE);
2765      tnfa->submatch_data = submatch_data;
2766
2767      errcode = tre_add_tags(mem, stack, tree, tnfa);
2768      if (errcode != REG_OK)
2769	ERROR_EXIT(errcode);
2770
2771    }
2772
2773  /* Expand iteration nodes. */
2774  errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2775			   tag_directions);
2776  if (errcode != REG_OK)
2777    ERROR_EXIT(errcode);
2778
2779  /* Add a dummy node for the final state.
2780     XXX - For certain patterns this dummy node can be optimized away,
2781	   for example "a*" or "ab*".	Figure out a simple way to detect
2782	   this possibility. */
2783  tmp_ast_l = tree;
2784  tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2785  if (tmp_ast_r == NULL)
2786    ERROR_EXIT(REG_ESPACE);
2787
2788  tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2789  if (tree == NULL)
2790    ERROR_EXIT(REG_ESPACE);
2791
2792  errcode = tre_compute_nfl(mem, stack, tree);
2793  if (errcode != REG_OK)
2794    ERROR_EXIT(errcode);
2795
2796  counts = xmalloc(sizeof(int) * parse_ctx.position);
2797  if (counts == NULL)
2798    ERROR_EXIT(REG_ESPACE);
2799
2800  offs = xmalloc(sizeof(int) * parse_ctx.position);
2801  if (offs == NULL)
2802    ERROR_EXIT(REG_ESPACE);
2803
2804  for (i = 0; i < parse_ctx.position; i++)
2805    counts[i] = 0;
2806  tre_ast_to_tnfa(tree, NULL, counts, NULL);
2807
2808  add = 0;
2809  for (i = 0; i < parse_ctx.position; i++)
2810    {
2811      offs[i] = add;
2812      add += counts[i] + 1;
2813      counts[i] = 0;
2814    }
2815  transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2816  if (transitions == NULL)
2817    ERROR_EXIT(REG_ESPACE);
2818  tnfa->transitions = transitions;
2819  tnfa->num_transitions = add;
2820
2821  errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2822  if (errcode != REG_OK)
2823    ERROR_EXIT(errcode);
2824
2825  tnfa->firstpos_chars = NULL;
2826
2827  p = tree->firstpos;
2828  i = 0;
2829  while (p->position >= 0)
2830    {
2831      i++;
2832      p++;
2833    }
2834
2835  initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2836  if (initial == NULL)
2837    ERROR_EXIT(REG_ESPACE);
2838  tnfa->initial = initial;
2839
2840  i = 0;
2841  for (p = tree->firstpos; p->position >= 0; p++)
2842    {
2843      initial[i].state = transitions + offs[p->position];
2844      initial[i].state_id = p->position;
2845      initial[i].tags = NULL;
2846      /* Copy the arrays p->tags, and p->params, they are allocated
2847	 from a tre_mem object. */
2848      if (p->tags)
2849	{
2850	  int j;
2851	  for (j = 0; p->tags[j] >= 0; j++);
2852	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2853	  if (!initial[i].tags)
2854	    ERROR_EXIT(REG_ESPACE);
2855	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2856	}
2857      initial[i].assertions = p->assertions;
2858      i++;
2859    }
2860  initial[i].state = NULL;
2861
2862  tnfa->num_transitions = add;
2863  tnfa->final = transitions + offs[tree->lastpos[0].position];
2864  tnfa->num_states = parse_ctx.position;
2865  tnfa->cflags = cflags;
2866
2867  tre_mem_destroy(mem);
2868  tre_stack_destroy(stack);
2869  xfree(counts);
2870  xfree(offs);
2871
2872  preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2873  return REG_OK;
2874
2875 error_exit:
2876  /* Free everything that was allocated and return the error code. */
2877  tre_mem_destroy(mem);
2878  if (stack != NULL)
2879    tre_stack_destroy(stack);
2880  if (counts != NULL)
2881    xfree(counts);
2882  if (offs != NULL)
2883    xfree(offs);
2884  preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2885  regfree(preg);
2886  return errcode;
2887}
2888
2889
2890
2891
2892void
2893regfree(regex_t *preg)
2894{
2895  tre_tnfa_t *tnfa;
2896  unsigned int i;
2897  tre_tnfa_transition_t *trans;
2898
2899  tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2900  if (!tnfa)
2901    return;
2902
2903  for (i = 0; i < tnfa->num_transitions; i++)
2904    if (tnfa->transitions[i].state)
2905      {
2906	if (tnfa->transitions[i].tags)
2907	  xfree(tnfa->transitions[i].tags);
2908	if (tnfa->transitions[i].neg_classes)
2909	  xfree(tnfa->transitions[i].neg_classes);
2910      }
2911  if (tnfa->transitions)
2912    xfree(tnfa->transitions);
2913
2914  if (tnfa->initial)
2915    {
2916      for (trans = tnfa->initial; trans->state; trans++)
2917	{
2918	  if (trans->tags)
2919	    xfree(trans->tags);
2920	}
2921      xfree(tnfa->initial);
2922    }
2923
2924  if (tnfa->submatch_data)
2925    {
2926      for (i = 0; i < tnfa->num_submatches; i++)
2927	if (tnfa->submatch_data[i].parents)
2928	  xfree(tnfa->submatch_data[i].parents);
2929      xfree(tnfa->submatch_data);
2930    }
2931
2932  if (tnfa->tag_directions)
2933    xfree(tnfa->tag_directions);
2934  if (tnfa->firstpos_chars)
2935    xfree(tnfa->firstpos_chars);
2936  if (tnfa->minimal_tags)
2937    xfree(tnfa->minimal_tags);
2938  xfree(tnfa);
2939}
2940