1/*
2  tre-parse.c - Regexp parser
3
4  This software is released under a BSD-style license.
5  See the file LICENSE for details and copyright.
6
7*/
8
9/*
10  This parser is just a simple recursive descent parser for POSIX.2
11  regexps.  The parser supports both the obsolete default syntax and
12  the "extended" syntax, and some nonstandard extensions.
13*/
14
15
16#ifdef HAVE_CONFIG_H
17#include <config.h>
18#endif /* HAVE_CONFIG_H */
19#include <string.h>
20#include <assert.h>
21#include <limits.h>
22#include <stddef.h>
23
24#include "xmalloc.h"
25#include "tre-mem.h"
26#include "tre-ast.h"
27#include "tre-stack.h"
28#include "tre-parse.h"
29
30/* BSD compatibility:
31     Before looking up a collating symbol, check if the name matches in
32     the character names (cnames) array; if so, use the corresponding
33     character.
34
35     Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
36     the implementation choice that for ERE, a non-numeric character following
37     a left brace that would normally be a bound, causes the left brace to be
38     literal. */
39#define BSD_COMPATIBILITY
40#ifdef BSD_COMPATIBILITY
41#include "cname.h"
42#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
43#endif /* BSD_COMPATIBILITY */
44
45/* Characters with special meanings in regexp syntax. */
46#define CHAR_PIPE	   L'|'
47#define CHAR_LPAREN	   L'('
48#define CHAR_RPAREN	   L')'
49#define CHAR_LBRACE	   L'{'
50#define CHAR_RBRACE	   L'}'
51#define CHAR_LBRACKET	   L'['
52#define CHAR_RBRACKET	   L']'
53#define CHAR_MINUS	   L'-'
54#define CHAR_STAR	   L'*'
55#define CHAR_QUESTIONMARK  L'?'
56#define CHAR_PLUS	   L'+'
57#define CHAR_PERIOD	   L'.'
58#define CHAR_COLON	   L':'
59#define CHAR_EQUAL	   L'='
60#define CHAR_COMMA	   L','
61#define CHAR_CARET	   L'^'
62#define CHAR_DOLLAR	   L'$'
63#define CHAR_BACKSLASH	   L'\\'
64#define CHAR_HASH	   L'#'
65#define CHAR_TILDE	   L'~'
66
67
68/* Some macros for expanding \w, \s, etc. */
69static const struct tre_macro_struct {
70  const char c;
71  const char *expansion;
72} tre_macros[] =
73  { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
74    {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
75    {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
76    {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
77    { 0, NULL }
78  };
79
80
81/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
82   must have at least `len' items.  Sets buf[0] to zero if the there
83   is no match in `tre_macros'. */
84static void
85tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
86		 tre_char_t *buf, size_t buf_len)
87{
88  int i;
89
90  buf[0] = 0;
91  if (regex >= regex_end)
92    return;
93
94  for (i = 0; tre_macros[i].expansion; i++)
95    {
96      if (tre_macros[i].c == *regex)
97	{
98	  unsigned int j;
99	  DPRINT(("Expanding macro '%c' => '%s'\n",
100		  tre_macros[i].c, tre_macros[i].expansion));
101	  for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
102	    buf[j] = tre_macros[i].expansion[j];
103	  buf[j] = 0;
104	  break;
105	}
106    }
107}
108
109static reg_errcode_t
110tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
111	 tre_bracket_match_list_t **items)
112{
113  reg_errcode_t status = REG_OK;
114  tre_bracket_match_list_t *array = *items;
115  int i = array->num_bracket_matches;
116  /* Allocate more space if necessary. */
117  if (i >= *max_i)
118    {
119      tre_bracket_match_list_t *new_items;
120      DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
121      /* If the array is already 1024 items large, give up -- there's
122	 probably an error in the regexp (e.g. not a '\0' terminated
123	 string and missing ']') */
124      if (*max_i >= 1024)
125	return REG_ESPACE;
126      *max_i *= 2;
127      new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
128      if (new_items == NULL)
129	return REG_ESPACE;
130      *items = array = new_items;
131    }
132  array->bracket_matches[i].type = type;
133  array->bracket_matches[i].value = val;
134  array->num_bracket_matches++;
135  return status;
136}
137
138#ifndef TRE_USE_SYSTEM_WCTYPE
139
140/* isalnum() and the rest may be macros, so wrap them to functions. */
141int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
142int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
143
144#ifdef tre_isascii
145int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
146#else /* !tre_isascii */
147int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
148#endif /* !tre_isascii */
149
150#ifdef tre_isblank
151int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
152#else /* !tre_isblank */
153int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
154#endif /* !tre_isblank */
155
156int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
157int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
158int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
159int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
160int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
161int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
162int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
163int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
164int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
165
166struct {
167  char *name;
168  int (*func)(tre_cint_t);
169} tre_ctype_map[] = {
170  { "alnum", &tre_isalnum_func },
171  { "alpha", &tre_isalpha_func },
172#ifdef tre_isascii
173  { "ascii", &tre_isascii_func },
174#endif /* tre_isascii */
175#ifdef tre_isblank
176  { "blank", &tre_isblank_func },
177#endif /* tre_isblank */
178  { "cntrl", &tre_iscntrl_func },
179  { "digit", &tre_isdigit_func },
180  { "graph", &tre_isgraph_func },
181  { "lower", &tre_islower_func },
182  { "print", &tre_isprint_func },
183  { "punct", &tre_ispunct_func },
184  { "space", &tre_isspace_func },
185  { "upper", &tre_isupper_func },
186  { "xdigit", &tre_isxdigit_func },
187  { NULL, NULL}
188};
189
190tre_ctype_t tre_ctype(const char *name)
191{
192  int i;
193  for (i = 0; tre_ctype_map[i].name != NULL; i++)
194    {
195      if (strcmp(name, tre_ctype_map[i].name) == 0)
196	return tre_ctype_map[i].func;
197    }
198  return (tre_ctype_t)0;
199}
200#endif /* !TRE_USE_SYSTEM_WCTYPE */
201
202#define REST(re) (int)(ctx->re_end - (re)), (re)
203
204#define START_COLLATING_SYMBOLS		16
205#define MAX_COLLATING_SYMBOL_LEN	4
206
207typedef struct {
208  const tre_char_t *start;
209  int len;
210} tre_collating_symbol;
211
212#include <xlocale.h>
213
214int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
215
216#ifdef BSD_COMPATIBILITY
217static wchar_t
218tre_search_cnames(const wchar_t *name, size_t len)
219{
220  size_t low = 0;
221  size_t high = NCNAMES - 1;
222  size_t cur;
223  int cmp;
224
225  while(low <= high)
226    {
227      cur = (low + high) / 2;
228      cmp = wcsncmp(name, cnames[cur].name, len);
229      if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
230      if (cmp > 0) low = cur + 1;
231      else high = cur - 1;
232    }
233  return (wchar_t)-1;
234}
235#endif /* BSD_COMPATIBILITY */
236
237/* Scan the contents of a bracket expression, and create a
238 * tre_bracket_match_list_t encoding the bracket expression.  If during
239 * the scan, multi-character collating symbols are detected, switch
240 * into a mode to collect those MCCSs into a tre_collating_symbol
241 * list and pass them back.  tre_parse_bracket will use that to
242 * create a new string composed of a union of the bracket expression
243 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
244 * call tre_parse (recursive) to parse that new string (which will
245 * call tre_parse_bracket and tre_parse_bracket_items again. */
246static reg_errcode_t
247tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
248			int *items_size, tre_collating_symbol **result)
249{
250  const tre_char_t *re = ctx->re;
251  const tre_char_t *re_end = ctx->re_end;
252  tre_collating_symbol *col_syms = NULL;
253  tre_collating_symbol *cp = NULL;
254  int n_col_syms = 0;
255  reg_errcode_t status;
256  int max_i = *items_size;
257  int other = 0;  /* contains content other than multi-character collating
258		   * symbols */
259  int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
260  tre_cint_t min, c;
261  int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
262  int collect_MCCS = 0;
263  const tre_char_t *start;
264
265  for ( ;re < re_end; re++)
266    {
267      switch (*re)
268	{
269	case CHAR_MINUS:
270	  /* A first hyphen */
271	  if (re == ctx->re)
272	    {
273	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
274	      min = CHAR_MINUS;
275	      other++;
276	      range = 0;
277	      break;
278	    }
279	  /* The hyphen is the end range */
280	  if (range > 0)
281	    {
282	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
283	      c = CHAR_MINUS;
284	      goto process_end_range;
285	    }
286	  if (re + 1 >= re_end)
287	    {
288	      status = REG_EBRACK;
289	      goto error;
290	    }
291	  /* The hyphen is at the end */
292	  if (re[1] == CHAR_RBRACKET)
293	    {
294	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
295	      c = CHAR_MINUS;
296	      goto process_begin_range;
297	    }
298	  /* Two ranges are not allowed to share an endpoint, or begin
299	   * range is illegal. */
300	  if (range < 0)
301	    {
302	      status = REG_ERANGE;
303	      goto error;
304	    }
305	  range = 1; /* Expect end range */
306	  DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
307	  break;
308
309	case CHAR_LBRACKET:
310	  if (re + 1 >= re_end)
311	    {
312	      status = REG_EBRACK;
313	      goto error;
314	    }
315	  switch (re[1])
316	    {
317	    case CHAR_PERIOD:
318	      {
319		re += 2;
320		start = re;
321		for (;; re++)
322		  {
323		    if (re >= re_end)
324		      {
325			status = REG_ECOLLATE;
326			goto error;
327		      }
328		    if (*re == CHAR_PERIOD)
329		      {
330			if (re + 1 >= re_end)
331			  {
332			    status = REG_ECOLLATE;
333			    goto error;
334			  }
335			/* Found end */
336			if (re[1] == CHAR_RBRACKET)
337			  {
338			    DPRINT(("tre_parse_bracket:   collating "
339				    "symbol: '%.*" STRF "'\n",
340				    REST(start - 2)));
341			    /* Empty name */
342			    if (re == start)
343			      {
344				status = REG_ECOLLATE;
345				goto error;
346			      }
347#ifdef BSD_COMPATIBILITY
348			    /* Check if the name is in cnames; if so, use
349			       the corresponding code */
350			    c = tre_search_cnames(start, re - start);
351			    if (c != (wchar_t)-1)
352			      {
353				re++;
354				goto process_single_character;
355			      }
356#endif /* BSD_COMPATIBILITY */
357			    /* Verify this is a known sequence */
358			    if (__collate_equiv_value(ctx->loc, start,
359							  re - start) <= 0)
360			      {
361				status = REG_ECOLLATE;
362				goto error;
363			      }
364			    /* Process single character collating symbols */
365			    if (re - start == 1)
366			      {
367				c = *start;
368				re++;
369				goto process_single_character;
370			      }
371			    /* Inverted MCCSs are undefined */
372			    if (invert)
373			      {
374				status = REG_ECOLLATE;
375				goto error;
376			      }
377			    /* Can't have MCCSs as an endpoint to a range */
378			    if (range > 0)
379			      {
380				status = REG_ERANGE;
381				goto error;
382			      }
383			    range = -1;
384			    /* Switch into MCCS collection mode (if not
385			     * already there */
386#if TRE_DEBUG
387			    if (!collect_MCCS)
388			      {
389				collect_MCCS = 1;
390				DPRINT(("tre_parse_bracket: Detected MCCS\n"));
391			      }
392#else /* !TRE_DEBUG */
393			    collect_MCCS = 1;
394#endif /* !TRE_DEBUG */
395			    /* Allocate a memory block the first time */
396			    if (!cp)
397			      {
398				if ((col_syms = xmalloc(sizeof(*col_syms) *
399					    (START_COLLATING_SYMBOLS + 2)))
400					    == NULL)
401				  return REG_ESPACE;
402				cp = col_syms + 1;
403				n_col_syms = START_COLLATING_SYMBOLS;
404			      }
405			    /* Enlarge the memory block is more is needed */
406			    if ((cp - col_syms) - 1 >= n_col_syms)
407			      {
408				int i = n_col_syms;
409				tre_collating_symbol *tmp =
410				    xrealloc(col_syms, sizeof(*col_syms) *
411					     ((n_col_syms *= 2) + 2));
412				if (tmp == NULL)
413				  {
414				    xfree(col_syms);
415				    return REG_ESPACE;
416				  }
417				DPRINT(("tre_list_collating_symbols: "
418					"Enlarging col_syms to %d\n",
419					n_col_syms));
420				col_syms = tmp;
421				cp = col_syms + i + 1;
422			      }
423			    cp->start = start;
424			    cp->len = re - start;
425			    cp++;
426			    re++;
427			    break;
428			  }
429		      }
430		  }
431		break;
432	      }
433
434	    case CHAR_EQUAL:
435	    case CHAR_COLON:
436	      {
437		/* Process equivalence and character classes */
438		tre_char_t kind = re[1];
439
440		/* Can't have a class as an endpoint to a range */
441		if (range > 0)
442		  {
443		    status = REG_ERANGE;
444		    goto error;
445		  }
446		if (!collect_MCCS && range == 0)
447		  {
448		    status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
449					  min, &max_i, items);
450		    if (status != REG_OK)
451		      goto error;
452		  }
453		range = -1;
454		re += 2;
455		start = re;
456		for (;; re++)
457		  {
458		    if (re >= re_end)
459		      {
460			status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
461			goto error;
462		      }
463		    if (*re == kind)
464		      {
465			if (re + 1 >= re_end)
466			  {
467			    status = kind == CHAR_EQUAL ? REG_ECOLLATE :
468							  REG_ECTYPE;
469			    goto error;
470			  }
471			/* Found end */
472			if (re[1] == CHAR_RBRACKET)
473			  {
474			    if (re == start)
475			      {
476				/* Empty class name */
477				status = kind == CHAR_EQUAL ? REG_ECOLLATE :
478							      REG_ECTYPE;
479				goto error;
480			      }
481			    /* Process equivalence class */
482			    if (kind == CHAR_EQUAL)
483			      {
484				int equiv;
485
486				DPRINT(("tre_parse_bracket:   equivalence: '%.*"
487					STRF "'\n", REST(start - 2)));
488
489				/* While we find the collation value even for
490				   multi-character collating elements , we
491				   don't (yet) match any collation values
492				   against multi-character sequences.  We'd have
493				   to enumerate those multi-character sequences
494				   and like multi-character collating symbols,
495				   create a union of those sequences with the
496				   rest of the bracket expression.  While
497				   doable, a bracket expression matching
498				   multiple characters, that doesn't explicitly
499				   contain multi-character sequences, might
500				   be unexpected, so we punt for now. */
501				if ((equiv = __collate_equiv_value(ctx->loc,
502					     start, re - start)) <= 0)
503				  {
504				    /* The standard says that if no collating
505				       element if found, we use the collating
506				       symbol itself.  But __collate_equiv_value
507				       doesn't make a distinction between
508				       an element that is in a equvalence
509				       class with others, or is the only member,
510				       so we already know there is no collating
511				       symbol.  (Note that in the case of a
512				       collating element whose collation value
513				       is unique, matching against the
514				       collating element itself, or against
515				       its collation value, is equivalent.) */
516#ifdef BSD_COMPATIBILITY
517				    /* Check if the name is in cnames; if so,
518				       use the corresponding code */
519				    c = tre_search_cnames(start, re - start);
520				    if (c != (wchar_t)-1)
521				      {
522					re++;
523					goto process_single_character;
524				      }
525#endif /* BSD_COMPATIBILITY */
526				    status = REG_ECOLLATE;
527				    goto error;
528				  }
529				if (!collect_MCCS)
530				  {
531				    status = tre_new_item(ctx->mem,
532					     TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
533					     equiv, &max_i, items);
534				    if (status != REG_OK)
535				      goto error;
536				  }
537			      }
538			    else
539			      {
540				/* Process character class */
541				DPRINT(("tre_parse_bracket:  class: '%.*" STRF
542					"'\n", REST(start - 2)));
543				if (!collect_MCCS)
544				  {
545				    char tmp_str[64];
546				    tre_ctype_t class;
547				    int len = MIN(re - start, 63);
548#ifdef TRE_WCHAR
549				    {
550				      tre_char_t tmp_wcs[64];
551				      wcsncpy(tmp_wcs, start, (size_t)len);
552				      tmp_wcs[len] = L'\0';
553#if defined HAVE_WCSRTOMBS
554				      {
555					mbstate_t state;
556					const tre_char_t *src = tmp_wcs;
557					memset(&state, '\0', sizeof(state));
558					len = wcsrtombs_l(tmp_str, &src,
559						      sizeof(tmp_str), &state,
560						      ctx->loc);
561				      }
562#elif defined HAVE_WCSTOMBS
563				      len = wcstombs(tmp_str, tmp_wcs, 63);
564#endif /* defined HAVE_WCSTOMBS */
565				    }
566#else /* !TRE_WCHAR */
567				    strncpy(tmp_str, (const char*)start, len);
568#endif /* !TRE_WCHAR */
569				    tmp_str[len] = '\0';
570				    DPRINT(("  class name: %s\n", tmp_str));
571				    class = tre_ctype_l(tmp_str, ctx->loc);
572				    if (!class)
573				      {
574					status = REG_ECTYPE;
575					goto error;
576				      }
577				    status = tre_new_item(ctx->mem,
578					     TRE_BRACKET_MATCH_TYPE_CLASS,
579					     class, &max_i, items);
580				    if (status != REG_OK)
581				      goto error;
582				  }
583			      }
584			    re++;
585			    break;
586			  }
587		      }
588		  }
589		other++;
590		break;
591	      }
592
593	    default:
594	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
595	      c = CHAR_LBRACKET;
596	      goto process_single_character;
597	      break;
598	    }
599	  break;
600
601	case CHAR_RBRACKET:
602	  /* A first right bracket */
603	  if (re == ctx->re)
604	    {
605	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
606	      min = CHAR_RBRACKET;
607	      range = 0;
608	      other++;
609	      break;
610	    }
611	  /* Done */
612	  if (collect_MCCS)
613	    {
614	      DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n",
615		      REST(re)));
616	      if (col_syms)
617		{
618		  /* Mark the character following the right bracket.  Set len
619		   * to whether there are other things besides the
620		   * multi-character collating symbols */
621		  col_syms->start = re + 1;
622		  col_syms->len = other;
623		  /* Mark the end of the list */
624		  cp->start = NULL;
625		}
626	      *result = col_syms;
627	      return REG_OK;
628	    }
629	  /* range > 0 is not possible, since we did a lookahead after the
630	   * hyphen */
631	  if (range == 0)
632	    {
633	      status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
634				    min, &max_i, items);
635	      if (status != REG_OK)
636		goto error;
637	    }
638	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", REST(re)));
639	  *items_size = max_i;
640	  ctx->re = re + 1;
641	  return REG_OK;
642
643	default:
644	  DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
645	  c = *re;
646process_single_character:
647	  /* Process single character */
648	  if (range > 0)
649	    {
650	      int mine, maxe;
651
652process_end_range:
653	      /* Get collation equivalence values */
654	      mine = __collate_equiv_value(ctx->loc, &min, 1);
655	      maxe = __collate_equiv_value(ctx->loc, &c, 1);
656	      if (maxe < mine)
657		{
658		  status = REG_ERANGE;
659		  goto error;
660		}
661	      if (!collect_MCCS)
662		{
663		  status = tre_new_item(ctx->mem,
664					TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
665					mine, &max_i, items);
666		  if (status != REG_OK)
667		    goto error;
668		  status = tre_new_item(ctx->mem,
669					TRE_BRACKET_MATCH_TYPE_RANGE_END,
670					maxe, &max_i, items);
671		  if (status != REG_OK)
672		    goto error;
673		}
674	      range = -1;
675	    }
676	  else
677	    {
678process_begin_range:
679	      if (!collect_MCCS)
680		{
681		  if (range == 0)
682		    {
683		      status = tre_new_item(ctx->mem,
684					    TRE_BRACKET_MATCH_TYPE_CHAR,
685					    min, &max_i, items);
686		      if (status != REG_OK)
687			goto error;
688		    }
689		  min = c;
690		}
691	      range = 0;
692	    }
693	  other++;
694	  break;
695	}
696    }
697  status = REG_EBRACK;
698error:
699  DPRINT(("tre_parse_bracket:	error: '%.*" STRF "', status=%d\n",
700	  REST(re), status));
701  if (col_syms)
702    xfree(col_syms);
703  return status;
704}
705
706#ifdef TRE_DEBUG
707static const char *bracket_match_type_str[] = {
708  "unused",
709  "char",
710  "range begin",
711  "range end",
712  "class",
713  "equivalence value",
714};
715#endif /* TRE_DEBUG */
716
717static reg_errcode_t
718tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
719{
720  tre_ast_node_t *node;
721  reg_errcode_t status = REG_OK;
722  tre_bracket_match_list_t *items;
723  int max_i = 32;
724  tre_collating_symbol *col_syms = NULL;
725
726  /* Handle special cases [[:<:]] and [[:>:]] */
727  if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
728      && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
729      && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
730      && ctx->re[5] == CHAR_RBRACKET)
731    {
732      *result = tre_ast_new_literal(ctx->mem, ASSERTION,
733		      (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
734		      -1);
735      DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
736	      "[[:<:]]" : "[[:>:]]"));
737      ctx->re += 6;
738      return *result ? REG_OK : REG_ESPACE;
739    }
740
741  /* Start off with an array of `max_i' elements. */
742  items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
743  if (items == NULL)
744    return REG_ESPACE;
745
746  if (*ctx->re == CHAR_CARET)
747    {
748      DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
749      items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
750      ctx->re++;
751    }
752
753  status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
754
755  if (status != REG_OK)
756    goto parse_bracket_done;
757
758  /* If there are collating symbols, split off the multi-character ones
759   * into a union of the bracket expression (without the collating symbols)
760   * and the multiple-character sequences.  We create an equivalent input
761   * string and run tre_parse() recursively */
762  if (col_syms)
763    {
764      tre_char_t *str, *sp;
765      tre_collating_symbol *cp;
766      tre_parse_ctx_t subctx;
767
768      /* Allocate a new string.  We start with the size of the original
769       * bracket expression (minus 1) and add 2 (for a leading "[" and
770       * a trailing nil; don't need a "^", since it is illegal to have
771       * inverted MCCSs).  Since a multi-character collating symbols
772       * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
773       * need to worry about the new string getting too long. */
774      xfree(items);
775      str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
776      if (str == NULL)
777	{
778	  xfree(col_syms);
779	  return REG_ESPACE;
780	}
781      sp = str;
782      if (col_syms->len > 0)
783	{
784	  /* There are other items in the bracket expression besides the
785	   * multi-character collating symbols, so create a new bracket
786	   * expression with only those other itmes. */
787	  const tre_char_t *re;
788	  ptrdiff_t i;
789
790	  *sp++ = '[';
791	  re = ctx->re;
792	  for (cp = col_syms + 1; cp->start; cp++)
793	    {
794	      /* The "- 2" is to account for the "[." */
795	      if ((i = ((cp->start - re) - 2)) > 0)
796		{
797		  memcpy(sp, re, sizeof(*sp) * i);
798		  sp += i;
799		}
800	      /* The "+ 2" is to account for the ".]" */
801	      re = cp->start + cp->len + 2;
802	    }
803	    i = col_syms->start - re; /* Includes the trailing right bracket */
804	    memcpy(sp, re, sizeof(*sp) * i);
805	    sp += i;
806	    *sp++ = '|';
807	}
808      for (cp = col_syms + 1; cp->start; cp++)
809	{
810	  memcpy(sp, cp->start, sizeof(*sp) * cp->len);
811	  sp += cp->len;
812	  if (cp[1].start)
813	    *sp++ = '|';
814	}
815      *sp = 0;
816      DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
817	      str));
818
819      memcpy(&subctx, ctx, sizeof(subctx));
820      subctx.re = str;
821      subctx.len = sp - str;
822      subctx.nofirstsub = 1;
823      subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
824      status = tre_parse(&subctx);
825      xfree(str);
826      if (status != REG_OK)
827	{
828	  xfree(col_syms);
829	  return status;
830	}
831      ctx->re = col_syms->start;
832      ctx->position = subctx.position;
833      xfree(col_syms);
834      *result = subctx.result;
835      DPRINT(("tre_parse_bracket: Returning to original string\n"));
836      return REG_OK;
837    }
838
839  DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
840  node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
841  if (node == NULL)
842    {
843      status = REG_ESPACE;
844      goto parse_bracket_done;
845    }
846  else
847    {
848      tre_literal_t *l = node->obj;
849      l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
850					 SIZEOF_BRACKET_MATCH_LIST(items));
851      if (l->u.bracket_match_list == NULL)
852	{
853	  status = REG_ESPACE;
854	  goto parse_bracket_done;
855	}
856      memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
857    }
858
859#ifdef TRE_DEBUG
860  {
861    int i;
862    tre_bracket_match_t *b;
863    DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
864	    items->num_bracket_matches, items->flags));
865    for (i = 0, b = items->bracket_matches;
866	 i < items->num_bracket_matches; i++, b++)
867      {
868	DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
869		b->value));
870      }
871  }
872#endif /* TRE_DEBUG */
873
874 parse_bracket_done:
875  xfree(items);
876  ctx->position++;
877  *result = node;
878  return status;
879}
880
881
882/* Parses a positive decimal integer.  Returns -1 if the string does not
883   contain a valid number. */
884static int
885tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
886{
887  int num = -1;
888  const tre_char_t *r = *regex;
889  while (r < regex_end && *r >= L'0' && *r <= L'9')
890    {
891      if (num < 0)
892	num = 0;
893      num = num * 10 + *r - L'0';
894      r++;
895    }
896  *regex = r;
897  return num;
898}
899
900
901static reg_errcode_t
902tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
903{
904  int min, max;
905#ifdef TRE_APPROX
906  int i;
907  int cost_ins, cost_del, cost_subst, cost_max;
908  int limit_ins, limit_del, limit_subst, limit_err;
909  const tre_char_t *start;
910#endif /* TRE_APPROX */
911  const tre_char_t *r = ctx->re;
912  int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
913#ifdef TRE_APPROX
914  int approx = 0;
915  int costs_set = 0;
916  int counts_set = 0;
917
918  cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
919  limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
920#endif /* TRE_APPROX */
921
922  /* Parse number (minimum repetition count). */
923  min = -1;
924  if (r >= ctx->re_end)
925#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
926    return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
927#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
928    return REG_EBRACE;
929#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
930  if (*r >= L'0' && *r <= L'9') {
931    DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", REST(r)));
932    min = tre_parse_int(&r, ctx->re_end);
933  }
934#ifndef TRE_APPROX
935  else
936#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
937      /* For ERE, return REG_NOMATCH to signal that the lbrace should
938         be treated as a literal */
939      return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
940#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
941      return REG_BADBR;
942#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
943#endif /* !TRE_APPROX */
944
945  /* Parse comma and second number (maximum repetition count). */
946  max = min;
947  if (r < ctx->re_end && *r == CHAR_COMMA)
948    {
949      r++;
950      DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
951      max = tre_parse_int(&r, ctx->re_end);
952    }
953
954  /* Check that the repeat counts are sane. */
955  if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
956    return REG_BADBR;
957
958
959#ifdef TRE_APPROX
960  /*
961   '{'
962     optionally followed immediately by a number == minimum repcount
963     optionally followed by , then a number == maximum repcount
964      + then a number == maximum insertion count
965      - then a number == maximum deletion count
966      # then a number == maximum substitution count
967      ~ then a number == maximum number of errors
968      Any of +, -, # or ~ without followed by a number means that
969      the maximum count/number of errors is infinite.
970
971      An equation of the form
972	Xi + Yd + Zs < C
973      can be specified to set costs and the cost limit to a value
974      different from the default value:
975	- X is the cost of an insertion
976	- Y is the cost of a deletion
977	- Z is the cost of a substitution
978	- C is the maximum cost
979
980      If no count limit or cost is set for an operation, the operation
981      is not allowed at all.
982  */
983
984
985  do {
986    int done;
987    start = r;
988
989    /* Parse count limit settings */
990    done = 0;
991    if (!counts_set)
992      while (r + 1 < ctx->re_end && !done)
993	{
994	  switch (*r)
995	    {
996	    case CHAR_PLUS:  /* Insert limit */
997	      DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
998	      r++;
999	      limit_ins = tre_parse_int(&r, ctx->re_end);
1000	      if (limit_ins < 0)
1001		limit_ins = INT_MAX;
1002	      counts_set = 1;
1003	      break;
1004	    case CHAR_MINUS: /* Delete limit */
1005	      DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
1006	      r++;
1007	      limit_del = tre_parse_int(&r, ctx->re_end);
1008	      if (limit_del < 0)
1009		limit_del = INT_MAX;
1010	      counts_set = 1;
1011	      break;
1012	    case CHAR_HASH:  /* Substitute limit */
1013	      DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1014	      r++;
1015	      limit_subst = tre_parse_int(&r, ctx->re_end);
1016	      if (limit_subst < 0)
1017		limit_subst = INT_MAX;
1018	      counts_set = 1;
1019	      break;
1020	    case CHAR_TILDE: /* Maximum number of changes */
1021	      DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1022	      r++;
1023	      limit_err = tre_parse_int(&r, ctx->re_end);
1024	      if (limit_err < 0)
1025		limit_err = INT_MAX;
1026	      approx = 1;
1027	      break;
1028	    case CHAR_COMMA:
1029	      r++;
1030	      break;
1031	    case L' ':
1032	      r++;
1033	      break;
1034	    case L'}':
1035	      done = 1;
1036	      break;
1037	    default:
1038	      done = 1;
1039	      break;
1040	    }
1041	}
1042
1043    /* Parse cost restriction equation. */
1044    done = 0;
1045    if (!costs_set)
1046      while (r + 1 < ctx->re_end && !done)
1047	{
1048	  switch (*r)
1049	    {
1050	    case CHAR_PLUS:
1051	    case L' ':
1052	      r++;
1053	      break;
1054	    case L'<':
1055	      DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
1056	      r++;
1057	      while (*r == L' ')
1058		r++;
1059	      cost_max = tre_parse_int(&r, ctx->re_end);
1060	      if (cost_max < 0)
1061		cost_max = INT_MAX;
1062	      else
1063		cost_max--;
1064	      approx = 1;
1065	      break;
1066	    case CHAR_COMMA:
1067	      r++;
1068	      done = 1;
1069	      break;
1070	    default:
1071	      if (*r >= L'0' && *r <= L'9')
1072		{
1073#ifdef TRE_DEBUG
1074		  const tre_char_t *sr = r;
1075#endif /* TRE_DEBUG */
1076		  int cost = tre_parse_int(&r, ctx->re_end);
1077		  /* XXX - make sure r is not past end. */
1078		  switch (*r)
1079		    {
1080		    case L'i':	/* Insert cost */
1081		      DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
1082			      REST(sr)));
1083		      r++;
1084		      cost_ins = cost;
1085		      costs_set = 1;
1086		      break;
1087		    case L'd':	/* Delete cost */
1088		      DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
1089			      REST(sr)));
1090		      r++;
1091		      cost_del = cost;
1092		      costs_set = 1;
1093		      break;
1094		    case L's':	/* Substitute cost */
1095		      DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
1096			      REST(sr)));
1097		      r++;
1098		      cost_subst = cost;
1099		      costs_set = 1;
1100		      break;
1101		    default:
1102		      return REG_BADBR;
1103		    }
1104		}
1105	      else
1106		{
1107		  done = 1;
1108		  break;
1109		}
1110	    }
1111	}
1112  } while (start != r);
1113#endif /* TRE_APPROX */
1114
1115  /*{*//* Missing }. */
1116  if (r >= ctx->re_end)
1117    return REG_EBRACE;
1118
1119  /* Empty contents of {}. */
1120  if (r == ctx->re)
1121    return REG_BADBR;
1122
1123  /* Parse the ending '}' or '\}'.*/
1124  if (ctx->cflags & REG_EXTENDED)
1125    {
1126      if (r >= ctx->re_end || *r != CHAR_RBRACE)
1127	return REG_BADBR;
1128      r++;
1129      /* Parse trailing '?' marking minimal repetition. */
1130      if (r < ctx->re_end)
1131	{
1132	  if (*r == CHAR_QUESTIONMARK)
1133	    {
1134	      /* Process the question mark only in enhanced mode.
1135		 Otherwise, the question mark is an error in ERE
1136		 or a literal in BRE */
1137	      if (ctx->cflags & REG_ENHANCED)
1138		{
1139		  minimal = !(ctx->cflags & REG_UNGREEDY);
1140		  r++;
1141		}
1142	      else return REG_BADRPT;
1143	    }
1144	  else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1145	    {
1146	      /* These are reserved for future extensions. */
1147	      return REG_BADRPT;
1148	    }
1149	}
1150    }
1151  else
1152    {
1153      if (r + 1 >= ctx->re_end
1154	  || *r != CHAR_BACKSLASH
1155	  || *(r + 1) != CHAR_RBRACE)
1156	return REG_BADBR;
1157      r += 2;
1158      if (r < ctx->re_end && *r == CHAR_STAR)
1159	{
1160	  /* This is reserved for future extensions. */
1161	  return REG_BADRPT;
1162	}
1163    }
1164
1165  if (minimal)
1166    ctx->num_reorder_tags++;
1167
1168  if (!result) goto parse_bound_exit;
1169  /* Create the AST node(s). */
1170  /* Originally, if min == 0 && max == 0, we immediately replace the whole
1171     iteration with EMPTY.  This unfortunately drops any submatches, and
1172     messes up setting the pmatch values (we can get tags of -1, and
1173     tag values in the billions).  So we leave it and process this case as
1174     usual, and wait until tre_expand_ast() to replace with EMPTY */
1175#ifdef TRE_APPROX
1176  if (min < 0 && max < 0)
1177    /* Only approximate parameters set, no repetitions. */
1178    min = max = 1;
1179#endif /* TRE_APPROX */
1180
1181  *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1182  if (!*result)
1183    return REG_ESPACE;
1184
1185#ifdef TRE_APPROX
1186  /* If approximate matching parameters are set, add them to the
1187     iteration node. */
1188  if (approx || costs_set || counts_set)
1189    {
1190      int *params;
1191      tre_iteration_t *iter = (*result)->obj;
1192
1193      if (costs_set || counts_set)
1194	{
1195	  if (limit_ins == TRE_PARAM_UNSET)
1196	    {
1197	      if (cost_ins == TRE_PARAM_UNSET)
1198		limit_ins = 0;
1199	      else
1200		limit_ins = INT_MAX;
1201	    }
1202
1203	  if (limit_del == TRE_PARAM_UNSET)
1204	    {
1205	      if (cost_del == TRE_PARAM_UNSET)
1206		limit_del = 0;
1207	      else
1208		limit_del = INT_MAX;
1209	    }
1210
1211	  if (limit_subst == TRE_PARAM_UNSET)
1212	    {
1213	      if (cost_subst == TRE_PARAM_UNSET)
1214		limit_subst = 0;
1215	      else
1216		limit_subst = INT_MAX;
1217	    }
1218	}
1219
1220      if (cost_max == TRE_PARAM_UNSET)
1221	cost_max = INT_MAX;
1222      if (limit_err == TRE_PARAM_UNSET)
1223	limit_err = INT_MAX;
1224
1225      ctx->have_approx = 1;
1226      params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1227      if (!params)
1228	return REG_ESPACE;
1229      for (i = 0; i < TRE_PARAM_LAST; i++)
1230	params[i] = TRE_PARAM_UNSET;
1231      params[TRE_PARAM_COST_INS] = cost_ins;
1232      params[TRE_PARAM_COST_DEL] = cost_del;
1233      params[TRE_PARAM_COST_SUBST] = cost_subst;
1234      params[TRE_PARAM_COST_MAX] = cost_max;
1235      params[TRE_PARAM_MAX_INS] = limit_ins;
1236      params[TRE_PARAM_MAX_DEL] = limit_del;
1237      params[TRE_PARAM_MAX_SUBST] = limit_subst;
1238      params[TRE_PARAM_MAX_ERR] = limit_err;
1239      iter->params = params;
1240    }
1241#endif /* TRE_APPROX */
1242
1243parse_bound_exit:
1244#ifdef TRE_APPROX
1245  DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1246	  "limits [%d,%d,%d, total %d]\n",
1247	  min, max, cost_ins, cost_del, cost_subst, cost_max,
1248	  limit_ins, limit_del, limit_subst, limit_err));
1249#else /* !TRE_APPROX */
1250  DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1251#endif /* !TRE_APPROX */
1252
1253
1254  ctx->re = r;
1255  return REG_OK;
1256}
1257
1258/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1259   non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1260   treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1261   Because we now set up tags for even non-capturing parenthesized
1262   subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
1263   pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1264   have it restore cflags after the subexpression, we don't need to have
1265   a separate PARSE_RESTORE_CFLAGS, and then after processing the
1266   non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1267   This has the side-benefit of now matching the perl behavior: the RE
1268   foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1269   of foo AND (?i) (bar OR zap). */
1270typedef enum {
1271  PARSE_RE = 0,
1272  PARSE_ATOM,
1273  PARSE_MARK_FOR_SUBMATCH,
1274  PARSE_BRANCH,
1275  PARSE_PIECE,
1276  PARSE_CATENATION,
1277  PARSE_POST_CATENATION,
1278  PARSE_UNION,
1279  PARSE_POST_UNION,
1280  PARSE_POSTFIX,
1281} tre_parse_re_stack_symbol_t;
1282
1283
1284reg_errcode_t
1285tre_parse(tre_parse_ctx_t *ctx)
1286{
1287  tre_ast_node_t *result = NULL;
1288  tre_parse_re_stack_symbol_t symbol;
1289  reg_errcode_t status = REG_OK;
1290  tre_stack_t *stack = ctx->stack;
1291  int bottom = tre_stack_num_objects(stack);
1292  int depth = 0;
1293  int temporary_cflags = 0;
1294  int bre_branch_begin;
1295#ifdef TRE_DEBUG
1296  const tre_char_t *tmp_re;
1297#endif
1298
1299  DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1300	  ctx->len, ctx->re, ctx->len, ctx->cflags));
1301
1302  if (ctx->len <= 0) return REG_EMPTY;
1303  if (!ctx->nofirstsub)
1304    {
1305      STACK_PUSH(stack, int, ctx->cflags);
1306      STACK_PUSH(stack, int, ctx->submatch_id);
1307      STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1308      ctx->submatch_id++;
1309    }
1310  STACK_PUSH(stack, int, 0); // bre_branch_begin
1311  STACK_PUSH(stack, int, PARSE_RE);
1312  ctx->re_start = ctx->re;
1313  ctx->re_end = ctx->re + ctx->len;
1314
1315
1316  /* The following is basically just a recursive descent parser.  I use
1317     an explicit stack instead of recursive functions mostly because of
1318     two reasons: compatibility with systems which have an overflowable
1319     call stack, and efficiency (both in lines of code and speed).  */
1320  while (tre_stack_num_objects(stack) > bottom)
1321    {
1322      symbol = tre_stack_pop_int(stack);
1323      switch (symbol)
1324	{
1325	case PARSE_RE:
1326	  /* Parse a full regexp.  A regexp is one or more branches,
1327	     separated by the union operator `|'. */
1328	  bre_branch_begin = tre_stack_pop_int(stack);
1329	  if (
1330#ifdef REG_LITERAL
1331	      !(ctx->cflags & REG_LITERAL) &&
1332#endif /* REG_LITERAL */
1333	      ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1334	    STACK_PUSHX(stack, int, PARSE_UNION);
1335	  STACK_PUSHX(stack, int, bre_branch_begin);
1336	  STACK_PUSHX(stack, int, PARSE_BRANCH);
1337	  break;
1338
1339	case PARSE_BRANCH:
1340	  /* Parse a branch.  A branch is one or more pieces, concatenated.
1341	     A piece is an atom possibly followed by a postfix operator. */
1342	  bre_branch_begin = tre_stack_pop_int(stack);
1343	  STACK_PUSHX(stack, int, PARSE_CATENATION);
1344	  STACK_PUSHX(stack, int, bre_branch_begin);
1345	  STACK_PUSHX(stack, int, PARSE_PIECE);
1346	  break;
1347
1348	case PARSE_PIECE:
1349	  /* Parse a piece.  A piece is an atom possibly followed by one
1350	     or more postfix operators. */
1351	  bre_branch_begin = tre_stack_pop_int(stack);
1352	  STACK_PUSHX(stack, int, PARSE_POSTFIX);
1353	  STACK_PUSHX(stack, int, bre_branch_begin);
1354	  STACK_PUSHX(stack, int, PARSE_ATOM);
1355	  break;
1356
1357	case PARSE_CATENATION:
1358	  /* If the expression has not ended, parse another piece. */
1359	  {
1360	    tre_char_t c;
1361	    if (ctx->re >= ctx->re_end)
1362	      break;
1363	    c = *ctx->re;
1364#ifdef REG_LITERAL
1365	    if (!(ctx->cflags & REG_LITERAL))
1366	      {
1367#endif /* REG_LITERAL */
1368		if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1369		    ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1370		    && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1371		    *(ctx->re + 1) == CHAR_PIPE))
1372		  break;
1373		if ((ctx->cflags & REG_EXTENDED
1374		     && c == CHAR_RPAREN && depth > 0)
1375		    || (!(ctx->cflags & REG_EXTENDED)
1376			&& ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1377			    && *(ctx->re + 1) == CHAR_RPAREN))
1378		  {
1379		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1380		      return REG_EPAREN;
1381		    DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n",
1382			    REST(ctx->re)));
1383		    depth--;
1384		    if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1385		      ctx->re += 2;
1386		    break;
1387		  }
1388#ifdef REG_LITERAL
1389	      }
1390#endif /* REG_LITERAL */
1391
1392#ifdef REG_LEFT_ASSOC
1393	    if (ctx->cflags & REG_LEFT_ASSOC)
1394	      {
1395		/* Left associative concatenation. */
1396		STACK_PUSHX(stack, int, PARSE_CATENATION);
1397		STACK_PUSHX(stack, voidptr, result);
1398		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1399		STACK_PUSHX(stack, int, 0); // bre_branch_begin
1400		STACK_PUSHX(stack, int, PARSE_PIECE);
1401	      }
1402	    else
1403#endif /* REG_LEFT_ASSOC */
1404	      {
1405		/* Default case, right associative concatenation. */
1406		STACK_PUSHX(stack, voidptr, result);
1407		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1408		STACK_PUSHX(stack, int, PARSE_CATENATION);
1409		STACK_PUSHX(stack, int, 0); // bre_branch_begin
1410		STACK_PUSHX(stack, int, PARSE_PIECE);
1411	      }
1412	    break;
1413	  }
1414
1415	case PARSE_POST_CATENATION:
1416	  {
1417	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1418	    tre_ast_node_t *tmp_node;
1419	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1420	    if (!tmp_node)
1421	      return REG_ESPACE;
1422	    result = tmp_node;
1423	    break;
1424	  }
1425
1426	case PARSE_UNION:
1427	  if (ctx->re >= ctx->re_end)
1428	    break;
1429#ifdef REG_LITERAL
1430	  if (ctx->cflags & REG_LITERAL)
1431	    break;
1432#endif /* REG_LITERAL */
1433	  if (!(ctx->cflags & REG_EXTENDED))
1434	    {
1435	      if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1436		break;
1437	      ctx->re++;
1438	    }
1439	  switch (*ctx->re)
1440	    {
1441	    case CHAR_PIPE:
1442	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n",
1443		      REST(ctx->re)));
1444	      STACK_PUSHX(stack, int, PARSE_UNION);
1445	      STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1446	      STACK_PUSHX(stack, voidptr, result);
1447	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
1448	      /* We need to pass a boolean (eventually) to PARSE_ATOM to
1449		 indicate if this is the beginning of a BRE extended branch. */
1450	      STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1451	      STACK_PUSHX(stack, int, PARSE_BRANCH);
1452	      ctx->re++;
1453	      break;
1454
1455	    case CHAR_RPAREN:
1456	      ctx->re++;
1457	      break;
1458
1459	    default:
1460	      if (!(ctx->cflags & REG_EXTENDED))
1461		ctx->re--;
1462	      break;
1463	    }
1464	  break;
1465
1466	case PARSE_POST_UNION:
1467	  {
1468	    tre_ast_node_t *tmp_node;
1469	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1470	    const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1471	    /* error on empty expression at end of union */
1472	    if (pipechar == ctx->re - 1)
1473	      {
1474		return REG_EMPTY;
1475	      }
1476	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1477	    if (!tmp_node)
1478	      return REG_ESPACE;
1479	    result = tmp_node;
1480	    break;
1481	  }
1482
1483	case PARSE_POSTFIX:
1484	  /* Parse postfix operators. */
1485	  if (ctx->re >= ctx->re_end)
1486	    break;
1487#ifdef REG_LITERAL
1488	  if (ctx->cflags & REG_LITERAL)
1489	    break;
1490#endif /* REG_LITERAL */
1491	  int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1492	  int rep_min = 0;
1493	  int rep_max = -1;
1494#ifdef TRE_DEBUG
1495	  int lbrace_off;
1496#endif
1497	  switch (*ctx->re)
1498	    {
1499	    case CHAR_PLUS:
1500	    case CHAR_QUESTIONMARK:
1501	      if (!(ctx->cflags & REG_EXTENDED))
1502		break;
1503		/*FALLTHROUGH*/
1504	    case CHAR_STAR:
1505	      {
1506		tre_ast_node_t *tmp_node;
1507#ifdef TRE_DEBUG
1508		const char *tstr = "star";
1509		tmp_re = ctx->re;
1510#endif
1511
1512	handle_plus_or_question:
1513		/* error on iteration of raw assertion (not in subexpression) */
1514		if (result->type == LITERAL && result->submatch_id < 0 &&
1515		    IS_ASSERTION((tre_literal_t *)result->obj))
1516		  {
1517		    if (!(ctx->cflags & REG_EXTENDED)) break;
1518		    return REG_BADRPT;
1519		  }
1520		if (*ctx->re == CHAR_PLUS)
1521		  {
1522		    rep_min = 1;
1523#ifdef TRE_DEBUG
1524		    tstr = "plus";
1525#endif
1526		  }
1527		if (*ctx->re == CHAR_QUESTIONMARK)
1528		  {
1529		    rep_max = 1;
1530#ifdef TRE_DEBUG
1531		    tstr = "questionmark";
1532#endif
1533		  }
1534
1535		if (ctx->cflags & REG_EXTENDED)
1536		  {
1537		    if (ctx->re + 1 < ctx->re_end)
1538		      {
1539			if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1540			  {
1541			    /* Process the question mark only in enhanced mode.
1542			       Otherwise, the question mark is an error in ERE */
1543			    if (ctx->cflags & REG_ENHANCED)
1544			      {
1545				minimal = !(ctx->cflags & REG_UNGREEDY);
1546				ctx->re++;
1547			      }
1548			    else return REG_BADRPT;
1549			  }
1550			else if (*(ctx->re + 1) == CHAR_STAR
1551				 || *(ctx->re + 1) == CHAR_PLUS)
1552			  {
1553			    /* These are reserved for future extensions. */
1554			    return REG_BADRPT;
1555			  }
1556		      }
1557		  }
1558		else
1559		  {
1560		    if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1561		      {
1562			/* This is reserved for future extensions. */
1563			return REG_BADRPT;
1564		      }
1565		    if (ctx->re + 2 < ctx->re_end)
1566		      {
1567			if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1568			  {
1569			    /* Process the question mark only in enhanced mode.
1570			       Otherwise, the question mark is a literal in BRE */
1571			    if (ctx->cflags & REG_ENHANCED)
1572			      {
1573				minimal = !(ctx->cflags & REG_UNGREEDY);
1574				ctx->re += 2;
1575			      }
1576			  }
1577			else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1578			  {
1579			    /* This is reserved for future extensions. */
1580			    return REG_BADRPT;
1581			  }
1582		      }
1583		  }
1584
1585		if (minimal)
1586		  ctx->num_reorder_tags++;
1587
1588		DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1589			minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
1590		if (result == NULL)
1591		  {
1592		    if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1593		    else goto parse_literal;
1594		  }
1595		ctx->re++;
1596		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1597					    minimal);
1598		if (tmp_node == NULL)
1599		  return REG_ESPACE;
1600		result = tmp_node;
1601
1602		/* Set the iterator with a submatch id in the invisible range
1603		 * (which will be overridden if a real submatch is needed) */
1604		result->submatch_id = ctx->submatch_id_invisible++;
1605
1606#if 0
1607		/* We don't allow multiple postfixes, but this might be needed
1608		   to support approximate matching */
1609		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1610#endif
1611	      }
1612	      break;
1613
1614	    case CHAR_BACKSLASH:
1615	      /* "\{" is special without REG_EXTENDED */
1616	      /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1617	      if (!(ctx->cflags & REG_EXTENDED)
1618		  && ctx->re + 1 < ctx->re_end)
1619		{
1620		  switch (*(ctx->re + 1))
1621		    {
1622		    case CHAR_LBRACE:
1623		      ctx->re++;
1624#ifdef TRE_DEBUG
1625		      lbrace_off = 2;
1626#endif
1627		      goto parse_brace;
1628		    case CHAR_PLUS:
1629		    case CHAR_QUESTIONMARK:
1630		      if (ctx->cflags & REG_ENHANCED)
1631			{
1632#ifdef TRE_DEBUG
1633			  tmp_re = ctx->re;
1634#endif
1635			  ctx->re++;
1636			  goto handle_plus_or_question;
1637			}
1638		      break;
1639		    }
1640		  break;
1641		}
1642	      else
1643		break;
1644
1645	    case CHAR_LBRACE:
1646	      {
1647		int raw_assertion;
1648
1649		/* "{" is literal without REG_EXTENDED */
1650		if (!(ctx->cflags & REG_EXTENDED))
1651		  break;
1652#ifdef TRE_DEBUG
1653		lbrace_off = 1;
1654#endif
1655
1656	    parse_brace:
1657		/* error on iteration of raw assertion (not in subexpression),
1658		   but wait until after parsing bounds */
1659		raw_assertion = (result->type == LITERAL
1660				 && result->submatch_id < 0
1661				 && IS_ASSERTION((tre_literal_t *)result->obj));
1662		ctx->re++;
1663
1664		status = tre_parse_bound(ctx, &result);
1665#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1666		/* For ERE, if status is REG_NOMATCH, this mean the lbrace
1667		   is to be treated as a literal. */
1668		if (status == REG_NOMATCH)
1669		  {
1670		    ctx->re--;
1671		    break;
1672		  }
1673#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1674		DPRINT(("tre_parse:	bound: '%.*" STRF "'\n",
1675			REST(ctx->re - lbrace_off)));
1676		if (status != REG_OK)
1677		  return status;
1678		if (raw_assertion) return REG_BADRPT;
1679
1680		/* Set the iterator with a submatch id in the invisible range
1681		 * (which will be overridden if a real submatch is needed) */
1682		if (result->type == ITERATION)
1683		  result->submatch_id = ctx->submatch_id_invisible++;
1684
1685#if 0
1686		/* We don't allow multiple postfixes, but this might be needed
1687		   to support approximate matching */
1688		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1689#endif
1690		break;
1691	      }
1692	    }
1693	  break;
1694
1695	case PARSE_ATOM:
1696	  {
1697	    /* Parse an atom.  An atom is a regular expression enclosed in `()',
1698	       an empty set of `()', a bracket expression, `.', `^', `$',
1699	       a `\' followed by a character, or a single character. */
1700
1701	    /* The stack contains a boolean value, whether PARSE_ATOM is
1702	       being called just after the start of a group (left paren)
1703	       in a BRE */
1704	    bre_branch_begin = tre_stack_pop_int(stack);
1705
1706	    /* End of regexp? (empty string). */
1707	    if (ctx->re >= ctx->re_end)
1708	      goto parse_literal;
1709
1710#ifdef REG_LITERAL
1711	    if (ctx->cflags & REG_LITERAL)
1712	      goto parse_literal;
1713#endif /* REG_LITERAL */
1714
1715	    switch (*ctx->re)
1716	      {
1717	      case CHAR_LPAREN:  /* parenthesized subexpression */
1718
1719		/* Handle "(?...)" extensions.  They work in a way similar
1720		   to Perls corresponding extensions. */
1721		if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1722		    (REG_EXTENDED|REG_ENHANCED)
1723		    && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1724		  {
1725		    int new_cflags = ctx->cflags;
1726		    int bit = 1;
1727		    int invisible_submatch = 0;
1728		    DPRINT(("tre_parse:	extension: '%.*" STRF "'\n",
1729			    REST(ctx->re)));
1730		    ctx->re += 2;
1731		    while (/*CONSTCOND*/1)
1732		      {
1733			if (*ctx->re == L'i')
1734			  {
1735			    DPRINT(("tre_parse:	    icase: '%.*" STRF "'\n",
1736				    REST(ctx->re)));
1737			    if (bit)
1738			      new_cflags |= REG_ICASE;
1739			    else
1740			      new_cflags &= ~REG_ICASE;
1741			    ctx->re++;
1742			  }
1743			else if (*ctx->re == L'n')
1744			  {
1745			    DPRINT(("tre_parse:	  newline: '%.*" STRF "'\n",
1746				    REST(ctx->re)));
1747			    if (bit)
1748			      new_cflags |= REG_NEWLINE;
1749			    else
1750			      new_cflags &= ~REG_NEWLINE;
1751			    ctx->re++;
1752			  }
1753#ifdef REG_LEFT_ASSOC
1754			else if (*ctx->re == L'l')
1755			  {
1756			    DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1757				    REST(ctx->re)));
1758			    if (bit)
1759			      new_cflags |= REG_LEFT_ASSOC;
1760			    else
1761			      new_cflags &= ~REG_LEFT_ASSOC;
1762			    ctx->re++;
1763			  }
1764#endif /* REG_LEFT_ASSOC */
1765#ifdef REG_UNGREEDY
1766			else if (*ctx->re == L'U')
1767			  {
1768			    DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
1769				    REST(ctx->re)));
1770			    if (bit)
1771			      new_cflags |= REG_UNGREEDY;
1772			    else
1773			      new_cflags &= ~REG_UNGREEDY;
1774			    ctx->re++;
1775			  }
1776#endif /* REG_UNGREEDY */
1777			else if (*ctx->re == CHAR_MINUS)
1778			  {
1779			    DPRINT(("tre_parse:	 turn off: '%.*" STRF "'\n",
1780				    REST(ctx->re)));
1781			    ctx->re++;
1782			    bit = 0;
1783			  }
1784			else if (*ctx->re == CHAR_COLON)
1785			  {
1786			    DPRINT(("tre_parse:	 no group: '%.*" STRF
1787				    "', (invisible submatch %d)\n",
1788				    REST(ctx->re), ctx->submatch_id_invisible));
1789			    ctx->re++;
1790			    depth++;
1791			    invisible_submatch = 1;
1792			    break;
1793			  }
1794			else if (*ctx->re == CHAR_HASH)
1795			  {
1796			    DPRINT(("tre_parse:    comment: '%.*" STRF "'\n",
1797				    REST(ctx->re)));
1798			    /* A comment can contain any character except a
1799			       right parenthesis */
1800			    while (*ctx->re != CHAR_RPAREN
1801				   && ctx->re < ctx->re_end)
1802			      ctx->re++;
1803			    if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1804			      {
1805				ctx->re++;
1806				break;
1807			      }
1808			    else
1809			      return REG_BADPAT;
1810			  }
1811			else if (*ctx->re == CHAR_RPAREN)
1812			  {
1813			    ctx->re++;
1814			    break;
1815			  }
1816			else
1817			  return REG_BADRPT;
1818		      }
1819
1820		    /* Turn on the cflags changes for the rest of the
1821		       enclosing group. */
1822		    if (invisible_submatch)
1823		      {
1824			STACK_PUSHX(stack, int, ctx->cflags);
1825			STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1826			STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1827			ctx->submatch_id_invisible++;
1828			STACK_PUSHX(stack, int, 0); // bre_branch_begin
1829			STACK_PUSHX(stack, int, PARSE_RE);
1830		      }
1831		    else {
1832			STACK_PUSHX(stack, int, 0); // bre_branch_begin
1833			STACK_PUSHX(stack, int, PARSE_ATOM);
1834		    }
1835		    ctx->cflags = new_cflags;
1836		    break;
1837		  }
1838
1839		if (ctx->cflags & REG_EXTENDED)
1840		  {
1841		parse_bre_lparen:
1842		    DPRINT(("tre_parse: group begin: '%.*" STRF
1843			    "', submatch %d\n", REST(ctx->re),
1844			    ctx->submatch_id));
1845		    ctx->re++;
1846		    /* First parse a whole RE, then mark the resulting tree
1847		       for submatching. */
1848		    STACK_PUSHX(stack, int, ctx->cflags);
1849		    STACK_PUSHX(stack, int, ctx->submatch_id);
1850		    STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1851		    /* We need to pass a boolean (eventually) to PARSE_ATOM to
1852		       indicate if this is the beginning of a BRE group. */
1853		    STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1854		    STACK_PUSHX(stack, int, PARSE_RE);
1855		    ctx->submatch_id++;
1856		    depth++;
1857		  }
1858		else
1859		  goto parse_literal;
1860		break;
1861
1862	      case CHAR_RPAREN:  /* end of current subexpression */
1863		if (ctx->cflags & REG_EXTENDED && depth > 0)
1864		  {
1865	      parse_bre_rparen_empty:
1866		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1867		      return REG_EPAREN;
1868		    DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1869			    REST(ctx->re)));
1870		    /* We were expecting an atom, but instead the current
1871		       subexpression was closed.  POSIX leaves the meaning of
1872		       this to be implementation-defined.  We interpret this as
1873		       an empty expression (which matches an empty string).  */
1874		    result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1875		    if (result == NULL)
1876		      return REG_ESPACE;
1877		    if (!(ctx->cflags & REG_EXTENDED))
1878		      ctx->re--;
1879		  }
1880		else
1881		  goto parse_literal;
1882		break;
1883
1884	      case CHAR_LBRACKET: /* bracket expression */
1885		DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1886			REST(ctx->re)));
1887		ctx->re++;
1888		status = tre_parse_bracket(ctx, &result);
1889		if (status != REG_OK)
1890		  return status;
1891		break;
1892
1893	      case CHAR_BACKSLASH:
1894		/* Deal with "\(", "\)" or "\{" for BREs */
1895		if (!(ctx->cflags & REG_EXTENDED)
1896		    && ctx->re + 1 < ctx->re_end)
1897		  {
1898		    if (*(ctx->re + 1) == CHAR_LPAREN)
1899		      {
1900			ctx->re++;
1901			goto parse_bre_lparen;
1902		      }
1903		    else if (*(ctx->re + 1) == CHAR_RPAREN)
1904		      {
1905			ctx->re++;
1906			goto parse_bre_rparen_empty;
1907		      }
1908		    if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1909		  }
1910
1911		if (ctx->re + 1 >= ctx->re_end)
1912		  /* Trailing backslash. */
1913		  return REG_EESCAPE;
1914
1915		if (!(ctx->cflags & REG_ENHANCED))
1916		  {
1917		    DPRINT(("tre_parse:  unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1918		    ctx->re++;
1919		    goto unenhanced_backslash;
1920		  }
1921
1922		/* If a macro is used, parse the expanded macro recursively. */
1923		{
1924		  tre_char_t buf[64];
1925		  tre_expand_macro(ctx->re + 1, ctx->re_end,
1926				   buf, elementsof(buf));
1927		  if (buf[0] != 0)
1928		    {
1929		      tre_parse_ctx_t subctx;
1930		      memcpy(&subctx, ctx, sizeof(subctx));
1931		      subctx.re = buf;
1932		      subctx.len = tre_strlen(buf);
1933		      subctx.nofirstsub = 1;
1934		      status = tre_parse(&subctx);
1935		      if (status != REG_OK)
1936			return status;
1937		      ctx->re += 2;
1938		      ctx->position = subctx.position;
1939		      result = subctx.result;
1940		      break;
1941		    }
1942		}
1943
1944#ifdef REG_LITERAL
1945		if (*(ctx->re + 1) == L'Q')
1946		  {
1947		    DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1948			    REST(ctx->re)));
1949		    ctx->cflags |= REG_LITERAL;
1950		    temporary_cflags |= REG_LITERAL;
1951		    ctx->re += 2;
1952		    STACK_PUSHX(stack, int, 0);
1953		    STACK_PUSHX(stack, int, PARSE_ATOM);
1954		    break;
1955		  }
1956#endif /* REG_LITERAL */
1957
1958		DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1959		ctx->re++;
1960		switch (*ctx->re)
1961		  {
1962		  case L'b':
1963		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1964						 ASSERT_AT_WB, -1);
1965		    ctx->re++;
1966		    break;
1967		  case L'B':
1968		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1969						 ASSERT_AT_WB_NEG, -1);
1970		    ctx->re++;
1971		    break;
1972		  case L'<':
1973		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1974						 ASSERT_AT_BOW, -1);
1975		    ctx->re++;
1976		    break;
1977		  case L'>':
1978		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1979						 ASSERT_AT_EOW, -1);
1980		    ctx->re++;
1981		    break;
1982		  case L'x':
1983		    ctx->re++;
1984		    if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1985		      {
1986			/* 8 bit hex char. */
1987			char tmp[3] = {0, 0, 0};
1988			long val;
1989			DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1990				REST(ctx->re - 2)));
1991
1992			if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1993			    ctx->re < ctx->re_end)
1994			  {
1995			    tmp[0] = (char)ctx->re[0];
1996			    ctx->re++;
1997			  }
1998			if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1999			    ctx->re < ctx->re_end)
2000			  {
2001			    tmp[1] = (char)ctx->re[0];
2002			    ctx->re++;
2003			  }
2004			val = strtol(tmp, NULL, 16);
2005			result = tre_ast_new_literal(ctx->mem, (int)val,
2006						     (int)val, ctx->position);
2007			ctx->position++;
2008			break;
2009		      }
2010		    else if (ctx->re < ctx->re_end)
2011		      {
2012			/* Wide char. */
2013			char tmp[32];
2014			long val;
2015			int i = 0;
2016			ctx->re++;
2017			while (ctx->re_end - ctx->re >= 0)
2018			  {
2019			    if (ctx->re[0] == CHAR_RBRACE)
2020			      break;
2021			    if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2022			      {
2023				tmp[i] = (char)ctx->re[0];
2024				i++;
2025				ctx->re++;
2026				continue;
2027			      }
2028			    return REG_EBRACE;
2029			  }
2030			ctx->re++;
2031			tmp[i] = 0;
2032			val = strtol(tmp, NULL, 16);
2033			result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2034						     ctx->position);
2035			ctx->position++;
2036			break;
2037		      }
2038		    /*FALLTHROUGH*/
2039
2040		  default:
2041		  unenhanced_backslash:
2042		    if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2043			REG_EXTENDED &&
2044			tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2045		      {
2046			/* Back reference (only in BRE or enhanced). */
2047			int val = *ctx->re - L'0';
2048			DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
2049				REST(ctx->re - 1)));
2050			result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2051						     ctx->position);
2052			if (result == NULL)
2053			  return REG_ESPACE;
2054
2055			/* Set the backref with a submatch id in the invisible
2056			 * range (which will be overridden if a real submatch
2057			 * is needed) */
2058			result->submatch_id = ctx->submatch_id_invisible++;
2059
2060			ctx->position++;
2061			ctx->num_reorder_tags++;
2062			ctx->max_backref = MAX(val, ctx->max_backref);
2063			ctx->re++;
2064		      }
2065		    else
2066		      {
2067			/* Escaped character. */
2068			DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
2069				REST(ctx->re - 1)));
2070			result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2071						     ctx->position);
2072			ctx->position++;
2073			ctx->re++;
2074		      }
2075		    break;
2076		  }
2077		if (result == NULL)
2078		  return REG_ESPACE;
2079		break;
2080
2081	      case CHAR_PERIOD:	 /* the any-symbol */
2082		DPRINT(("tre_parse:	  any: '%.*" STRF "'\n",
2083			REST(ctx->re)));
2084		if (ctx->cflags & REG_NEWLINE)
2085		  {
2086		    tre_ast_node_t *tmp1;
2087		    tre_ast_node_t *tmp2;
2088		    tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2089					       ctx->position);
2090		    if (!tmp1)
2091		      return REG_ESPACE;
2092		    tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2093					       ctx->position + 1);
2094		    if (!tmp2)
2095		      return REG_ESPACE;
2096		    result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2097		    if (!result)
2098		      return REG_ESPACE;
2099		    ctx->position += 2;
2100		  }
2101		else
2102		  {
2103		    result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2104						 ctx->position);
2105		    if (!result)
2106		      return REG_ESPACE;
2107		    ctx->position++;
2108		  }
2109		ctx->re++;
2110		break;
2111
2112	      case CHAR_CARET:	 /* beginning of line assertion */
2113		/* '^' has a special meaning everywhere in EREs, at the
2114		   beginning of the RE and after \( is BREs.  It is also
2115		   special in enhanced BREs at the beginning of each branches
2116		   of a union */
2117		if (ctx->cflags & REG_EXTENDED
2118		    || bre_branch_begin
2119		    || ctx->re == ctx->re_start)
2120		  {
2121		    DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n",
2122			    REST(ctx->re)));
2123		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
2124						 ASSERT_AT_BOL, -1);
2125		    if (result == NULL)
2126		      return REG_ESPACE;
2127		    ctx->re++;
2128		  }
2129		else
2130		  goto parse_literal;
2131		break;
2132
2133	      case CHAR_DOLLAR:	 /* end of line assertion. */
2134		/* '$' is special everywhere in EREs, and in the end of the
2135		   string and before \) is BREs. */
2136		if (ctx->cflags & REG_EXTENDED
2137		    || (ctx->re + 2 < ctx->re_end
2138			&& *(ctx->re + 1) == CHAR_BACKSLASH
2139			&& *(ctx->re + 2) == CHAR_RPAREN)
2140		    || ctx->re + 1 == ctx->re_end)
2141		  {
2142		    DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n",
2143			    REST(ctx->re)));
2144		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
2145						 ASSERT_AT_EOL, -1);
2146		    if (result == NULL)
2147		      return REG_ESPACE;
2148		    ctx->re++;
2149		  }
2150		else
2151		  goto parse_literal;
2152		break;
2153
2154	      default:
2155	      parse_literal:
2156
2157		if (temporary_cflags && ctx->re + 1 < ctx->re_end
2158		    && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2159		  {
2160		    DPRINT(("tre_parse:	 end tmps: '%.*" STRF "'\n",
2161			    REST(ctx->re)));
2162		    ctx->cflags &= ~temporary_cflags;
2163		    temporary_cflags = 0;
2164		    ctx->re += 2;
2165		    if (ctx->re < ctx->re_end)
2166		      {
2167			STACK_PUSHX(stack, int, 0);
2168			STACK_PUSHX(stack, int, PARSE_ATOM);
2169		      }
2170		    else
2171		      {
2172			result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2173			if (!result) return REG_ESPACE;
2174		      }
2175		    break;
2176		  }
2177
2178
2179		/* We are expecting an atom.  If the subexpression (or the whole
2180		   regexp ends here, we interpret it as an empty expression
2181		   (which matches an empty string), which is an error.
2182		   Iterations of an empty expression is also an error. */
2183#ifdef REG_LITERAL
2184		if (!(ctx->cflags & REG_LITERAL))
2185		  {
2186#endif /* REG_LITERAL */
2187		    /* error on end of string */
2188		    if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2189						       : REG_EMPTY;
2190		    /* error on unions and iterations of empty expressions */
2191		    if (ctx->cflags & REG_EXTENDED)
2192		      {
2193			if (ctx->re < ctx->re_end)
2194			  {
2195			    if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2196			    if (*ctx->re == CHAR_LBRACE)
2197			      {
2198				ctx->re++;
2199		  empty_parse_bound:
2200				/* We need to parse the bound first and return
2201				   any error, before returning REG_BADRPT */
2202				status = tre_parse_bound(ctx, NULL);
2203#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2204				/* For ERE, if REG_NOMATCH is returned, we
2205				   treat the lbrace as a literal. */
2206				if (status == REG_NOMATCH)
2207				  {
2208				    ctx->re--;
2209				    /* Drop down to literal-handling code */
2210				  }
2211				else
2212				  {
2213#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2214				    if (status != REG_OK)
2215				      return status;
2216				    return REG_BADRPT;
2217#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2218				  }
2219#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2220			      }
2221#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2222			    else
2223#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2224			    if (*ctx->re == CHAR_STAR
2225				|| *ctx->re == CHAR_PLUS
2226				|| *ctx->re == CHAR_QUESTIONMARK)
2227			      {
2228				return REG_BADRPT;
2229			      }
2230			  }
2231		      }
2232		    else if (ctx->re + 1 < ctx->re_end
2233			     && *ctx->re == CHAR_BACKSLASH
2234			     && *(ctx->re + 1) == CHAR_LBRACE)
2235		      {
2236			ctx->re += 2;
2237			goto empty_parse_bound;
2238		      }
2239#ifdef REG_LITERAL
2240		  }
2241#endif /* REG_LITERAL */
2242
2243		DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
2244			REST(ctx->re)));
2245		/* Note that we can't use an tre_isalpha() test here, since there
2246		   may be characters which are alphabetic but neither upper or
2247		   lower case. */
2248		if (ctx->cflags & REG_ICASE
2249		    && (tre_isupper_l(*ctx->re, ctx->loc) ||
2250		    tre_islower_l(*ctx->re, ctx->loc)))
2251		  {
2252		    tre_ast_node_t *tmp1;
2253		    tre_ast_node_t *tmp2;
2254
2255		    /* XXX - Can there be more than one opposite-case
2256		       counterpoints for some character in some locale?  Or
2257		       more than two characters which all should be regarded
2258		       the same character if case is ignored?  If yes, there
2259		       does not seem to be a portable way to detect it.  I guess
2260		       that at least for multi-character collating elements there
2261		       could be several opposite-case counterpoints, but they
2262		       cannot be supported portably anyway. */
2263		    tmp1 = tre_ast_new_literal(ctx->mem,
2264					       tre_toupper_l(*ctx->re, ctx->loc),
2265					       tre_toupper_l(*ctx->re, ctx->loc),
2266					       ctx->position);
2267		    if (!tmp1)
2268		      return REG_ESPACE;
2269		    tmp2 = tre_ast_new_literal(ctx->mem,
2270					       tre_tolower_l(*ctx->re, ctx->loc),
2271					       tre_tolower_l(*ctx->re, ctx->loc),
2272					       ctx->position);
2273		    if (!tmp2)
2274		      return REG_ESPACE;
2275		    result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2276		    if (!result)
2277		      return REG_ESPACE;
2278		  }
2279		else
2280		  {
2281		    result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2282						 ctx->position);
2283		    if (!result)
2284		      return REG_ESPACE;
2285		  }
2286		ctx->position++;
2287		ctx->re++;
2288		break;
2289	      }
2290	    break;
2291	  }
2292
2293	case PARSE_MARK_FOR_SUBMATCH:
2294	  {
2295	    int submatch_id = tre_stack_pop_int(stack);
2296
2297	    ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2298	    if (result->submatch_id >= 0 &&
2299		result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2300	      {
2301		tre_ast_node_t *n, *tmp_node;
2302		if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2303		  break;
2304		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2305		if (n == NULL)
2306		  return REG_ESPACE;
2307		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2308		if (tmp_node == NULL)
2309		  return REG_ESPACE;
2310		tmp_node->num_submatches = result->num_submatches;
2311		result = tmp_node;
2312	      }
2313	    result->submatch_id = submatch_id;
2314	    if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2315	      result->num_submatches++;
2316	    break;
2317	  }
2318
2319	default:
2320	  assert(0);
2321	  break;
2322	}
2323    }
2324
2325  /* Check for missing closing parentheses. */
2326  if (depth > 0)
2327    return REG_EPAREN;
2328
2329  ctx->result = result;
2330
2331  return REG_OK;
2332}
2333
2334/* EOF */
2335